In [1]:
import re
import constants
import os
import requests
import pandas as pd
import multiprocessing
import time
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib
from PIL import Image
import constants

def common_mistake(unit):
    """Return the allowed spelling of ``unit`` when a known misspelling is detected.

    The unit is returned unchanged if it is already in
    ``constants.allowed_units``. Otherwise two substitutions are tried in
    order ('ter' -> 'tre', then 'feet' -> 'foot'); the first one that yields
    an allowed unit wins. If neither helps, the input is returned as-is.
    """
    valid_units = constants.allowed_units
    if unit in valid_units:
        return unit
    for wrong, right in (('ter', 'tre'), ('feet', 'foot')):
        candidate = unit.replace(wrong, right)
        if candidate in valid_units:
            return candidate
    return unit

def parse_string(s):
    """Split an entity value such as ``"10.5 centimetre"`` into number and unit.

    Parameters
    ----------
    s : object
        Raw value. ``None``, anything whose ``str()`` is ``'nan'``, any
        non-string object and blank strings are all treated as missing.

    Returns
    -------
    tuple
        ``(number, unit)`` where ``number`` is a float and ``unit`` is a
        member of ``constants.allowed_units`` (after ``common_mistake``
        normalisation). ``(None, None)`` is returned when the value is
        missing, does not look like "<number> <unit words>", or names a unit
        that is not allowed.
    """
    # Identity check for None, and reject non-strings up front: the previous
    # version called .strip() on whatever it was given and crashed on numbers.
    if s is None or str(s) == 'nan' or not isinstance(s, str):
        return None, None
    s_stripped = s.strip()
    if s_stripped == "":
        return None, None
    # Optional sign, integer or decimal number, whitespace, then letters/spaces.
    if not re.match(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$', s_stripped):
        return None, None
    number_text, unit_text = s_stripped.split(maxsplit=1)
    unit = common_mistake(unit_text)
    if unit not in constants.allowed_units:
        return None, None
    return float(number_text), unit


def create_placeholder_image(image_save_path):
    """Best-effort: write a 100x100 black RGB image to ``image_save_path``.

    Any exception raised while building or saving the image is swallowed, so
    the function returns ``None`` whether or not the file was written.
    """
    try:
        Image.new('RGB', (100, 100), color='black').save(image_save_path)
    except Exception:
        return None

def download_image(image_link, save_folder, retries=3, delay=3):
    """Download ``image_link`` into ``save_folder``, falling back to a placeholder.

    Parameters
    ----------
    image_link : str
        URL of the image. Non-string values (e.g. NaN) are ignored.
    save_folder : str
        Existing directory that receives the file. The file name is the last
        path component of ``image_link``.
    retries : int
        Maximum number of download attempts.
    delay : int or float
        Seconds to wait between two attempts.

    Returns
    -------
    None
        The function works through side effects only. If the target file
        already exists nothing is done. If every attempt fails, a black
        placeholder image is written via ``create_placeholder_image``.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    # Download to a side file and rename on success, so an interrupted
    # transfer never leaves a truncated file that later runs would treat as
    # "already downloaded".
    partial_path = image_save_path + '.part'
    for attempt in range(retries):
        try:
            urllib.request.urlretrieve(image_link, partial_path)
            os.replace(partial_path, image_save_path)
            return
        except Exception:
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit stop a
            # multi-hour download loop.
            try:
                os.remove(partial_path)
            except OSError:
                pass
            if attempt < retries - 1:
                time.sleep(delay)  # no point waiting after the last attempt

    create_placeholder_image(image_save_path) #Create a black placeholder image for invalid links/images

def download_images(image_links, download_folder, allow_multiprocessing=False, num_processes=64):
    """Download every link in ``image_links`` into ``download_folder``.

    Parameters
    ----------
    image_links : sequence
        Image URLs; must support ``len()``. Each one is handed to
        ``download_image`` with ``retries=3, delay=3``.
    download_folder : str
        Target directory, created (including parents) if missing.
    allow_multiprocessing : bool
        When True, downloads are distributed over a ``multiprocessing.Pool``;
        otherwise they run sequentially in this process.
    num_processes : int
        Pool size used when ``allow_multiprocessing`` is True (default 64,
        the value that was previously hard-coded).
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(download_folder, exist_ok=True)

    if allow_multiprocessing:
        download_image_partial = partial(
            download_image, save_folder=download_folder, retries=3, delay=3)

        with multiprocessing.Pool(num_processes) as pool:
            # list(...) drains the lazy imap iterator so tqdm advances per finished link.
            list(tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)))
            pool.close()
            pool.join()
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            download_image(image_link, save_folder=download_folder, retries=3, delay=3)
        

In [2]:
import os
import urllib
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
import numpy as np
import torch
from torchvision import models, transforms

In [3]:
def extract_image_features(image_path, model, transform):
    """Return the flattened output of ``model`` for one image file.

    Parameters
    ----------
    image_path : str
        Path of the image to open.
    model : callable
        Called on a batch of one image tensor under ``torch.no_grad()``.
    transform : callable
        Maps the RGB PIL image to a tensor; a batch axis is added afterwards.

    Returns
    -------
    numpy.ndarray or None
        1-D feature vector, or ``None`` if opening, transforming or running
        the model raised any exception.
    """
    try:
        # The context manager closes the file handle; convert() returns a
        # separate image object, which stays usable after the file is closed.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        image_tensor = transform(image).unsqueeze(0)
        with torch.no_grad():
            features = model(image_tensor).flatten().numpy()
        return features
    except Exception:
        return None

def extract_features_for_dataset(image_paths, model, transform, feature_dim=1280):
    """Extract one feature vector per image path, zero-filling failures.

    Parameters
    ----------
    image_paths : sequence
        Image file paths; must support ``len()``.
    model, transform :
        Forwarded unchanged to ``extract_image_features``.
    feature_dim : int
        Length of the zero vector used when *no* image could be processed
        (default 1280, the value that was previously hard-coded).

    Returns
    -------
    list of numpy.ndarray
        One vector per input path, in order. Images whose extraction returned
        ``None`` get a zero vector with the shape and dtype of the first
        successful extraction, so all entries have a consistent size
        whatever model is used.
    """
    extracted = [
        extract_image_features(image_path, model, transform)
        for image_path in tqdm(image_paths, total=len(image_paths))
    ]
    template = next((vec for vec in extracted if vec is not None), None)
    if template is None:
        fallback = np.zeros(feature_dim)
    else:
        fallback = np.zeros_like(template)
    # copy() so the failed rows do not all alias one shared array.
    return [vec if vec is not None else fallback.copy() for vec in extracted]

def apply_pca(image_features, n_components=50):
    """Fit a PCA on ``image_features`` and return the projected data.

    Returns a ``(reduced_features, pca)`` pair: the output of
    ``fit_transform`` and the fitted ``PCA`` object, so the same projection
    can be applied to other data later.
    """
    reducer = PCA(n_components=n_components)
    return reducer.fit_transform(image_features), reducer


In [4]:
# Load the train and test tables from the local `dataset` folder
# (paths are relative to the working directory).
train_df = pd.read_csv('dataset//train.csv')
test_df = pd.read_csv('dataset//test.csv')

In [5]:
# Load the sample test table; it gets its own prediction pass further down.
sample_test_df = pd.read_csv('dataset//sample_test.csv')

In [6]:
# Empty frame that will receive the parsed training targets.
y = pd.DataFrame()

In [7]:
# Parse every `entity_value` string into a (number, unit) pair with parse_string and
# unzip the pairs into two target columns. Values parse_string rejects become (None, None).
y['numeric_value'],y['unit'] = zip(*train_df['entity_value'].map(parse_string))

In [8]:
# One-hot encode `entity_name` for the training rows; columns are named `entity_<value>`.
# The equivalent encoding of `group_id` is currently disabled:
#group_id_encoded = pd.get_dummies(train_df['group_id'], prefix='group')
entity_name_encoded = pd.get_dummies(train_df['entity_name'], prefix='entity')

In [9]:
# One-hot encode `entity_name` for the test rows and align the result to the training
# dummy columns. Encoding each table on its own yields whatever categories happen to
# occur in it; reindexing guarantees the same columns in the same order as the
# training encoding, with absent categories filled with False.
#group_id_encoded1 = pd.get_dummies(test_df['group_id'], prefix='group')
entity_name_encoded1 = (
    pd.get_dummies(test_df['entity_name'], prefix='entity')
    .reindex(columns=entity_name_encoded.columns, fill_value=False)
)

In [10]:
# One-hot encode `entity_name` for the sample rows, aligned to the training dummy
# columns. The small sample table does not contain every entity name, so encoding it
# on its own drops columns the fitted models expect; reindexing adds them as False.
entity_name_encoded_sample = (
    pd.get_dummies(sample_test_df['entity_name'], prefix='entity')
    .reindex(columns=entity_name_encoded.columns, fill_value=False)
)

In [11]:
# Append the one-hot columns to the training frame.
# NOTE: `train_df` is rebuilt from itself, so re-running this cell appends the dummy columns again.
train_df = pd.concat([train_df,entity_name_encoded],axis=1)

In [12]:
# Append the one-hot columns to the test frame.
# NOTE: `test_df` is rebuilt from itself, so re-running this cell appends the dummy columns again.
test_df = pd.concat([test_df,entity_name_encoded1],axis=1)

In [13]:
# Append the one-hot columns to the sample test frame.
# NOTE: `sample_test_df` is rebuilt from itself, so re-running this cell appends the dummy columns again.
sample_test_df = pd.concat([sample_test_df,entity_name_encoded_sample],axis=1)

In [14]:
# Training feature frame: drop the image URL, the raw categorical columns and the target string.
train_df1 = train_df.drop(['image_link','group_id','entity_name','entity_value'],axis=1)

In [15]:
# Test feature frame: drop the image URL and the raw categorical columns.
# Every other column is kept; `index` is dropped later, right before predicting.
test_df1 = test_df.drop(['image_link','group_id','entity_name'],axis=1)

In [16]:
# Sample-test feature frame. It must be derived from `sample_test_df`; the earlier
# version dropped columns from `test_df`, producing a copy of the full test features.
test_df1_sample = sample_test_df.drop(['image_link','group_id','entity_name'],axis=1)

In [17]:
# Download configuration.
# The image folder can be overridden with the IMAGE_DOWNLOAD_DIR environment variable,
# so the notebook also runs on other machines; the original machine-specific location
# remains the fallback to keep existing runs unchanged.
download_folder_path = os.environ.get('IMAGE_DOWNLOAD_DIR', 'C:\\Users\\yaswa\\ML\\temp_images')
train_image_links = train_df['image_link'].tolist()
test_image_links = test_df['image_link'].tolist()

In [18]:
# Image URLs of the sample test rows, as a plain list.
sample_test_image_links = sample_test_df['image_link'].to_list()

In [19]:
import torch.nn as nn

# Load an ImageNet-pretrained MobileNetV2 and put it in evaluation mode.
# `weights=` replaces the deprecated `pretrained=True` flag; IMAGENET1K_V1 is the
# checkpoint that flag selected, so the extracted features are unchanged.
mobilenet_model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)
mobilenet_model.eval()

class MobileNetWithGAP(nn.Module):
    """Feature extractor: a backbone's ``features`` stack followed by global average pooling.

    ``mobilenet_model`` only needs a ``features`` attribute. ``forward``
    returns one flat vector per input sample, i.e. a tensor of shape
    (batch, channels) with channels set by the last layer of ``features``.
    """

    def __init__(self, mobilenet_model):
        super().__init__()
        self.features = mobilenet_model.features
        # Output size (1, 1) averages over the whole spatial extent.
        self.pool = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        pooled = self.pool(self.features(x))
        # Collapse everything after the batch axis into a single feature axis.
        return torch.flatten(pooled, 1)

# Wrap the loaded backbone in the pooling feature extractor and switch it to evaluation mode.
# eval() returns the module itself, which is why the cell echoes the module structure.
mobilenet_model_with_gap = MobileNetWithGAP(mobilenet_model)
mobilenet_model_with_gap.eval()

# Use this model for feature extraction




MobileNetWithGAP(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96,

In [39]:
# Download the first 1000 training images and every test image into the same folder.
# Runs sequentially (allow_multiprocessing is left at its default of False).
download_images(train_image_links[:1000], download_folder_path)
download_images(test_image_links,download_folder_path)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:56<00:00,  4.22it/s]
100%|████████████████████████████████████████████████████████████████████████| 131187/131187 [5:50:37<00:00,  6.24it/s]


In [51]:
# Download the sample test images into the same folder as the train/test images.
download_images(sample_test_image_links, download_folder_path)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [00:14<00:00,  6.28it/s]


In [41]:
# Preprocessing applied to each image before it is fed to the network:
# resize to 224x224, convert to a tensor, then normalise each of the three channels
# with the given mean/std (presumably the usual ImageNet statistics - confirm against the weights used).
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [47]:
# Local path of every image: download folder + last path component of its URL,
# mirroring how download_image names the files. Only the first 1000 training links
# are used, matching the subset that was downloaded.
train_image_paths = [os.path.join(download_folder_path, Path(link).name) for link in train_image_links[:1000]]
test_image_paths = [os.path.join(download_folder_path, Path(link).name) for link in test_image_links]

In [58]:
# Local paths of the sample test images (same naming rule as for train/test).
sample_test_image_paths = [os.path.join(download_folder_path, Path(link).name) for link in sample_test_image_links]

In [49]:
# Run every train/test image through the pooled MobileNet extractor.
# Each result is a list with one vector per path; unreadable images yield zero vectors.
train_image_features = extract_features_for_dataset(train_image_paths, mobilenet_model_with_gap, transform)
test_image_features = extract_features_for_dataset(test_image_paths, mobilenet_model_with_gap, transform)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:13<00:00,  7.46it/s]
 46%|████████████████████████████████▌                                      | 60106/131187 [1:22:08<2:03:38,  9.58it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

100%|████████████████████████████████████████████████████████████████████████| 131187/131187 [4:22:36<00:00,  8.33it/s]


In [62]:
# Same feature extraction for the sample test images.
sample_test_image_features = extract_features_for_dataset(sample_test_image_paths, mobilenet_model_with_gap, transform)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [00:07<00:00, 11.65it/s]


In [51]:
# Fit a 50-component PCA on the training features only, then project the test
# features with that same fitted model so both live in the same reduced space.
train_image_features_pca, pca_model = apply_pca(train_image_features, n_components=50)
test_image_features_pca = pca_model.transform(test_image_features)

In [66]:
# Project the sample test features with the PCA fitted on the training features.
sample_test_image_features_pca = pca_model.transform(sample_test_image_features)

In [53]:
# Total share of the training-feature variance kept by the retained components.
sum(pca_model.explained_variance_ratio_)

0.6080668293219683

In [55]:
# Wrap the reduced feature arrays in DataFrames (default integer column labels and
# a default 0..n-1 row index) so they can be concatenated with the tabular columns.
train_image_features_df = pd.DataFrame(train_image_features_pca)
test_image_features_df = pd.DataFrame(test_image_features_pca)

In [72]:
# Reduced sample-test features as a DataFrame (default integer column labels).
sample_test_image_features_df = pd.DataFrame(sample_test_image_features_pca)

In [57]:
# Combine the tabular columns (the one-hot `entity_name` columns; `group_id` was dropped)
# with the reduced image features, side by side. Rows are matched on the index,
# so the first 1000 training rows line up with the 1000 extracted feature rows.
X_train = pd.concat([train_df1[:1000],train_image_features_df], axis=1)
X_test = pd.concat([test_df1, test_image_features_df], axis=1)

In [76]:
# Sample-test matrix. Unlike X_test this starts from the full `sample_test_df`,
# so the non-feature columns are still present and are dropped in a later cell.
X_test_sample = pd.concat([sample_test_df, sample_test_image_features_df], axis=1)

In [59]:
# Make all column labels strings: the image-feature columns carry integer labels while
# the tabular columns are named (presumably needed because scikit-learn rejects mixed
# label types - confirm).
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [61]:
# Same label normalisation for the sample matrix. Requires the cell that builds
# X_test_sample to have been run first (the recorded NameError shows it was not).
X_test_sample.columns = X_test_sample.columns.astype(str)

NameError: name 'X_test_sample' is not defined

In [63]:
# Replace every remaining NaN in the feature matrices with 0
# (this applies to all columns, not only the image features).
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [84]:
# Replace every NaN in the sample matrix with 0.
X_test_sample = X_test_sample.fillna(0)

In [65]:
# Target variables, truncated to the same first 1000 rows that make up X_train.
y_numeric = y['numeric_value'][:1000]
y_unit = y['unit'][:1000]

In [67]:
# Missing numeric targets become 0 and missing units become the most frequent unit.
# NOTE(review): rows whose entity_value could not be parsed are thereby trained on
# made-up labels; consider dropping those rows from X and y instead.
y_numeric = y_numeric.fillna(0)
y_unit = y_unit.fillna(y_unit.mode()[0])

In [69]:
from sklearn.model_selection import train_test_split

In [71]:
# Hold out 20% of the labelled rows for validation, once per target.
# A fixed random_state makes the split - and therefore the scores reported below -
# reproducible across kernel restarts (42 matches the seed used for the forests).
X_train_numeric,X_test_numeric,y_train_numeric,y_test_numeric = train_test_split(
    X_train, y_numeric, test_size=0.2, random_state=42)
X_train_unit,X_test_unit,y_train_unit,y_test_unit = train_test_split(
    X_train, y_unit, test_size=0.2, random_state=42)

In [73]:
# Two random forests with 100 trees each and a fixed seed:
# a regressor for the numeric value and a classifier for the unit.
numeric_model =  RandomForestRegressor(n_estimators=100, random_state=42)
unit_model = RandomForestClassifier(n_estimators=100,random_state=42)

In [75]:
# Fit the regressor on the training part of the numeric split.
numeric_model.fit(X_train_numeric, y_train_numeric)

In [77]:
# Validation score of the regressor (R^2 for scikit-learn regressors).
# The recorded value is hugely negative, i.e. far worse than predicting the mean.
numeric_model.score(X_test_numeric,y_test_numeric)

-536880.7562037646

In [79]:
# Fit the classifier on the training part of the unit split.
unit_model.fit(X_train_unit,y_train_unit)

In [81]:
# Validation score of the classifier (mean accuracy for scikit-learn classifiers).
unit_model.score(X_test_unit,y_test_unit)

0.64

In [83]:
# Drop the `index` identifier so the columns match the training matrix;
# X_test itself keeps it for building the submission file.
X_test1 = X_test.drop('index',axis=1)

In [85]:
# Predicted numeric value for every test row.
test_numeric_predictions = numeric_model.predict(X_test1)

In [87]:
# Predicted unit for every test row.
test_unit_predictions = unit_model.predict(X_test1)

In [89]:
# Join value and unit into "<value with 2 decimals> <unit>" strings, one per test row.
test_predictions = [
    f"{num:.2f} {unit}" for num, unit in zip(test_numeric_predictions, test_unit_predictions)]

In [91]:
# Submission frame: the original `index` identifier next to its formatted prediction.
test_output = pd.DataFrame({
    'index': X_test['index'],
    'prediction': test_predictions
})

In [93]:
# Display the submission frame for a visual check.
test_output

Unnamed: 0,index,prediction
0,0,23576310.28 gram
1,1,171.20 gram
2,2,171.94 gram
3,3,171.20 gram
4,4,111.50 gram
...,...,...
131182,131283,175.56 gram
131183,131284,168.24 gram
131184,131285,175.70 gram
131185,131286,301.99 gram


In [95]:
# Sample feature matrix: remove the identifier and the raw, non-numeric columns.
X_test_sample1 = X_test_sample.drop(['index','group_id','image_link','entity_name'],axis=1)

NameError: name 'X_test_sample' is not defined

In [97]:
# Manually set the `entity_item_volume` dummy column to False for all sample rows
# (presumably because no sample row has that entity name, so get_dummies never created it - confirm).
X_test_sample1['entity_item_volume'] = False

NameError: name 'X_test_sample1' is not defined

In [99]:
# Select and order the sample columns exactly like the test feature matrix the models
# are applied to. reindex(..., fill_value=False) gives any column that is absent from
# the sample an explicit False instead of the NaN that DataFrame(..., columns=...) inserts.
X_test_sample1 = X_test_sample1.reindex(columns=X_test1.columns, fill_value=False)

NameError: name 'X_test_sample1' is not defined

In [123]:
# Predicted numeric value for every sample row.
sample_test_numeric_predictions = numeric_model.predict(X_test_sample1)

In [125]:
# Predicted unit for every sample row.
sample_test_unit_predictions = unit_model.predict(X_test_sample1)

In [127]:
# Join value and unit into "<value with 2 decimals> <unit>" strings, one per sample row.
sample_test_predictions = [
    f"{num:.2f} {unit}" for num, unit in zip(sample_test_numeric_predictions, sample_test_unit_predictions)]

In [129]:
# Output frame for the sample set: the `index` identifier next to its formatted prediction.
sample_test_output = pd.DataFrame({
    'index': X_test_sample['index'],
    'prediction': sample_test_predictions
})

In [131]:
# Display the sample predictions for a visual check.
sample_test_output

Unnamed: 0,index,prediction
0,0,86.78 gram
1,1,86.78 gram
2,2,48.64 gram
3,3,76.16 gram
4,4,91.88 gram
...,...,...
83,83,62.03 gram
84,84,72.33 gram
85,85,132.63 gram
86,86,166.41 gram


In [105]:
# Write the test predictions to CSV without the DataFrame's row index;
# the resulting file is the one passed to the sanity-check script below.
test_output.to_csv('test_predictions.csv', index=False)


In [107]:
!python src/sanity.py --test_filename dataset/test.csv --output_filename test_predictions.csv

Parsing successfull for file: test_predictions.csv
