Installing necessary modules

In [None]:
!pip install numpy pandas requests Pillow scikit-learn tqdm opencv-python matplotlib constants

Mounting Google Drive

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project paths used below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


Defining the project, dataset, and source paths

In [12]:
import os

# Project root on the mounted Drive, plus its `dataset` and `src` subfolders.
project_path = '/content/drive/MyDrive/student_resource_3'
data_path = os.path.join(project_path, 'dataset')
src_path = os.path.join(project_path, 'src')

Listing every file in the project with its full path

In [13]:
# Walk the whole project tree and print each file's full path to confirm Drive access
print("Files in project directory:")
for dirpath, _subdirs, filenames in os.walk(project_path):
    for filename in filenames:
        print(os.path.join(dirpath, filename))

Files in project directory:
/content/drive/MyDrive/student_resource_3/sample_code.py
/content/drive/MyDrive/student_resource_3/README.md
/content/drive/MyDrive/student_resource_3/.DS_Store
/content/drive/MyDrive/student_resource_3/dataset/test.csv
/content/drive/MyDrive/student_resource_3/dataset/sample_test_out.csv
/content/drive/MyDrive/student_resource_3/dataset/train.csv
/content/drive/MyDrive/student_resource_3/dataset/sample_test.csv
/content/drive/MyDrive/student_resource_3/dataset/sample_test_out_fail.csv
/content/drive/MyDrive/student_resource_3/src/constants.py
/content/drive/MyDrive/student_resource_3/src/sanity.py
/content/drive/MyDrive/student_resource_3/src/test.ipynb
/content/drive/MyDrive/student_resource_3/src/utils.py
/content/drive/MyDrive/student_resource_3/src/.DS_Store
/content/drive/MyDrive/student_resource_3/src/__pycache__/constants.cpython-310.pyc
/content/drive/MyDrive/student_resource_3/downloaded_images/110EibNyclL.jpg
/content/drive/MyDrive/student_resourc

Loading the test and train files from the dataset folder

In [14]:
import pandas as pd
# Read both CSV splits from the dataset folder
train_csv_path = os.path.join(data_path, 'train.csv')
test_csv_path = os.path.join(data_path, 'test.csv')
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Preview the first rows of the test split (last expression, so the cell displays it)
test_df.head()

Unnamed: 0,index,image_link,group_id,entity_name
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth


In [5]:
# Preview the first training rows; unlike the test split they carry the entity_value target.
train_df.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram


In [None]:
#!pip install pandas tqdm numpy requests pillow



In [None]:
#! pip install constants
import re
import constants
import os
import requests
import pandas as pd
import multiprocessing
import time
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib
from PIL import Image

def create_placeholder_image(image_save_path):
    """Write a 100x100 black RGB image to ``image_save_path``.

    This is best-effort: any exception raised while creating or saving the
    image is swallowed and the function returns ``None`` either way.
    """
    try:
        Image.new('RGB', (100, 100), color='black').save(image_save_path)
    except Exception:
        # Deliberately ignored: a missing placeholder must not abort the caller.
        pass

def download_image(image_link, save_folder, retries=3, delay=3):
    """Download one image into ``save_folder``, falling back to a black placeholder.

    Args:
        image_link: URL of the image. Non-string values are ignored.
        save_folder: Directory the file is written to. The file name is the
            last path component of ``image_link``.
        retries: Number of download attempts before giving up.
        delay: Seconds to sleep after each failed attempt.

    Returns:
        None. On success the image is on disk; if the file already exists
        nothing is downloaded; if every attempt fails a placeholder image is
        written via ``create_placeholder_image``.
    """
    # Imported here so that ``urllib.request`` is guaranteed to be loaded:
    # a plain ``import urllib`` does not import the ``request`` submodule itself.
    import urllib.request

    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    for _ in range(retries):
        try:
            urllib.request.urlretrieve(image_link, image_save_path)
            return
        except Exception:
            # ``Exception`` (not a bare ``except``) so Ctrl-C / SystemExit still propagate.
            # Remove any partially written file: otherwise a later run would see it,
            # treat the image as already downloaded and keep the corrupt copy.
            try:
                if os.path.exists(image_save_path):
                    os.remove(image_save_path)
            except OSError:
                pass
            time.sleep(delay)

    create_placeholder_image(image_save_path)  # Create a black placeholder image for invalid links/images

def download_images(image_links, download_folder, allow_multiprocessing=True, num_workers=64):
    """Download every link in ``image_links`` into ``download_folder``.

    Args:
        image_links: Sized collection of image URLs (``len()`` is used for the
            progress bar total).
        download_folder: Target directory; created if it does not exist.
        allow_multiprocessing: When true, downloads run in a process pool;
            otherwise they run one by one in the current process.
        num_workers: Size of the process pool when multiprocessing is enabled.
            Defaults to 64, the value that was previously hard-coded.

    Returns:
        None. Each link is handled by ``download_image`` with 3 retries and a
        3 second delay.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test
    os.makedirs(download_folder, exist_ok=True)

    if allow_multiprocessing:
        download_image_partial = partial(
            download_image, save_folder=download_folder, retries=3, delay=3)

        with multiprocessing.Pool(num_workers) as pool:
            # list() drains the lazy imap iterator so every task runs and tqdm advances
            list(tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)))
            pool.close()
            pool.join()
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            download_image(image_link, save_folder=download_folder, retries=3, delay=3)



In [None]:
# Collect every image URL from the training set
image_links = train_df['image_link'].tolist()

# Download them into the local Colab folder.
# Only train_df links are passed here; test.csv images are not fetched by this cell.
download_images(image_links, '/content/downloaded_images')

100%|██████████| 10000/10000 [01:26<00:00, 116.11it/s]


In [None]:
# Count how many entries ended up in the download folder
folder_path = '/content/downloaded_images'
image_files = os.listdir(folder_path)

count = len(image_files)
print(count)

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import os

Load and Preprocess Data

In [None]:
import numpy as np
import pandas as pd
import os
from tensorflow.keras.preprocessing import image
from sklearn.preprocessing import LabelEncoder, StandardScaler
from PIL import ImageFile

# Allow PIL to load truncated image files
ImageFile.LOAD_TRUNCATED_IMAGES = True

# entity_unit_map as given: allowed units for each entity name
entity_unit_map = {
    # The three dimension entities accept the same length units (one fresh set each)
    **{
        dimension: {"centimetre", "foot", "millimetre", "metre", "inch", "yard"}
        for dimension in ("width", "depth", "height")
    },
    # Both weight entities accept the same mass units (one fresh set each)
    **{
        weight_entity: {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"}
        for weight_entity in ("item_weight", "maximum_weight_recommendation")
    },
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": {
        "cubic foot", "microlitre", "cup", "fluid ounce", "centilitre",
        "imperial gallon", "pint", "decilitre", "litre", "millilitre",
        "quart", "cubic inch", "gallon",
    },
}

def preprocess_image(img_path, target_size=(224, 224)):
    """Load an image from disk and prepare it as a single-item batch for the model.

    The image is loaded at ``target_size``, converted to an array, given a
    leading batch axis and divided by 255.0.
    """
    pixels = image.img_to_array(image.load_img(img_path, target_size=target_size))
    batch = np.expand_dims(pixels, axis=0)
    return batch / 255.0  # Normalize

def load_data(csv_file, img_folder):
    """Load labelled rows from ``csv_file`` together with their preprocessed images.

    A row is kept only if ALL of the following hold, so that the returned
    arrays always have the same length and stay aligned row by row:
      * ``entity_value`` is a string of the form ``"<number> <unit>"``,
      * the unit is allowed for the row's ``entity_name`` in ``entity_unit_map``,
      * the number parses as a float,
      * the image file can be loaded by ``preprocess_image``.
    Rows failing any check are reported with ``print`` and skipped.

    Args:
        csv_file: Path to a CSV with ``image_link``, ``entity_name`` and
            ``entity_value`` columns.
        img_folder: Folder holding the images, named after the last path
            component of ``image_link``.

    Returns:
        Tuple ``(images, values, units, entity_names)``: the vertically stacked
        image batches and three 1-D arrays with one entry per kept row.

    Raises:
        ValueError: If no row passes validation.
    """
    df = pd.read_csv(csv_file)
    images = []
    values = []
    units = []
    entity_names = []

    for _, row in df.iterrows():
        entity_name = row['entity_name']
        raw_value = row['entity_value']

        # Missing labels are read by pandas as NaN (a float), which has no .split()
        if not isinstance(raw_value, str):
            print(f"Skipping row with missing entity_value for {entity_name}")
            continue

        parts = raw_value.split(maxsplit=1)
        if len(parts) != 2:
            print(f"Skipping malformed entity_value for {entity_name}: {raw_value}")
            continue
        value_text, unit = parts[0], parts[1].strip()

        # Validate unit against entity_unit_map
        if unit not in entity_unit_map.get(entity_name, ()):
            print(f"Skipping invalid unit for {entity_name}: {unit}")
            continue

        try:
            value = float(value_text)
        except ValueError:
            print(f"Skipping non-numeric value for {entity_name}: {value_text}")
            continue

        # Load the image only after the label is known to be valid, and append
        # image and labels together: appending the image before validation left
        # the image array longer than the label arrays whenever a row was skipped.
        img_path = os.path.join(img_folder, os.path.basename(row['image_link']))
        try:
            img_array = preprocess_image(img_path)
        except OSError as exc:
            print(f"Skipping unreadable image {img_path}: {exc}")
            continue

        images.append(img_array)
        values.append(value)
        units.append(unit)
        entity_names.append(entity_name)

    if not images:
        raise ValueError(f"No valid rows found in {csv_file}")

    return np.vstack(images), np.array(values), np.array(units), np.array(entity_names)

# Replace with your actual file paths
img_folder = '/content/downloaded_images'
train_csv = '/content/drive/MyDrive/student_resource_3/dataset/train.csv'

X, y_values, y_units, y_entity_names = load_data(train_csv, img_folder)

# Integer-encode the entity names
entity_encoder = LabelEncoder()
y_entity_names_encoded = entity_encoder.fit_transform(y_entity_names)

# Integer-encode the units; the number of distinct units sizes the unit output
unit_encoder = LabelEncoder()
y_units_encoded = unit_encoder.fit_transform(y_units)
num_classes = len(unit_encoder.classes_)

# Standardise the numeric values; they are reshaped to a single column for the scaler
value_scaler = StandardScaler()
y_values_scaled = value_scaler.fit_transform(y_values.reshape(-1, 1))


Build and Train the Custom Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout

def build_custom_model(input_shape, num_units, num_entities):
    """Build and compile a CNN with three parallel output heads.

    The previous version stacked the three "output" layers one after another
    in a ``Sequential`` model (value -> unit -> entity_name), which yields a
    single output fed through a 1-unit bottleneck, while ``compile`` and the
    training call supply three named losses/targets. Here the heads branch off
    the same shared features, so the names used in ``compile`` match real outputs.

    Args:
        input_shape: Shape of one input image, passed to ``Input(shape=...)``.
        num_units: Number of classes for the ``unit`` softmax head.
        num_entities: Number of classes for the ``entity_name`` softmax head.

    Returns:
        A compiled Keras model whose outputs are, in order, ``value``
        (linear, 1 unit), ``unit`` (softmax) and ``entity_name`` (softmax).
    """
    # Local import: the functional API needs Input/Model, which the notebook's
    # import cells do not bring into scope.
    from tensorflow.keras import Input, Model

    inputs = Input(shape=input_shape)

    # Convolutional feature extractor
    x = Conv2D(32, (3, 3), activation='relu')(inputs)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(128, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)

    # Flatten and add the shared fully connected layer
    x = Flatten()(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)

    # Output heads: each one reads the same shared features
    value_output = Dense(1, name='value')(x)  # Predict value
    unit_output = Dense(num_units, activation='softmax', name='unit')(x)  # Predict unit
    entity_output = Dense(num_entities, activation='softmax', name='entity_name')(x)  # Predict entity name

    model = Model(inputs=inputs, outputs=[value_output, unit_output, entity_output])

    # Loss/metric dictionaries are keyed by the output layer names defined above
    model.compile(optimizer='adam',
                  loss={'value': 'mean_squared_error',
                        'unit': 'sparse_categorical_crossentropy',
                        'entity_name': 'sparse_categorical_crossentropy'},
                  metrics={'value': 'mae', 'unit': 'accuracy', 'entity_name': 'accuracy'})

    return model

# Build the network for 224x224 three-channel inputs
input_shape = (224, 224, 3)
model = build_custom_model(
    input_shape,
    num_units=num_classes,
    num_entities=len(entity_encoder.classes_),
)

# Split data: 80/20 train/validation with a fixed seed
(X_train, X_val,
 y_values_train, y_values_val,
 y_units_train, y_units_val,
 y_entities_train, y_entities_val) = train_test_split(
    X, y_values_scaled, y_units_encoded, y_entity_names_encoded,
    test_size=0.2, random_state=42,
)

# Targets keyed by the model's output names
train_targets = {'value': y_values_train, 'unit': y_units_train, 'entity_name': y_entities_train}
val_targets = {'value': y_values_val, 'unit': y_units_val, 'entity_name': y_entities_val}

# Train the model
history = model.fit(
    X_train,
    train_targets,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, val_targets),
)

Predict from test.csv

In [None]:
def predict_from_test_file(test_csv, img_folder, output_csv):
    """Predict an entity value for every row of ``test_csv`` and save the results.

    Args:
        test_csv: Path to a CSV with an ``image_link`` column and, optionally,
            an ``index`` column identifying each row.
        img_folder: Folder holding the images, named after the last path
            component of ``image_link``.
        output_csv: Path the results are written to, with columns ``index``
            and ``entity_values``.

    Returns:
        None. One output row is written per input row: the prediction, or the
        marker ``"Image not found"`` / ``"Error"`` when the image is missing or
        prediction raises.

    NOTE(review): this relies on a ``predict_image(img_path)`` function in
    notebook scope; none is defined in the visible cells. If it is missing,
    the resulting NameError is caught below and every row becomes "Error".
    """
    test_df = pd.read_csv(test_csv)

    # Use the CSV's own `index` column as the row identifier when it exists; the
    # DataFrame's positional index only matches it if the file is complete and
    # in its original order. Fall back to the position otherwise.
    has_index_column = 'index' in test_df.columns

    # Create lists to store results
    indices = []
    entity_values = []

    for position, row in test_df.iterrows():
        indices.append(row['index'] if has_index_column else position)
        img_path = os.path.join(img_folder, os.path.basename(row['image_link']))

        # Check if the image exists
        if not os.path.exists(img_path):
            print(f"Image not found: {img_path}")
            entity_values.append("Image not found")
            continue

        try:
            prediction = predict_image(img_path)
        except Exception as e:
            print(f"Error predicting for image {img_path}: {e}")
            prediction = "Error"
        entity_values.append(prediction)

    # Create a DataFrame with the results
    results_df = pd.DataFrame({
        'index': indices,
        'entity_values': entity_values
    })

    # Save the results to a new CSV file
    results_df.to_csv(output_csv, index=False)
    print(f"Predictions saved to {output_csv}")

# File paths
# NOTE(review): other cells in this notebook read test.csv from the Drive dataset folder
# (data_path); confirm that a copy really exists at /content/test.csv.
# NOTE(review): the visible download cell only fetches train.csv image links into this
# folder; confirm the test images are downloaded too, otherwise rows are written as
# "Image not found".
test_csv = '/content/test.csv'
img_folder = '/content/downloaded_images'
output_csv = '/content/predictions.csv'

# Make predictions and save them to a CSV file
predict_from_test_file(test_csv, img_folder, output_csv)


In [16]:
import pandas as pd
import random
import os

# Allowed units for each entity name
entity_unit_map = {
    # The three dimension entities accept the same length units (one fresh set each)
    **{
        dimension: {"centimetre", "foot", "millimetre", "metre", "inch", "yard"}
        for dimension in ("width", "depth", "height")
    },
    # Both weight entities accept the same mass units (one fresh set each)
    **{
        weight_entity: {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"}
        for weight_entity in ("item_weight", "maximum_weight_recommendation")
    },
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": {
        "cubic foot", "microlitre", "cup", "fluid ounce", "centilitre",
        "imperial gallon", "pint", "decilitre", "litre", "millilitre",
        "quart", "cubic inch", "gallon",
    },
}

# Rows to generate values for: test.csv read from the dataset folder

test_data = pd.read_csv(os.path.join(data_path, 'test.csv'))

# Function to generate a random value and unit for a specific entity
def generate_random_value_for_entity(entity_name):
    """Return a random ``"<value> <unit>"`` string for ``entity_name``.

    Args:
        entity_name: Key to look up in ``entity_unit_map``.

    Returns:
        A string with a uniform random value in [1, 1000] formatted to two
        decimals, followed by a randomly chosen unit allowed for the entity;
        or ``"Unknown entity"`` when ``entity_name`` is not in the map.
    """
    if entity_name not in entity_unit_map:
        return "Unknown entity"  # Fallback for unknown entities

    # Sort the candidates before choosing: the iteration order of a set of strings
    # can differ between interpreter runs (hash randomisation), so picking from
    # list(set) is not reproducible even when the random module is seeded.
    unit = random.choice(sorted(entity_unit_map[entity_name]))
    value = random.uniform(1, 1000)  # Generate a random value
    return f"{value:.2f} {unit}"  # Return value and unit as a string

# Build one {index, entity_values} record per test row, in file order
output_data = [
    {
        'index': row_index,
        'entity_values': generate_random_value_for_entity(row_entity),
    }
    for row_index, row_entity in zip(test_data['index'], test_data['entity_name'])
]

# Turn the records into a DataFrame and write it out without the positional index
output_df = pd.DataFrame(output_data)
output_df.to_csv('/content/predictions.csv', index=False)
