In [None]:
!pip install pytesseract pillow torch torchvision


In [None]:
import pytesseract
from PIL import Image
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn as nn


In [1]:
pip install tqdm pillow requests



In [2]:
import pandas as pd

# Read both dataset splits from the mounted Drive folder (train first, then test)
train_df, test_df = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/dataset/train.csv',
        '/content/drive/MyDrive/dataset/test.csv',
    ),
)

# Keep the image URLs of each split as plain Python lists
train_image_links, test_image_links = (
    split_df['image_link'].tolist() for split_df in (train_df, test_df)
)


In [3]:
pip install constants

Collecting constants
  Downloading constants-2023.2.0.tar.gz (5.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tox (from constants)
  Downloading tox-4.18.1-py3-none-any.whl.metadata (5.0 kB)
INFO: pip is looking at multiple versions of constants to determine which version is compatible with other requirements. This could take a while.
Collecting constants
  Downloading constants-0.6.0.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: constants
  Building wheel for constants (setup.py) ... [?25l[?25hdone
  Created wheel for constants: filename=constants-0.6.0-py3-none-any.whl size=5457 sha256=d8dd4e299a4f543ccf70970bd20ca57f21d41438f4a6a60292f35ec6b49d59e5
  Stored in directory: /root/.cache/pip/wheels/5b/96/3c/386c2342a8a1bdd317f2f250bd076c13938c6f598c4a40ec14
Successfully built constants
Installing collected packages: constants
Successfully installed constants-0.6.0


In [4]:
import re
import constants
import os
import requests
import pandas as pd
import multiprocessing
import time
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib
from PIL import Image

def common_mistake(unit):
    """Map a commonly misspelled unit onto its allowed spelling when possible.

    The unit is returned unchanged if it is already in
    ``constants.allowed_units``. Otherwise two substitutions are tried in
    order ('ter' -> 'tre', then 'feet' -> 'foot') and the first result found
    in ``constants.allowed_units`` is returned. If nothing matches, the
    original unit is returned as-is.

    NOTE(review): the notebook installs a PyPI package named ``constants``;
    confirm that the imported module really exposes ``allowed_units``.
    """
    allowed = constants.allowed_units
    if unit in allowed:
        return unit
    for wrong, right in (('ter', 'tre'), ('feet', 'foot')):
        candidate = unit.replace(wrong, right)
        if candidate in allowed:
            return candidate
    return unit

def parse_string(s):
    """Split an entity value such as ``"2.5 gram"`` into ``(number, unit)``.

    Args:
        s: The raw value. ``None``, anything whose ``str()`` is ``'nan'`` and
            blank strings are treated as "no value". Non-string input is
            converted with ``str()`` before it is validated.

    Returns:
        ``(None, None)`` for a missing value, otherwise ``(float, str)`` where
        the unit has been passed through ``common_mistake``.

    Raises:
        ValueError: If the text is not of the form ``<number> <unit>`` or the
            unit is not in ``constants.allowed_units``.
    """
    # Identity check: `== None` would invoke a custom/elementwise `__eq__`.
    if s is None or str(s) == 'nan':
        return None, None
    # Non-string input (e.g. a bare number) used to crash with AttributeError
    # on `.strip()`; stringify it so it is rejected with ValueError instead.
    s_stripped = s.strip() if isinstance(s, str) else str(s).strip()
    if s_stripped == "":
        return None, None
    # Optional sign, integer or decimal number, whitespace, then a unit made
    # of letters and spaces only.
    pattern = re.compile(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$')
    if not pattern.match(s_stripped):
        raise ValueError("Invalid format in {}".format(s))
    parts = s_stripped.split(maxsplit=1)
    number = float(parts[0])
    unit = common_mistake(parts[1])
    if unit not in constants.allowed_units:
        raise ValueError("Invalid unit [{}] found in {}. Allowed units: {}".format(
            unit, s, constants.allowed_units))
    return number, unit


def create_placeholder_image(image_save_path):
    """Write a 100x100 black RGB image to ``image_save_path``.

    This is best-effort: any failure while creating or saving the image is
    reported as a ``RuntimeWarning`` instead of being raised, so a bulk
    download keeps going. Nothing is returned.

    Args:
        image_save_path: Destination path of the placeholder image.
    """
    import warnings

    try:
        placeholder_image = Image.new('RGB', (100, 100), color='black')
        placeholder_image.save(image_save_path)
    except Exception as e:
        # Previously swallowed silently, which hid missing files from the
        # caller; surface the reason without aborting.
        warnings.warn(
            "Could not create placeholder image at {}: {}".format(image_save_path, e),
            RuntimeWarning,
        )

def download_image(image_link, save_folder, retries=3, delay=3):
    """Download one image into ``save_folder``, writing a placeholder on failure.

    The file name is the last path component of ``image_link``. Nothing is
    done when ``image_link`` is not a string or when the target file already
    exists. Otherwise the download is attempted up to ``retries`` times,
    sleeping ``delay`` seconds between attempts. If every attempt fails,
    ``create_placeholder_image`` is called for the target path.

    Args:
        image_link: URL of the image (non-string values are ignored).
        save_folder: Directory the image is written to.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between failed attempts.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded; import it explicitly where it is used.
    import urllib.request

    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    for attempt in range(retries):
        try:
            urllib.request.urlretrieve(image_link, image_save_path)
            return
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still
            # stop the download; no point sleeping after the last attempt.
            if attempt < retries - 1:
                time.sleep(delay)

    create_placeholder_image(image_save_path) #Create a black placeholder image for invalid links/images

def download_images(image_links, download_folder, allow_multiprocessing=True):
    """Download every link in ``image_links`` into ``download_folder``.

    The folder is created if needed. Each link is handled by
    ``download_image`` with ``retries=3`` and ``delay=3``, and progress is
    shown with ``tqdm``.

    Args:
        image_links: Sized iterable of image URLs (``len()`` is used for the
            progress bar total).
        download_folder: Directory the images are saved to.
        allow_multiprocessing: When true, fan the downloads out over a pool
            of 64 worker processes; otherwise download sequentially.
    """
    # exist_ok avoids the check-then-create race when several runs start at once.
    os.makedirs(download_folder, exist_ok=True)

    if allow_multiprocessing:
        download_image_partial = partial(
            download_image, save_folder=download_folder, retries=3, delay=3)

        with multiprocessing.Pool(64) as pool:
            # list(...) drains the lazy imap iterator so every task actually runs.
            list(tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)))
            pool.close()
            pool.join()
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            download_image(image_link, save_folder=download_folder, retries=3, delay=3)


In [None]:
# Drive destinations for the downloaded images, one folder per split
train_image_folder, test_image_folder = (
    '/content/drive/MyDrive/images/train',
    '/content/drive/MyDrive/images/test',
)

In [11]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

def label_encode_with_fallback(encoder, values):
    """Encode ``values`` with a fitted encoder, mapping unseen labels to -1.

    Args:
        encoder: A fitted label encoder exposing ``classes_``; each label's
            code is taken to be its position in ``classes_``.
        values: Iterable of labels to encode.

    Returns:
        A NumPy array with one code per input value, ``-1`` for labels that
        are not in ``encoder.classes_``.
    """
    # Build the label -> code table once instead of calling
    # `encoder.transform` (and scanning `classes_`) for every single value.
    # NOTE(review): lookups use dict hashing/equality rather than array `==`;
    # for ordinary string/integer labels the result is the same.
    code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
    transformed = np.array([code_by_label.get(val, -1) for val in values])
    return transformed

# group_id: fit on the training split, then encode the test split (-1 for unseen labels)
group_id_encoder = LabelEncoder()
train_df['group_id_encoded'] = group_id_encoder.fit_transform(train_df['group_id'])
test_df['group_id_encoded'] = label_encode_with_fallback(group_id_encoder, test_df['group_id'])

# entity_name: same procedure with its own encoder
entity_name_encoder = LabelEncoder()
train_df['entity_name_encoded'] = entity_name_encoder.fit_transform(train_df['entity_name'])
test_df['entity_name_encoded'] = label_encode_with_fallback(entity_name_encoder, test_df['entity_name'])

# Preview the raw columns next to their encoded counterparts
print(
    test_df[
        ['group_id', 'group_id_encoded', 'entity_name', 'entity_name_encoded']
    ].head()
)


   group_id  group_id_encoded entity_name  entity_name_encoded
0    156839                45      height                    1
1    792578               568       width                    7
2    792578               568      height                    1
3    792578               568       depth                    0
4    792578               568       depth                    0


In [12]:
# One-hot encode the two categorical columns of each split (train first, then test)
train_df_encoded, test_df_encoded = (
    pd.get_dummies(split_df, columns=['group_id', 'entity_name'])
    for split_df in (train_df, test_df)
)

# Align on the training columns: columns missing from test are added and filled with 0
train_df_encoded, test_df_encoded = train_df_encoded.align(
    test_df_encoded,
    join='left',
    axis=1,
    fill_value=0,
)

# Show the first rows of the encoded training frame
print(train_df_encoded.head())

                                          image_link    entity_value  \
0  https://m.media-amazon.com/images/I/61I9XdN6OF...      500.0 gram   
1  https://m.media-amazon.com/images/I/71gSRbyXmo...         1.0 cup   
2  https://m.media-amazon.com/images/I/61BZ4zrjZX...      0.709 gram   
3  https://m.media-amazon.com/images/I/612mrlqiI4...      0.709 gram   
4  https://m.media-amazon.com/images/I/617Tl40LOX...  1400 milligram   

   group_id_encoded  entity_name_encoded  group_id_101697  group_id_104874  \
0               535                    3            False            False   
1               661                    2            False            False   
2               284                    3            False            False   
3               284                    3            False            False   
4               520                    3            False            False   

   group_id_106003  group_id_107694  group_id_107915  group_id_108478  ...  \
0            False  

In [13]:
# Install tesseract-ocr and pytesseract
!sudo apt install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (4,291 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [15]:
!pip install torch torchvision



In [20]:
import csv

def generate_output(index, prediction, output_file="output.csv"):
    """Write predictions to ``output_file`` as CSV with an ``index,prediction`` header.

    The file is overwritten on every call. Passing two scalars writes a
    single data row, as before. Passing two non-string iterables of equal
    length writes one row per ``(index, prediction)`` pair, so a whole
    submission can be produced with one call.

    Args:
        index: A single index or an iterable of indices.
        prediction: A single prediction or an iterable of predictions.
        output_file: Path of the CSV file to (over)write.

    Raises:
        ValueError: If both arguments are iterables of different lengths.
    """
    def _is_collection(value):
        # Strings and bytes are iterable but represent a single value here.
        if isinstance(value, (str, bytes)):
            return False
        try:
            iter(value)
        except TypeError:
            return False
        return True

    if _is_collection(index) and _is_collection(prediction):
        indices, predictions = list(index), list(prediction)
        if len(indices) != len(predictions):
            raise ValueError(
                "index and prediction must have the same length ({} != {})".format(
                    len(indices), len(predictions)))
        rows = list(zip(indices, predictions))
    else:
        rows = [(index, prediction)]

    # Rows are validated before the file is opened, so a bad call does not
    # truncate an existing output file.
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["index", "prediction"])
        writer.writerows(rows)

# Example usage
generate_output(1, "2 gram")


In [22]:
sample_test_out_df = pd.read_csv('/content/drive/MyDrive/dataset/sample_test_out.csv')

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

def evaluate_model(true_labels, predicted_labels):
    """Compute weighted precision, recall and F1 for the given labels.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Predicted labels, in the same order as ``true_labels``.

    Returns:
        A ``(precision, recall, f1)`` tuple, each weighted-averaged over the labels.
    """
    # zero_division=0 matches the later definition of this function in the
    # notebook and avoids a warning for every label that is never predicted.
    precision = precision_score(true_labels, predicted_labels, average='weighted', zero_division=0)
    recall = recall_score(true_labels, predicted_labels, average='weighted', zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, average='weighted', zero_division=0)
    return precision, recall, f1

# Baseline demo: score a constant "2 gram" guess against the sample ground truth
true_labels = sample_test_out_df['entity_value']
predicted_labels = ["2 gram" for _ in range(len(true_labels))]
precision, recall, f1 = evaluate_model(true_labels, predicted_labels)
print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")


In [30]:
# Example prediction format
def format_prediction(value, unit):
    """Render a prediction as ``"<value> <unit>"``, e.g. ``"2.0 gram"``."""
    return "{} {}".format(value, unit)

predictions = [
    format_prediction(value, unit)
    for value, unit in ((2.0, 'gram'), (12.5, 'centimetre'))
]

In [26]:
pip install tensorflow



In [32]:
import numpy as np

def simulate_feature_extraction(image_link):
    """Stand-in for real feature extraction.

    ``image_link`` is ignored; the result is one draw from NumPy's global
    random generator scaled by 100.
    """
    random_fraction = np.random.rand()
    return random_fraction * 100  # Simulate some numeric value

def predict_entity_value(feature):
    """Format a numeric feature as a simulated prediction with two decimals, e.g. ``"3.14 unit"``."""
    return "{:.2f} unit".format(feature)  # Simulate a formatted prediction

# Simulate a feature per image link, then turn each feature into a formatted prediction
test_df['feature'] = test_df['image_link'].map(simulate_feature_extraction)
test_df['prediction'] = test_df['feature'].map(predict_entity_value)


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

def evaluate_model(true_labels, predicted_labels):
    """Return weighted ``(precision, recall, f1)`` for the given labels.

    Placeholder evaluation: in a real scenario, pass the actual true labels
    and the model's predictions. Undefined scores are reported as 0
    (``zero_division=0``).
    """
    precision, recall, f1 = (
        metric(true_labels, predicted_labels, average='weighted', zero_division=0)
        for metric in (precision_score, recall_score, f1_score)
    )
    return precision, recall, f1

# Example usage (for simulation purposes)
predicted_labels = test_df['prediction']
true_labels = ['2 gram' for _ in range(len(test_df))]  # Placeholder for actual true labels
precision, recall, f1 = evaluate_model(true_labels, predicted_labels)
print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")


In [34]:
def load_resnet_model():
    """Load a pre-trained ResNet-50 with its final classification layer removed.

    Returns:
        An ``nn.Sequential`` made of every child module of the ResNet-50
        except the last one, switched to evaluation mode.
    """
    # `pretrained=True` is deprecated in recent torchvision in favour of the
    # `weights=` argument; IMAGENET1K_V1 is the checkpoint `pretrained=True`
    # refers to, so the loaded weights stay the same.
    weights_enum = getattr(models, "ResNet50_Weights", None)
    if weights_enum is not None:
        resnet = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
    else:
        # Older torchvision without the weights API.
        resnet = models.resnet50(pretrained=True)
    resnet = nn.Sequential(*list(resnet.children())[:-1])  # Remove the classification layer
    resnet.eval()  # Set the model to evaluation mode
    return resnet

def preprocess_image(image_path):
    """Preprocess an image file for CNN feature extraction.

    The image is converted to RGB, resized to 256, center-cropped to 224,
    turned into a tensor and normalized with the per-channel mean/std below.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The preprocessed image tensor with a leading batch dimension added.
    """
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # The context manager closes the file handle. convert('RGB') guarantees
    # three channels, since grayscale / RGBA / palette images would not match
    # the 3-channel Normalize above.
    with Image.open(image_path) as img:
        img_tensor = preprocess(img.convert('RGB')).unsqueeze(0)  # Preprocess and add batch dimension
    return img_tensor

def extract_image_features(image_path, model):
    """Extract features from an image using the given model.

    Args:
        image_path: Path of the image file, loaded via ``preprocess_image``.
        model: Callable applied to the preprocessed batch. If it exposes
            ``parameters()``, the input is moved to the device of its first
            parameter.

    Returns:
        The model output with singleton dimensions squeezed out, as a NumPy
        array.
    """
    img_tensor = preprocess_image(image_path)
    # Run the input on the same device as the model so a GPU-resident model works.
    parameters = getattr(model, "parameters", None)
    first_param = next(parameters(), None) if callable(parameters) else None
    if first_param is not None:
        img_tensor = img_tensor.to(first_param.device)
    with torch.no_grad():
        features = model(img_tensor)
    # .cpu() is a no-op on CPU and is required before .numpy() for GPU tensors.
    return features.squeeze().cpu().numpy()  # Convert to numpy array
