In [23]:
import numpy as np
from PIL import Image
import pandas as pd
import requests
from io import BytesIO
import torch
from torchvision import models, transforms
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pickle
import ast
import math
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data_path = 'A2_Data.csv'
data = pd.read_csv(data_path)
data.head(10)

Unnamed: 0.1,Unnamed: 0,Image,Review Text
0,3452,['https://images-na.ssl-images-amazon.com/imag...,Loving these vintage springs on my vintage str...
1,1205,['https://images-na.ssl-images-amazon.com/imag...,Works great as a guitar bench mat. Not rugged ...
2,1708,['https://images-na.ssl-images-amazon.com/imag...,We use these for everything from our acoustic ...
3,2078,['https://images-na.ssl-images-amazon.com/imag...,Great price and good quality. It didn't quite...
4,801,['https://images-na.ssl-images-amazon.com/imag...,I bought this bass to split time as my primary...
5,126,['https://images-na.ssl-images-amazon.com/imag...,"it's more on toy side than on instrument side,..."
6,1329,['https://images-na.ssl-images-amazon.com/imag...,Absolute BEST guitar hangers on the market... ...
7,325,['https://images-na.ssl-images-amazon.com/imag...,"Great nylon strings, just as expected. They wo..."
8,245,['https://images-na.ssl-images-amazon.com/imag...,I bought this stand for church because I been ...
9,1714,['https://images-na.ssl-images-amazon.com/imag...,Awesome stand!\n\nTip: The bottom part that su...


Collecting all image links

In [30]:
links = []
for idx, row in data.iterrows():
    link_arr = ast.literal_eval(row[1])
    for link in link_arr:
        links.append(link)

In [31]:
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [32]:
model = models.resnet50(pretrained = True)
model = torch.nn.Sequential(*(list(model.children())[:-1]))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()



Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [49]:
def download_images(link):
    response = requests.get(link)
    try:
        img = Image.open(BytesIO(response.content)).convert('RGB')
        return img
    except Exception as e:
        print(f"Failed to get image from {link} due to error {e}")
        return None

images = {}
for link in links:
    image = download_images(link)
    if images is None:
        continue
    else:
        images[link] = image
    

Failed to get image from https://images-na.ssl-images-amazon.com/images/I/71F3npeHUDL._SY88.jpg due to error cannot identify image file <_io.BytesIO object at 0x0000025E8842E700>
Failed to get image from https://images-na.ssl-images-amazon.com/images/I/71wHUWncMGL._SY88.jpg due to error cannot identify image file <_io.BytesIO object at 0x0000025E8AA72DE0>
Failed to get image from https://images-na.ssl-images-amazon.com/images/I/71B8OOE5N8L._SY88.jpg due to error cannot identify image file <_io.BytesIO object at 0x0000025E8842EB10>
Failed to get image from https://images-na.ssl-images-amazon.com/images/I/81SX3oAWbNL._SY88.jpg due to error cannot identify image file <_io.BytesIO object at 0x0000025E8842E7F0>
Failed to get image from https://images-na.ssl-images-amazon.com/images/I/718niQ1GEwL._SY88.jpg due to error cannot identify image file <_io.BytesIO object at 0x0000025E88596C50>
Failed to get image from https://images-na.ssl-images-amazon.com/images/I/61OboZT-kcL._SY88.jpg due to er

In [50]:
len(images)

1618

In [47]:
def preprocess_and_extract_features(img):
    # Apply the preprocessing transformations
    img_t = transform(img)
    img_t = img_t.unsqueeze(0)  # Add a batch dimension
    img_t = img_t.to(device)  # Move the input tensor to the device
    
    # Extract features with no gradient calculation for efficiency
    with torch.no_grad():
        features = model(img_t)

    features_flattened = features.view(-1)
    # Move the features to CPU for further processing or storage
    features_flattened = features_flattened.cpu()
    
    return features_flattened

In [52]:
images_features = {}
for link, img in images.items():
    if img is None:
        continue
    images_features[link] = preprocess_and_extract_features(img)

In [54]:
features_path = 'image_features.pkl'

with open(features_path, 'wb') as file:
    pickle.dump(images_features, file)

In [55]:
features_path = 'image_features.pkl'

with open(features_path, 'rb') as file:
    image_features = pickle.load(file)

In [5]:
reviews = []
for idx, row in data.iterrows():
    if pd.isna(row[2]):
        reviews.append("")
        continue
    text = row[2]
    reviews.append(text)

In [6]:
def preprocess_text(text):
    # Lowercasing the text
    text = text.lower()

    # Removing all punctuations
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Tokenizing text
    tokens = nltk.word_tokenize(text)
    
    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization on tokens 
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

In [16]:
# Tokenizing all documents
tokenized_docs = []
for doc in reviews:
    tokenized_docs.append(preprocess_text(doc))

for doc in tokenized_docs:
    for term in doc:
        print(term)
    break

loving
vintage
spring
vintage
strat
good
tension
great
stability
floating
bridge
want
spring
way
go


Defining functions for calculating term frequency and inverse document frequency

In [19]:
def compute_tf(tokenized_docs):
    tf_dict = {}
    for document in tokenized_docs:
        for term in document:
            if term in tf_dict:
                tf_dict[term] += 1
            else:
                tf_dict[term] = 1
    for term, count in tf_dict.items():
        tf_dict[term] = count / float(len(tokenized_docs))
    return tf_dict

def compute_idf(documents):
    idf_dict = {}
    N = len(documents)
    
    # Initialize document frequency (DF) counts
    for document in documents:
        for word in document:
            if word in idf_dict:
                idf_dict[word] += 1
            else:
                idf_dict[word] = 1
    
    # Convert DF counts to IDF scores
    for word, val in idf_dict.items():
        idf_dict[word] = math.log10(N / float(val))
        
    return idf_dict


def compute_tf_idf(tf, idf):
    tf_idf = {}
    for word, val in tf.items():
        tf_idf[word] = val * idf[word]
    return tf_idf

In [57]:
# Creating term frequency dictionary 
tf_dict = compute_tf(tokenized_docs)
print(tf_dict)
print(len(tf_dict))

# Creating inverse document frequency dictionary
idf_dict = compute_idf(tokenized_docs)
print(idf_dict)
print(len(idf_dict))

# Creating TF-IDF dictionary
tf_idf_scores = compute_tf_idf(tf_dict, idf_dict)
print(tf_idf_scores)
print(len(tf_idf_scores))

6125
6125
6125


In [58]:
text_features = 'doc_tf_idf_scores.pkl'

with open(text_features, 'wb') as file:
    pickle.dump(tf_idf_scores, file)