In [None]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import os
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import math
import pickle
from PIL import Image, ImageEnhance, ImageOps, UnidentifiedImageError
import cv2
import numpy as np
import requests
from io import BytesIO
import ast
import torch
from torchvision import models, transforms
import asyncio
import aiohttp
# import tabulate


In [9]:
csv_path = '/content/A2_Data.csv'

df = pd.read_csv(csv_path)
os.makedirs('reviews')
os.makedirs('processed_images')


In [55]:
df

Unnamed: 0,Serial,Image,Review Text
0,3452,['https://images-na.ssl-images-amazon.com/imag...,Loving these vintage springs on my vintage str...
1,1205,['https://images-na.ssl-images-amazon.com/imag...,Works great as a guitar bench mat. Not rugged ...
2,1708,['https://images-na.ssl-images-amazon.com/imag...,We use these for everything from our acoustic ...
3,2078,['https://images-na.ssl-images-amazon.com/imag...,Great price and good quality. It didn't quite...
4,801,['https://images-na.ssl-images-amazon.com/imag...,I bought this bass to split time as my primary...
...,...,...,...
995,1265,['https://images-na.ssl-images-amazon.com/imag...,Extremely impressed with this kit.
996,1882,['https://images-na.ssl-images-amazon.com/imag...,This is a great stereo reverb with plenty of c...
997,1547,['https://images-na.ssl-images-amazon.com/imag...,I really like the simplicity of this bridge. I...
998,1004,['https://images-na.ssl-images-amazon.com/imag...,"Great Product, but there is no warranty in the..."


# Q1 - Image Feature Extraction [25]

In [None]:
def preprocess_image(image_data):
    try:
        img = Image.open(BytesIO(image_data))

        # Apply preprocessing
        img = img.resize((256, 256))
        enhancer = ImageEnhance.Contrast(img)
        img = enhancer.enhance(1.2)
        enhancer = ImageEnhance.Brightness(img)
        img = enhancer.enhance(1.2)

        # Random flip
        if np.random.rand() > 0.5:
            img = ImageOps.flip(img)
        if np.random.rand() > 0.5:
            img = ImageOps.mirror(img)

        # Adjust exposure
        hsv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2HSV)
        hsv = np.array(hsv, dtype=np.float64)
        hsv[:, :, 2] = hsv[:, :, 2] * 1.3
        hsv[:, :, 2][hsv[:, :, 2] > 255] = 255
        hsv = np.array(hsv, dtype=np.uint8)
        processed_img = Image.fromarray(cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB))

        return processed_img
    except Exception as e:
        print(f"Error processing image with ID: {e}")
        return None

async def fetch_image(session, url):
    async with session.get(url) as response:
        return await response.read()

async def preprocess_and_save_image(urls, serial):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_image(session, url) for url in urls]
        images = await asyncio.gather(*tasks)

        i = 1
        for image_data in images:
            processed_img = preprocess_image(image_data)
            if processed_img:
                # Save processed image
                processed_img.save(f"processed_images/{serial}_{i}.jpg")
                i += 1

# Assuming initialData is already loaded in your notebook
tasks = []
for index, row in df.iterrows():
    try:
        urls = ast.literal_eval(str(row['Image']))
        tasks.append(preprocess_and_save_image(urls, row['Serial']))
    except Exception as e:
        print(f"Error processing row {row['Serial']}: {e}")

# Run the tasks asynchronously
await asyncio.gather(*tasks)


In [None]:
model = models.resnet50(pretrained=True)
my_model = torch.nn.Sequential(*(list(model.children())[:-1]))
my_model.eval()

In [13]:
Matrix = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def feature_extraction(image_path, neural_net):
    image = Image.open(image_path).convert('RGB')
    preprocessed_image = Matrix(image)
    img_bat = torch.unsqueeze(preprocessed_image, 0)  # Add batch dimension

    with torch.no_grad():
        features = neural_net(img_bat)
    return features.cpu().numpy().flatten()

extracted_features = {}

for image_file in os.listdir('processed_images'):
    if image_file.endswith('.jpg'):
        image_path = os.path.join('processed_images', image_file)
        features = feature_extraction(image_path, my_model)
        extracted_features[image_file] = features

with open('image_features.pkl', 'wb') as file:
    pickle.dump(extracted_features, file)





In [56]:
pick = pd.read_pickle('image_features.pkl')
pick


{'3246_1.jpg': array([0.6505777 , 0.36373988, 0.43628898, ..., 0.44552633, 0.52106196,
        0.56624043], dtype=float32),
 '3305_1.jpg': array([0.02370333, 0.6220905 , 0.31054088, ..., 0.56659466, 0.08571987,
        0.53868747], dtype=float32),
 '3482_3.jpg': array([0.5597773 , 0.3058252 , 0.4641837 , ..., 0.17269003, 0.8370169 ,
        0.17897555], dtype=float32),
 '1953_1.jpg': array([0.41019368, 0.05721625, 0.2544602 , ..., 0.08672953, 0.5904397 ,
        0.24659225], dtype=float32),
 '3131_1.jpg': array([0.18368661, 0.37331516, 0.02981079, ..., 0.39301455, 0.02943265,
        0.15206678], dtype=float32),
 '3385_2.jpg': array([0.08961152, 0.6202171 , 0.33332953, ..., 0.59013975, 0.86161923,
        0.6308837 ], dtype=float32),
 '116_4.jpg': array([0.32752824, 0.4794018 , 0.25258723, ..., 0.18839796, 0.17395975,
        0.1182198 ], dtype=float32),
 '2570_1.jpg': array([0.8279813 , 0.2529753 , 0.43573612, ..., 0.18699047, 0.57402134,
        0.4731781 ], dtype=float32),
 '1550_1.

In [28]:
import pickle
import numpy as np

with open('image_features.pkl', 'rb') as f:
    features_dict = pickle.load(f)

normal_feat_dict = {}
for k, v in features_dict.items():
    if np.linalg.norm(v) > 0:
        normalized_vector = v / np.linalg.norm(v)
        normal_feat_dict[k] = normalized_vector
    else:
        normal_feat_dict[k] = v

with open('image_features_normalized.pkl', 'wb') as f:
    pickle.dump(normal_feat_dict, f)

In [75]:
pick = pd.read_pickle('image_features_normalized_.pkl')
for key in (pick.keys()):
    print(key, ":", pick[key])

3246_1.jpg : [0.02298081 0.01284864 0.01541134 ... 0.01573764 0.01840584 0.02000171]
3305_1.jpg : [0.00081183 0.02130637 0.01063591 ... 0.01940566 0.00293587 0.01844984]
3482_3.jpg : [0.01996722 0.01090876 0.0165574  ... 0.00615984 0.02985633 0.00638404]
1953_1.jpg : [0.01485107 0.00207152 0.00921274 ... 0.00314005 0.02137689 0.00892788]
3131_1.jpg : [0.00603531 0.01226586 0.00097948 ... 0.01291311 0.00096706 0.00499639]
3385_2.jpg : [0.00300646 0.02080821 0.01118317 ... 0.01979912 0.02890723 0.02116608]
116_4.jpg : [0.0107353  0.01571321 0.00827898 ... 0.00617507 0.00570183 0.00387486]
2570_1.jpg : [0.02770649 0.00846524 0.01458091 ... 0.00625721 0.01920831 0.01583382]
1550_1.jpg : [0.01650663 0.01060194 0.03270977 ... 0.02716612 0.00926085 0.01078879]
3842_3.jpg : [0.01631217 0.01171478 0.00358185 ... 0.002952   0.04095766 0.0103694 ]
2577_1.jpg : [0.0208213  0.01004682 0.00354347 ... 0.0009017  0.01896323 0.01321682]
107_1.jpg : [0.01349708 0.03664231 0.01017546 ... 0.00496392 0.040

In [29]:
with open('image_features_normalized.pkl', 'rb') as file:
    normalized_features = pickle.load(file)

# Q2 Text Feature Extraction [25]

In [20]:
def process_text(data):
    data = data.lower()
    tokens = word_tokenize(data)
    custom_table = str.maketrans('', '', string.punctuation)
    stripped_tokens = [token.translate(custom_table) for token in tokens]
    stop_words = set(stopwords.words('english'))
    processed_words = [word for word in stripped_tokens if word not in stop_words and word.isalpha()]
    porter_stemmer = PorterStemmer()
    stemmed_words = [porter_stemmer.stem(word) for word in processed_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]
    return ' '.join(lemmatized_words)

result_folder = 'reviews'
os.makedirs(result_folder, exist_ok=True)

# Apply text processing
for idx, record in df.iterrows():
    text = process_text(str(record['Review Text']))
    path = os.path.join(result_folder, f"{record['Serial']}.txt")

    with open(path, 'w', encoding='utf-8') as file:
        file.write(text)


In [21]:
def calculate_tf(text_data):
    tokens = text_data.split()
    num_tokens = {}
    for token in tokens:
        num_tokens[token] = num_tokens.get(token, 0) + 1
    total_tokens = len(tokens)
    tf = {token: count / total_tokens for token, count in num_tokens.items()}
    return tf


def calculate_idf(documents):
    idf_counts = {}
    total_documents = len(documents)
    for document in documents:
        seen_tokens = set()
        for token in document.split():
            if token not in seen_tokens:
                idf_counts[token] = idf_counts.get(token, 0) + 1
                seen_tokens.add(token)

    idf = {}
    for token, count in idf_counts.items():
        idf[token] = math.log(total_documents / count)
    return idf


def calculate_tfidf(documents_data):
    inverse_document_frequency = calculate_idf(documents_data)
    tfidf_documents_list = []

    for document_data in documents_data:
        term_frequency = calculate_tf(document_data)
        tfidf = {}
        for token, tf_score in term_frequency.items():
          tfidf[token] = tf_score * inverse_document_frequency[token]
        tfidf_documents_list.append(tfidf)
    return tfidf_documents_list


def ids_tfidf(document_data, doc_ids):
    inverse_document_frequency = calculate_idf(document_data)
    tfidf_documents = {}
    for document, doc_id in zip(document_data, doc_ids):
        term_frequency = calculate_tf(document)
        tfidf = {}
        for token, tf_score in term_frequency.items():
          tfidf[token] = tf_score * inverse_document_frequency[token]
        tfidf_documents[doc_id] = tfidf
    return tfidf_documents


processed_texts = []
ids = []
data_folder = 'reviews'

for filename in sorted(os.listdir(data_folder)):
    if filename.endswith('.txt'):
        ids.append(filename.split('.')[0])
        file_path = os.path.join(data_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            processed_texts.append(file.read())

tfidf_results = ids_tfidf(processed_texts, ids)

with open('tfidf_scores_with_ids.pkl', 'wb') as f:
    pickle.dump(tfidf_results, f)


In [74]:
pick = pd.read_pickle('tfidf_scores_with_ids.pkl')
for key in reversed(pick.keys()):
    print(key, ":", pick[key])

996 : {'search': 0.08937143062021581, 'find': 0.05344258576149217, 'reverb': 0.1485634996093016, 'pedal': 0.1106259587291671, 'would': 0.026903076611809355, 'abl': 0.10315099130746841, 'fill': 0.0742817498046508, 'shoe': 0.08937143062021581, 'builtin': 0.0787076278423705, 'use': 0.017149871855353743, 'due': 0.05943435140398022, 'unheard': 0.10627315813818673, 'low': 0.051142097546554265, 'price': 0.028387080707002618, 'half': 0.14493940620448978, 'expect': 0.04737098280758527, 'get': 0.02445592746366032, 'fast': 0.06180590032439958, 'perform': 0.06180590032439958, 'fortun': 0.0849455525824961, 'case': 0.03944538241736082, 'upon': 0.0787076278423705, 'plug': 0.051142097546554265, 'unit': 0.06180590032439958, 'test': 0.06461084735199889, 'le': 0.0420518155244077, 'paul': 0.05295414424896017, 'guitar': 0.019420128943675376, 'practic': 0.05344258576149217, 'amp': 0.04277878298364686, 'amaz': 0.04770911983402796, 'hear': 0.06567227614410583, 'well': 0.025150088006436776, 'inexpens': 0.05738

# Q3 - Image Retrieval and Text Retrieval [25]

In [24]:
def cosine_similarity(v1, v2):
    dot_product = np.dot(v1, v2)
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)
    similarity = dot_product / (norm_v1 * norm_v2)
    return similarity

## a)

In [66]:
def top3_Images(id, img_features, top_k=3):
    """Find the top_k most similar images to the input image."""
    # Extract the feature vector for the input image
    input_features = img_features[id]


    similarities = {}
    for iden, features in img_features.items():
        if iden != id:
            similarity = cosine_similarity(input_features, features)
            similarities[iden] = similarity


    sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    all_ids = set(id.split('_')[0] for id in [id])
    top_3_images = []
    for image_id, similarity in sorted_similarities:
        first = image_id.split('_')[0]
        if first not in all_ids:
            all_ids.add(first)
            top_3_images.append((image_id, similarity))
            if len(top_3_images) == top_k:
                break
    return top_3_images

with open('image_features_normalized.pkl', 'rb') as f:
    image_features = pickle.load(f)


Image_Name = "4_1.jpg"
top_similar_images = top3_Images(Image_Name, image_features)
for id, similarity in top_similar_images:
    print(f"Image Name in folder is: {id}, Similarilty: {similarity}")

Image Name in folder is: 2883_2.jpg, Similarilty: 0.8872929215431213
Image Name in folder is: 575_1.jpg, Similarilty: 0.8113231658935547
Image Name in folder is: 1322_2.jpg, Similarilty: 0.7778549790382385


## b)

In [65]:
with open('tfidf_scores_with_ids.pkl', 'rb') as f:
    pickle_tfidf = pickle.load(f)


def cosine_similarity_sparse(v1, v2):
    shared_keys = set(v1.keys()) & set(v2.keys())
    dot_product = sum([v1[x] * v2[x] for x in shared_keys])

    sum1 = sum([v1[x]**2 for x in v1.keys()])
    sum2 = sum([v2[x]**2 for x in v2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(dot_product) / denominator

def top3_reviews(id, pickle_tfidf, top_k=3):
    input_tfidf = pickle_tfidf[id]

    similarities = {}
    for review_id, tfidf in pickle_tfidf.items():
        if review_id != id:
            similarity = cosine_similarity_sparse(input_tfidf, tfidf)
            similarities[review_id] = similarity

    # Sort reviews by similarity
    sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

    most_similar_reviews = sorted_similarities[:top_k]

    return most_similar_reviews

review_id = '4'
most_similar_reviews = top3_reviews(review_id, pickle_tfidf)

for id, similarity in most_similar_reviews:
    print(f"Review Serial is: {id}, Similarilty : {similarity}")

Review Serial is: 2883, Similarilty : 1.0
Review Serial is: 429, Similarilty : 0.1724655546550282
Review Serial is: 3712, Similarilty : 0.17103550209316773


In [96]:
review_corpus_sim, image_corpus_sim = {}, {}

for input_review_id in pickle_tfidf.keys():
    rws = top3_reviews(input_review_id, pickle_tfidf)
    review_corpus_sim[input_review_id] = rws

for input_id in image_features.keys():
    imgs = top3_Images(input_id, image_features)
    image_corpus_sim[input_id] = imgs


with open('review_only_retrieval.pkl', 'wb') as f:
    pickle.dump(review_corpus_sim, f)

with open('image_only_retrieval.pkl', 'wb') as f:
    pickle.dump(image_corpus_sim, f)


# 4. Combined Retrieval (Text and Image)

In [97]:
def avg_similarities(id, image_features, tfidf_scores):
    # image similarities
    img_similarities = {}
    img_key = id + "_1.jpg"
    primary_image = image_features[img_key]
    for id1, features in image_features.items():
        if id1 != img_key:
            similar = cosine_similarity(primary_image, features)
            img_similarities[id1.split('_')[0]] = similar

    # review similarities
    review_similarities = {}
    primary_image_tfidf = tfidf_scores[id]
    for id1, tfidf in tfidf_scores.items():
        if id1 != id:
            sim = cosine_similarity_sparse(primary_image_tfidf, tfidf)
            review_similarities[id1] = sim

    # average similarities
    dict_avg_similarities = {}
    for id1 in set(img_similarities.keys()).union(review_similarities.keys()):
        avg_sim = (img_similarities.get(id1, 0) + review_similarities.get(id1, 0)) / 2
        dict_avg_similarities[id1] = avg_sim
    # show top threee
    sorted_similarities = sorted(dict_avg_similarities.items(), key=lambda x: x[1], reverse=True)[:3]

    return sorted_similarities

Serial = '4'
sorted_similarities = avg_similarities(Serial, image_features, pickle_tfidf)

for id, sim in sorted_similarities:
    print(f"Serial ID: {id}, Average Similarity: {sim}")

composite_similarities_all = {}
for Serial in image_features.keys():
    this = Serial.split('_')
    this = this[0]
    top3_similar = avg_similarities(this, image_features, pickle_tfidf)
    # composite_similarities in a dictionary
    composite_similarities_all[this] = top3_similar

# Save the results
with open('composite_retrieval.pkl', 'wb') as f:
    pickle.dump(composite_similarities_all, f)
print("The tuple looks like the following")
print(composite_similarities_all['4'])

Serial ID: 2883, Average Similarity: 0.9436464607715607
Serial ID: 3533, Average Similarity: 0.43267007247769557
Serial ID: 2039, Average Similarity: 0.4261809874984757
The tuple looks like the following
[('2883', 0.9436464607715607), ('3533', 0.43267007247769557), ('2039', 0.4261809874984757)]


# Result

In [91]:
image_url = 'https://images-na.ssl-images-amazon.com/images/I/71UyzJrC3DL._SY88.jpg'
input_review = '''Awesome stand!

Tip: The bottom part that supports the guitar had a weird angle when arrived, making the guitar slide back, becoming almost 100% on a vertical.
To solve this, I assembled the product and the put a some pressure on the support frame, making it bend a little. Now my guitar sits perfectly. Check photos!'''


In [93]:
image_serial = (df[df['Image'].astype(str).str.contains(image_url)])['Serial'].unique().tolist()
review_serial = (df[df['Review Text'] == input_review])['Serial'].unique().tolist()
image_serial, review_serial

([307], [1714])

In [94]:
url_id = str(image_serial[0]) + "_1.jpg"
review_id = str(review_serial[0])
print(url_id, review_id)

307_1.jpg 1714


In [98]:
pickle_loaded_images = pd.read_pickle('image_only_retrieval.pkl')
print(f"Provided Image Serial: {url_id}]:")

input_code = url_id.split('_')[0]
i = 1
for search_id, sim_score in pickle_loaded_images[url_id]:
    print("-" * 50)
    fetch_id = search_id.split('_')[0]

    image_urls = df[df['Serial'] == int(fetch_id)]['Image'].values
    reviews = df[df['Serial'] == int(fetch_id)]['Review Text'].values
    print(f"Similar Image {i} | Serial : ", search_id)
    i+=1

    for url in image_urls:
        print("Associated urls: ", url)

    for review in reviews:
        print("Serial Review: ", review)

    print(f"Cosine Similarity of the Images: {sim_score}")

    # similar text between inout and review using tfidf
    text_similarity = cosine_similarity_sparse(pickle_tfidf[input_code], pickle_tfidf[str(fetch_id)])
    print(f"Cosine similarity of text (between {input_code} & {str(fetch_id)}): {text_similarity}")

    composite_similarity = (sim_score + text_similarity) / 2
    print(f"Similarity Score(Composite ): {composite_similarity}")
    print("-" * 50)



Provided Image Serial: 307_1.jpg]:
--------------------------------------------------
Similar Image 1 | Serial :  2983_1.jpg
Associated urls:  ['https://images-na.ssl-images-amazon.com/images/I/810onrz-lPL._SY88.jpg']
Serial Review:  Worked perfect for what I needed them for!!!!
Cosine Similarity of the Images: 0.8711623549461365
Cosine similarity of text (between 307 & 2983): 0.0
Similarity Score(Composite ): 0.43558117747306824
--------------------------------------------------
--------------------------------------------------
Similar Image 2 | Serial :  3145_1.jpg
Associated urls:  ['https://images-na.ssl-images-amazon.com/images/I/81Eq6y34BYL._SY88.jpg']
Serial Review:  Superb feel and sound!

Tips for wah success:
- rock pedal all of the way forward
- match the volume knob to your Bass
- crank the "Q"
- love life!

Seriously, the range this pedal hits when the "Q" is amazing. I love how expressive the tone is when you get it going! I think it retains the low end and adds a creamy

In [99]:
pickle_loaded_reviews = pd.read_pickle('review_only_retrieval.pkl')

enter_id_text = '307'
print(f"[Provided Text Serial: {enter_id_text}]: ")
i = 1
for search_id, sim_score in pickle_loaded_reviews[enter_id_text]:
    print("-" * 50)
    image_urls = df[df['Serial'] == int(search_id)]['Image'].values
    reviews = df[df['Serial'] == int(search_id)]['Review Text'].values

    print(f"Similar Text {i} | Serial: ", search_id)
    i+=1
    for url in image_urls:
        print("Associated urls: ", url)
    for review in reviews:
        print("Serial Review: ", review)

    image_similarity = cosine_similarity(image_features[str(enter_id_text) + "_1.jpg"], image_features[str(search_id) + "_1.jpg"])
    print(f"Cosine Similarity of the images (between {enter_id_text} & {str(search_id)}): {image_similarity}")

    print(f"Cosine similarity of textual content: {sim_score}")

    composite_similarity = (sim_score + image_similarity)/2
    print(f"Similarity Score(Composite): {composite_similarity}")
    print("-" * 50)



[Provided Text Serial: 307]: 
--------------------------------------------------
Similar Text 1 | Serial:  2301
Associated urls:  ['https://images-na.ssl-images-amazon.com/images/I/71VbqpzO7WL._SY88.jpg']
Serial Review:  I find this unit adequate, I use it with my MacBook as part of a portable 'winter' studio. I also own a Scarlet 2i2 (Gen 1)  which I use in my primary studio  I find the 2i2 has superior sound quality  less audible distortion / lower noise  and I prefer the 2i2's Neutrik style XLR/TRS output connector. The Behringer is a good value  it works fine and has good features / performance for the price  but like other Behringer products I've used, the sound quality / noise / distortion, leave me wanting a better product. If features or budget are at the top of your list, this product should do the job; if however sound quality is of primary importance then I'd recommend the Scarlet 2i2 (look for sales or a used Gen 1).
Cosine Similarity of the images (between 307 & 2301): 0.7