<a href="https://colab.research.google.com/github/arya-snh/CSE508_Winter2024_A2_2020498/blob/main/IR_A2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import re
import pickle
from string import punctuation
from collections import Counter
import math

In [2]:
import numpy as np
import pandas as pd
import cv2
import requests
from keras.applications import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing.image import img_to_array, load_img
from sklearn.preprocessing import StandardScaler
from io import BytesIO

In [3]:
import pandas as pd
data = pd.read_csv('/content/drive/MyDrive/IR/A2/A2_Data.csv', header=None, names=['ProductID', 'Image path', 'Review'], skiprows=1)

### Q1. Image Feature Extraction

In [4]:
# Image preprocessing function
def preprocess_image(image):
    image = cv2.resize(image, (224, 224))
    image = img_to_array(image)
    image = np.expand_dims(image, axis=0)
    image = preprocess_input(image)
    return image

# Feature extraction using VGG16
def extract_features_vgg(image):
    model = VGG16(weights='imagenet', include_top=False)
    try:
        image = preprocess_image(image)
        features = model.predict(image)
        features = features.flatten()
        return features.tolist()  # Convert to list before storing
    except Exception as e:
        print(f"Error extracting features: {e}")
        return None

In [None]:
# Apply image preprocessing and feature extraction
data['extracted_features'] = [[]] * len(data)
for index, row in data.iterrows():
    image_url_list = eval(row['Image path'])
    features_list = []
    for image_url in image_url_list:
        try:
            response = requests.get(image_url)
            image = cv2.imdecode(np.frombuffer(response.content, np.uint8), -1)
            features = extract_features_vgg(image)
            if features is not None:
                features_list = features
            break
        except Exception as e:
            print(f"Error processing image: {e}")
    data.at[index, 'extracted_features'] = features_list

In [None]:
scaler = StandardScaler()
normalized_features = []

for features in data['extracted_features']:
    if features is not None and len(features) > 0:  # Check if features list is not empty
        normalized_feature = scaler.fit_transform([features]).flatten().tolist()  # Convert array to list
        normalized_features.append(normalized_feature)
    else:
        normalized_features.append(None)

data['extracted_features'] = normalized_features

data.to_csv("dataset_with_features.csv", index=False)

In [None]:
with open('/content/drive/MyDrive/IR/A2/image_features.pkl', 'wb') as f:
    pickle.dump(data['extracted_features'], f)

In [9]:
with open('/content/drive/MyDrive/IR/A2/image_features.pkl', 'rb') as f:
    image_features_loaded = pickle.load(f)

print(image_features_loaded)

0      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2      [0.0, 0.0, 0.0, 0.0, 10.804707527160645, 10.81...
3      [10.733154296875, 0.0, 27.070093154907227, 5.0...
4      [0.0, 0.0, 0.0, 33.649898529052734, 0.0, 19.03...
                             ...                        
995    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
996    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 45.360614776611...
997    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
998    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
999    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: extracted_features, Length: 1000, dtype: object


### Q2. Text Feature Extraction

In [5]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):

    if pd.isnull(text):
        return []
    # Lowercasing
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    # Removing punctuations
    tokens = [token for token in tokens if token not in string.punctuation]
    # Stop Word Removal
    stop_words = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
    tokens = [token for token in tokens if token not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
    return lemmatized_tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [6]:
data['Processed Text'] = data['Review'].apply(preprocess_text)

In [7]:
def calculate_tfidf(docs):
    tfidf_scores = []
    num_docs = len(docs)
    all_words = set([word for doc in docs for word in doc])

    for doc in docs:
        word_freq = Counter(doc)
        tfidf_doc = {}
        for word in set(doc):
            tf = word_freq[word] / len(doc)
            idf = math.log(num_docs / sum(1 for doc in docs if word in doc))
            tfidf_doc[word] = tf * idf
        tfidf_scores.append(tfidf_doc)

    return tfidf_scores

In [86]:
tfidf_scores = calculate_tfidf(data['Processed Text'])

In [87]:
with open('/content/drive/MyDrive/IR/A2/tfidf_scores.pkl', 'wb') as f:
    pickle.dump(tfidf_scores, f)

In [8]:
with open('/content/drive/MyDrive/IR/A2/tfidf_scores.pkl', 'rb') as f:
    tfidf_scores = pickle.load(f)

# Display the loaded TF-IDF scores
print(tfidf_scores)

[{'bridg': 0.24152819674042683, 'strat': 0.22311183179027683, 'stabil': 0.3448795526644501, 'love': 0.14796238371647566, 'spring': 0.7088350185609748, 'vintag': 0.5739119315837103, 'tension': 0.3448795526644501, 'want': 0.15386893484826944, 'way': 0.1869497027024104, 'good': 0.11284850787089686, 'float': 0.41493878502243053, 'great': 0.0813881630848832}, {'mat': 0.3270846367590627, 'care': 0.3656597973154727, 'screw': 0.13996105457540936, 'make': 0.10738004360666076, 'organ': 0.2788588087656861, 'abus': 0.3057443679112646, 'workspac': 0.3635660673148493, 'color': 0.15265379440787716, "n't": 0.057245913085777644, 'roll': 0.25412177564748956, 'work': 0.07712302991896544, 'bench': 0.3270846367590627, 'good': 0.08315153211539768, 'wo': 0.17953935355345563, 'guitar': 0.06643728322836312, 'rug': 0.24792266850767986, 'easier': 0.20589594765411293, 'great': 0.05997022543096656}, {'avail': 0.08351549849078078, 'greatest': 0.10618194072812012, 'music': 0.06051067621708797, 'comput': 0.0954200986

In [None]:
tfidf_scores_loaded[0]

{'strat': 0.22161575603506845,
 'floating': 0.3872761993542685,
 'vintage': 0.5356511361447963,
 'want': 0.1763383601293881,
 'great': 0.07596228554589099,
 'tension': 0.3218875824868201,
 'go': 0.1659276447457026,
 'loving': 0.36809739452414975,
 'way': 0.1763383601293881,
 'bridge': 0.2274165145010438,
 'springs': 0.7064423155397381,
 'stability': 0.35322115776986907,
 'good': 0.10564968665624859}

### Q3. Image Retrieval and Text Retrieval

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

#### Image Retrieval

In [41]:
input_review = input("Input review: ")
input_image_url = 'https://images-na.ssl-images-amazon.com/images/I/71HSx4Y-5dL._SY88.jpg'
# input_image_url = input("Input image url: ")
response = requests.get(input_image_url)
input_image = cv2.imdecode(np.frombuffer(response.content, np.uint8), -1)
input_features = extract_features_vgg(input_image)

Input review: arya


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Function to compute cosine similarity between two feature vectors
def compute_cosine_similarity(input_features, dataset_features):
    similarity_scores = cosine_similarity(input_features.reshape(1, -1), dataset_features)
    return similarity_scores[0]

In [42]:
try:

    # Sample code for indices retrieval
    indices = [idx for idx, f in enumerate(image_features_loaded) if len(f) == len(input_features)]
    dataset_reviews = [data['Review'].iloc[idx] for idx in indices]
    dataset_features = [image_features_loaded.iloc[idx] for idx in indices]
    input_features = np.array(input_features)

    similarity_scores = compute_cosine_similarity(input_features, dataset_features)

    sorted_indices = np.argsort(similarity_scores)[::-1]
    top_three_indices = sorted_indices[0:3]

    print("Top three most similar results based on input image URL:\n")

    for i, index in enumerate(top_three_indices):
        # print(eval(image_url))
        print('Review: ', dataset_reviews[index])
        print('Image URL: ', eval(data['Image path'].iloc[index])[0])
        print("Similarity score:", similarity_scores[index], "\n")

except Exception as e:
    print(f"Error computing similarity: {e}")


Top three most similar results based on input image URL:

Review:  Works great as a guitar bench mat. Not rugged enough for abuse but if you take care of it, it will take care of you. Makes organization of workspace much easier because screws won't roll around. Color is good too.
Image URL:  https://images-na.ssl-images-amazon.com/images/I/71HSx4Y-5dL._SY88.jpg
Similarity score: 0.9999999999999997 

Review:  Great little set. Practice pads with options!

**Update**
I have been smashing this little kit for a few weeks now, and it is holding up just fine..I play Classic rock and 90's metal on it, no problems.
Looks like a toy, plays like a pro set.
Image URL:  https://images-na.ssl-images-amazon.com/images/I/714WqVbbWvL._SY88.jpg
Similarity score: 0.22523683626702284 

Review:  It nicer than I thought a protects my soprano better than the other bag I had it in, I would recommend it.
Image URL:  https://images-na.ssl-images-amazon.com/images/I/71yh456bWGL._SY88.jpg
Similarity score: 0.222

#### Text Retrieval

In [13]:
import math
from collections import Counter

import math

def cosine_similarity_dict(input_dict, dict_scores):
    # Convert dictionaries to vectors
    keys = set(input_dict.keys()).union(dict_scores.keys())
    vector1 = [input_dict.get(key, 0) for key in keys]
    vector2 = [dict_scores.get(key, 0) for key in keys]

    # Reshape vectors into 2D arrays
    vector1 = np.array(vector1).reshape(1, -1)
    vector2 = np.array(vector2).reshape(1, -1)

    # Compute cosine similarity
    similarity = cosine_similarity(vector1, vector2)[0, 0]
    return similarity

In [29]:
def calculate_tfidf_input(docs, input):
    tfidf_scores = []
    num_docs = len(docs)
    all_words = set([word for doc in docs for word in doc])

    for doc in input:
        word_freq = Counter(doc)
        tfidf_doc = {}
        for word in set(doc):
            tf = word_freq[word] / len(doc)
            doc_freq = sum(1 for doc in docs if word in doc)
            if doc_freq == 0:
                tfidf_doc[word] = 0
            else:
                idf = math.log(num_docs / sum(1 for doc in docs if word in doc))
                tfidf_doc[word] = tf * idf
        tfidf_scores.append(tfidf_doc)

    return tfidf_scores

def calculate_query_tfidf(corpus_tfidf, query, num_docs =1000):
    # Initialize TF-IDF scores for the query
    tfidf_query = {}
    word_freq_query = Counter(query)

    # Calculate TF-IDF for the query
    for word in set(query):
        tf = word_freq_query[word] / len(query)
        idf = math.log(num_docs / sum(1 for doc_tfidf in corpus_tfidf if word in doc_tfidf))
        tfidf_query[word] = tf * idf

    return tfidf_query

def score(n):
    return n[2]

In [33]:
input_review = input("Input review: ")
input_image_url = input("Input image url: ")

processed_input_review = preprocess_text(input_review)

# Calculate TF-IDF scores for the input review
input_tfidf = calculate_tfidf_input(data['Processed Text'], [processed_input_review])[0]
# print(input_tfidf)
# Compute cosine similarity between the input review and all other reviews
similarities = [
    (data['Review'][i], data['Image path'][i], cosine_similarity_dict(input_tfidf, tfidf_scores[i]))
    for i in range(len(data['Review']))
]

# Sort the reviews based on cosine similarity scores in descending order
# sorted(similarites, key = score)
similarities.sort(key=score, reverse=True)

# Get the top three most similar reviews
top_three_similar_reviews = similarities[:3]

print("\nTop three most similar results based on text reviews:\n")

for review, image_url, similarity in top_three_similar_reviews:
    # print(eval(image_url))
    print('Review: ', review)
    print('Image URL: ', eval(image_url)[0])
    print("Similarity score:", similarity, "\n")

Input review: Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
Input image url: sfdrgdg

Top three most similar results based on text reviews:

Review:  Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
Image URL:  https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg
Similarity score: 1.0000000000000002 

Review:  Nice solid springs and defeinitely more silent. Easy installation and the black looks cool.

Pictured with some old uninstalled springs next to them.
Image URL:  https://images-na.ssl-images-amazon.com/images/I/81Z1d7HaBfL._SY88.jpg
Similarity score: 0.33774204412815967 

Review:  Fits great but the only complaint I have is that the tremolo springs are cheap quality and 

### Q4. Combined Retrieval (Text and Image)