## Creating the text vectors, combining them with the image embeddings, and computing the similarity matrix

In [1]:
import json
from ast import literal_eval

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the dataset; later cells read its 'description', 'p_id' and
# 'img_embedding' columns.
df = pd.read_json('Dataset_with_img_embeddings.json')

In [3]:
# Sanity check: (rows, columns) of the freshly loaded frame.
df.shape

(14223, 12)

### Applying TF-IDF Vectorizer on the description column.

In [4]:
# TF-IDF configuration for the description text:
#   - word-level tokens of one or more word characters, accents stripped
#   - unigrams through trigrams, English stop words removed
#   - terms must occur in at least 3 documents; vocabulary size is uncapped
tfv = TfidfVectorizer(
    analyzer="word",
    token_pattern=r'\w{1,}',
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 3),
    min_df=3,
    max_features=None,
)

In [5]:
# Learn the vocabulary from the descriptions and produce the TF-IDF matrix
# (one row per row of df).
tfv_matrix = tfv.fit_transform(df['description'])

In [6]:
# Densify the TF-IDF matrix into a DataFrame with one column per n-gram.
# The frame reuses df's index so that the later column-wise concat pairs each
# TF-IDF row with the product it was computed from, even if df's index is not
# a plain 0..n-1 range (a fresh RangeIndex here would silently misalign rows).
result = pd.DataFrame(
    data=tfv_matrix.toarray(),
    index=df.index,
    columns=tfv.get_feature_names_out(),
)

In [7]:
# The dense copy lives in `result`; drop the reference to the original matrix.
del tfv_matrix

In [8]:
# Keep only the product id and the image embedding; every other column is
# discarded before the TF-IDF features are attached.
keep_mask = df.columns.isin(['p_id', 'img_embedding'])
df = df.loc[:, keep_mask]

In [9]:
# Attach the TF-IDF columns side by side with p_id / img_embedding.
# concat(axis=1) aligns rows on the index of both frames.
df = pd.concat([df,result], axis=1)

In [10]:
# Discard any row that has no product id after the concat.
df = df.dropna(subset=['p_id'])

In [11]:
# The TF-IDF columns now live in df; drop the reference to the standalone frame.
del result

### Creating columns for every element in the Image Embedding vector.

In [12]:
# Expand each image-embedding list into one numeric column per element.
# Building the frame from a list of lists in a single call avoids
# `Series.apply(pd.Series)`, which constructs a separate Series for every row
# and is very slow at this size.
# Assumes every 'img_embedding' entry is a sequence (no missing embeddings) — TODO confirm.
split_df = pd.DataFrame(df['img_embedding'].tolist(), index=df.index)

# Swap the list-valued column for the expanded columns, aligned on the row index.
# NOTE(review): the new columns are labelled with integers (0, 1, ...); once the
# labels are converted to str they can collide with numeric TF-IDF tokens such
# as '0'. Harmless for the similarity computation, but confusing by name.
df = pd.concat([df.drop(columns=['img_embedding']), split_df], axis=1)

In [13]:
# The expanded embedding columns now live in df; drop the reference to the helper frame.
del split_df

In [14]:
# Column labels are a mix of strings (p_id, TF-IDF terms) and integers
# (embedding positions); make them all strings.
# NOTE(review): presumably needed because scikit-learn rejects mixed-type
# column names — confirm.
df.columns = df.columns.astype(str)

In [16]:
# Preview the combined feature frame (p_id, TF-IDF terms, embedding columns).
df.head(5)

Unnamed: 0,p_id,0,0 40,0 5,0 5 lycra,0 5 mtr,0 5 spendex,0 6,0 7,0 7 m,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,17048614.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.424743,0.0,1.154702,4.43566,2.593419,0.0,0.0,0.0,0.0,0.0
1,16524740.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.089751,0.0,1.575045,0.46871,0.0,0.0,0.0,1.370939,0.0,0.0
2,16331376.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.142239,0.0,2.062879,2.604944,0.0,0.0,0.0,0.0,0.879324,0.0
3,14709966.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.89077,0.0,0.611349,3.531166,0.0,0.0,0.0,0.0,6.466933,0.0
4,11056154.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.651135,0.0,0.0,0.965572,0.0,0.471014,0.0,0.0,3.518062,0.0


In [17]:
# Replace any remaining missing feature values with 0 before computing
# similarities. Done in place, so re-running earlier display cells will show
# the filled values.
df.fillna(0, inplace=True)

### Creating the similarity matrix by applying cosine similarity to the combined text and image features.

In [18]:
# Pairwise cosine similarity between all products, computed on every feature
# column (TF-IDF terms + image-embedding elements); p_id is an identifier, not
# a feature, so it is excluded.
# NOTE(review): the two feature groups are concatenated unscaled, and the
# preview shows embedding values well above 1 — the image part may dominate
# the similarity. MinMaxScaler is imported but never used; confirm intent.
similarity_matrix = cosine_similarity(df.drop(columns='p_id'))

In [19]:
# Final size of the feature frame: (rows, 1 id column + all feature columns).
df.shape

(14223, 40808)

In [20]:
# Map each product id to its row of the similarity matrix (as a plain list).
# The ids are read from the p_id column once instead of materialising a full
# row via df.iloc[idx] on every iteration. Rows of similarity_matrix are in
# the same order as the rows of df, so zip pairs them positionally.
# If a p_id occurs more than once, the last occurrence wins (as before).
product_ids = df['p_id'].astype(int).tolist()
final_json = {
    product_id: similarities.tolist()
    for product_id, similarities in zip(product_ids, similarity_matrix)
}

# Open in write mode: append mode would add a second JSON document on every
# re-run and leave the file unparseable.
with open('Multi_modal_similarity.json', 'w') as f:
    json.dump(final_json, f)