In [81]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm as tqdm 
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [82]:
df_final = pd.read_csv("df_final.csv")

In [83]:
print(df_final.dtypes)
print(df_final.shape)

User-ID          int64
Age            float64
ISBN            object
Book-Rating      int64
AgeGroup        object
Book-Title      object
Book-Author     object
dtype: object
(153764, 7)


In [84]:
df_final.head()

Unnamed: 0,User-ID,Age,ISBN,Book-Rating,AgeGroup,Book-Title,Book-Author
0,67544,30.0,2005018,8,Midle-Age (31-60),Clara Callan,Richard Bruce Wright
1,219008,60.0,2005018,7,Elderly (61-100),Clara Callan,Richard Bruce Wright
2,263325,27.0,2005018,6,Adult (19-30),Clara Callan,Richard Bruce Wright
3,2954,71.0,60973129,8,Elderly (61-100),Decision in Normandy,Carlo D'Este
4,152827,40.0,60973129,7,Midle-Age (31-60),Decision in Normandy,Carlo D'Este


In [85]:
df_final

Unnamed: 0,User-ID,Age,ISBN,Book-Rating,AgeGroup,Book-Title,Book-Author
0,67544,30.0,0002005018,8,Midle-Age (31-60),Clara Callan,Richard Bruce Wright
1,219008,60.0,0002005018,7,Elderly (61-100),Clara Callan,Richard Bruce Wright
2,263325,27.0,0002005018,6,Adult (19-30),Clara Callan,Richard Bruce Wright
3,2954,71.0,0060973129,8,Elderly (61-100),Decision in Normandy,Carlo D'Este
4,152827,40.0,0060973129,7,Midle-Age (31-60),Decision in Normandy,Carlo D'Este
...,...,...,...,...,...,...,...
153759,272482,40.0,1853045616,10,Midle-Age (31-60),The Xenophobe's Guide to the Germans,Stephan Ziedenitz
153760,272715,31.0,0312273150,10,Midle-Age (31-60),All About All About Eve: The Complete Behind-T...,Sam Staggs
153761,276263,33.0,0312273150,5,Midle-Age (31-60),All About All About Eve: The Complete Behind-T...,Sam Staggs
153762,275383,43.0,0679449132,8,Midle-Age (31-60),Virtuous Reality: How America Surrendered Disc...,Jon Katz


In [86]:
df_100 = df_final.head(100)

In [87]:
df_100

Unnamed: 0,User-ID,Age,ISBN,Book-Rating,AgeGroup,Book-Title,Book-Author
0,67544,30.0,0002005018,8,Midle-Age (31-60),Clara Callan,Richard Bruce Wright
1,219008,60.0,0002005018,7,Elderly (61-100),Clara Callan,Richard Bruce Wright
2,263325,27.0,0002005018,6,Adult (19-30),Clara Callan,Richard Bruce Wright
3,2954,71.0,0060973129,8,Elderly (61-100),Decision in Normandy,Carlo D'Este
4,152827,40.0,0060973129,7,Midle-Age (31-60),Decision in Normandy,Carlo D'Este
...,...,...,...,...,...,...,...
95,111637,53.0,0375759778,9,Midle-Age (31-60),Prague : A Novel,ARTHUR PHILLIPS
96,219008,60.0,0375759778,7,Elderly (61-100),Prague : A Novel,ARTHUR PHILLIPS
97,235867,36.0,0375759778,7,Midle-Age (31-60),Prague : A Novel,ARTHUR PHILLIPS
98,95703,31.0,0425163091,9,Midle-Age (31-60),Chocolate Jesus,Stephan Jaramillo


In [88]:
df_100.to_csv("df_100.csv", index=False)

In [89]:
df_droped =df_100.drop(["Age","ISBN","Book-Author", "User-ID", "AgeGroup"], axis=1)

In [90]:
df_dummied = pd.get_dummies(df_droped, columns=['Book-Title']).mul(df_100["Book-Rating"],0).drop("Book-Rating", axis=1)

In [91]:
df_dummied

Unnamed: 0,Book-Title_Bant/Spec.Last of the Breed,Book-Title_Chocolate Jesus,Book-Title_Clara Callan,Book-Title_Death in the Clouds,Book-Title_Decision in Normandy,Book-Title_Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,Book-Title_New Vegetarian: Bold and Beautiful Recipes for Every Occasion,Book-Title_OUT OF THE SILENT PLANET,Book-Title_Our Dumb Century: The Onion Presents 100 Years of Headlines from America's Finest News Source,Book-Title_Piercing the Darkness,Book-Title_Prague : A Novel,Book-Title_Prelude to Foundation (Foundation Novels (Paperback)),Book-Title_Prophet,Book-Title_The Kitchen God's Wife,Book-Title_Winter Solstice
0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0
98,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0


In [92]:
scaled_features = StandardScaler().fit_transform(df_dummied.values)

In [93]:
df_scaled = pd.DataFrame(scaled_features, columns=df_dummied.columns)

In [94]:
df_scaled

Unnamed: 0,Book-Title_Bant/Spec.Last of the Breed,Book-Title_Chocolate Jesus,Book-Title_Clara Callan,Book-Title_Death in the Clouds,Book-Title_Decision in Normandy,Book-Title_Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,Book-Title_New Vegetarian: Bold and Beautiful Recipes for Every Occasion,Book-Title_OUT OF THE SILENT PLANET,Book-Title_Our Dumb Century: The Onion Presents 100 Years of Headlines from America's Finest News Source,Book-Title_Piercing the Darkness,Book-Title_Prague : A Novel,Book-Title_Prelude to Foundation (Foundation Novels (Paperback)),Book-Title_Prophet,Book-Title_The Kitchen God's Wife,Book-Title_Winter Solstice
0,-0.20383,-0.140028,6.478408,-0.203275,-0.142534,-0.226319,-0.250781,-0.141966,-0.308574,-0.400866,-0.284556,-0.291414,-0.269985,-0.288746,-0.457854
1,-0.20383,-0.140028,5.646777,-0.203275,-0.142534,-0.226319,-0.250781,-0.141966,-0.308574,-0.400866,-0.284556,-0.291414,-0.269985,-0.288746,-0.457854
2,-0.20383,-0.140028,4.815146,-0.203275,-0.142534,-0.226319,-0.250781,-0.141966,-0.308574,-0.400866,-0.284556,-0.291414,-0.269985,-0.288746,-0.457854
3,-0.20383,-0.140028,-0.174643,-0.203275,7.459295,-0.226319,-0.250781,-0.141966,-0.308574,-0.400866,-0.284556,-0.291414,-0.269985,-0.288746,-0.457854
4,-0.20383,-0.140028,-0.174643,-0.203275,6.509066,-0.226319,-0.250781,-0.141966,-0.308574,-0.400866,-0.284556,-0.291414,-0.269985,-0.288746,-0.457854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.20383,-0.140028,-0.174643,-0.203275,-0.142534,-0.226319,-0.250781,-0.141966,-0.308574,-0.400866,4.547524,-0.291414,-0.269985,-0.288746,-0.457854
96,-0.20383,-0.140028,-0.174643,-0.203275,-0.142534,-0.226319,-0.250781,-0.141966,-0.308574,-0.400866,3.473728,-0.291414,-0.269985,-0.288746,-0.457854
97,-0.20383,-0.140028,-0.174643,-0.203275,-0.142534,-0.226319,-0.250781,-0.141966,-0.308574,-0.400866,3.473728,-0.291414,-0.269985,-0.288746,-0.457854
98,-0.20383,8.261652,-0.174643,-0.203275,-0.142534,-0.226319,-0.250781,-0.141966,-0.308574,-0.400866,-0.284556,-0.291414,-0.269985,-0.288746,-0.457854


In [95]:
pca = PCA(n_components=3)
pca_trans = pca.fit_transform(df_dummied)
pca_trans_df = pd.DataFrame(pca_trans)

In [96]:
pca_trans_df

Unnamed: 0,0,1,2
0,-0.195017,-1.255439,-0.134747
1,-0.191649,-1.228849,-0.130436
2,-0.188281,-1.202259,-0.126124
3,-0.186680,-1.188298,-0.123201
4,-0.184354,-1.170101,-0.120333
...,...,...,...
95,-0.265602,-1.875131,-0.282545
96,-0.243929,-1.690151,-0.242037
97,-0.243929,-1.690151,-0.242037
98,-0.189083,-1.207261,-0.126268


In [97]:
pca_trans_df['User-ID'] = df_final['User-ID']
#pca_trans_df['AgeGroup'] = df_final['AgeGroup']

In [99]:
#pca_trans_df['AgeGroup'].unique()

In [100]:
pca_trans_df

Unnamed: 0,0,1,2,User-ID
0,-0.195017,-1.255439,-0.134747,67544
1,-0.191649,-1.228849,-0.130436,219008
2,-0.188281,-1.202259,-0.126124,263325
3,-0.186680,-1.188298,-0.123201,2954
4,-0.184354,-1.170101,-0.120333,152827
...,...,...,...,...
95,-0.265602,-1.875131,-0.282545,111637
96,-0.243929,-1.690151,-0.242037,219008
97,-0.243929,-1.690151,-0.242037,235867
98,-0.189083,-1.207261,-0.126268,95703


In [48]:
#pca_trans_df.set_index("User-ID", inplace=True)

# first_User_ID = 12345
# second_User_ID = 6789
# cosine_similarity = 0.80

# first_User_ID = 9876
# second_User_ID = 54321
# cosine_similarity = 0.23

# temp={
#     "first_User_ID": first_User_ID,
#     "second_User_ID": second_User_ID,
#     "cosine_similarity": cosine_similarity
# }

# df = pd.DataFrame()

# df = df.append(temp, ignore_index=True)

# similarity = cosine(np.array(pca_trans_df.iloc[0]).reshape(-1, 1), np.array(pca_trans_df.iloc[1]).reshape(-1, 1))

# '{:.20f}'.format(similarity)

In [101]:
df = pd.DataFrame()
for ind in tqdm(pca_trans_df.index):
  for i in range(len(pca_trans_df)):
    try:
      cosine_similarity =1- cosine(np.array(pca_trans_df.drop("User-ID", axis=1).iloc[ind]).reshape(-1, 1), np.array(pca_trans_df.drop("User-ID", axis=1).iloc[ind+1+i]).reshape(-1, 1))
      cosine_similarity_string = '{:.20f}'.format(cosine_similarity)
      first_User_ID = pca_trans_df["User-ID"][ind]
      second_User_ID = pca_trans_df["User-ID"][ind+1+i]

      temp={
      "first_User_ID": first_User_ID,
      "second_User_ID": second_User_ID,
      "cosine_similarity": cosine_similarity,
      "cosine_similarity_string": cosine_similarity_string
      }
      df = df.append(temp, ignore_index=True)
    except:
      break

  0%|          | 0/100 [00:00<?, ?it/s]

In [50]:
#cosine(np.array(pca_trans_df.drop("User-ID", axis=1).iloc[0]).reshape(-1, 1), np.array(pca_trans_df.drop("User-ID", axis=1).iloc[1]).reshape(-1, 1))

In [102]:
df

Unnamed: 0,first_User_ID,second_User_ID,cosine_similarity,cosine_similarity_string
0,67544.0,219008.0,0.999999,0.99999913539771012605
1,67544.0,263325.0,0.999996,0.99999638671270829615
2,67544.0,2954.0,0.999992,0.99999206900105086504
3,67544.0,152827.0,0.999988,0.99998789800942322614
4,67544.0,35704.0,0.999793,0.99979268482884364744
...,...,...,...,...
4945,219008.0,95703.0,0.999213,0.99921321735464541103
4946,219008.0,101606.0,0.999098,0.99909828737646033670
4947,235867.0,95703.0,0.999213,0.99921321735464541103
4948,235867.0,101606.0,0.999098,0.99909828737646033670


In [103]:
df['first_User_ID'] = df['first_User_ID'].astype("int")
df['second_User_ID'] = df['second_User_ID'].astype("int")

In [104]:
df

Unnamed: 0,first_User_ID,second_User_ID,cosine_similarity,cosine_similarity_string
0,67544,219008,0.999999,0.99999913539771012605
1,67544,263325,0.999996,0.99999638671270829615
2,67544,2954,0.999992,0.99999206900105086504
3,67544,152827,0.999988,0.99998789800942322614
4,67544,35704,0.999793,0.99979268482884364744
...,...,...,...,...
4945,219008,95703,0.999213,0.99921321735464541103
4946,219008,101606,0.999098,0.99909828737646033670
4947,235867,95703,0.999213,0.99921321735464541103
4948,235867,101606,0.999098,0.99909828737646033670


In [240]:
df.to_csv("User-Pairs-Books.data.csv", index=False)