In [56]:
import pandas as pd
import numpy as np

In [57]:
df = pd.read_csv('cleaned_top_5k.csv')
df.columns

Index(['Unnamed: 0', 'App Name', 'App Id', 'Category', 'Rating',
       'Rating Count', 'Installs', 'Minimum Installs', 'Maximum Installs',
       'Free', 'Price', 'Currency', 'Size', 'Minimum Android',
       'Developer Website', 'Released', 'Last Updated', 'Content Rating',
       'Privacy Policy', 'Ad Supported', 'In App Purchases', 'Editors Choice'],
      dtype='object')

In [58]:
# converting categorical data into numeric form 
binary_columns = ['Ad Supported', 'In App Purchases', 'Editors Choice', 'Free']

for x in binary_columns:
    df.loc[df[x] == True, x] = 1
    df.loc[df[x] == False, x] = 0

In [59]:
# drop useless columns
drop_columns = ['Released', 'Last Updated', 'Currency']

for x in drop_columns:
    df.drop([x], axis=1, inplace=True)

In [60]:
# create one hot encoding for category
df_categories = pd.get_dummies(df['Category'])
df_categories

Unnamed: 0,Action,Adventure,Arcade,Art & Design,Auto & Vehicles,Beauty,Board,Books & Reference,Business,Card,...,Simulation,Social,Sports,Strategy,Tools,Travel & Local,Trivia,Video Players & Editors,Weather,Word
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
299996,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
299997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
299998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
df = pd.concat([df, df_categories], axis=1)

In [62]:
# one hot encoding for content rating
df_content_rating = pd.get_dummies(df['Content Rating'])
df = pd.concat([df, df_content_rating], axis=1)

In [63]:
# one hot encoding for min android verison
df_min_android = pd.get_dummies(df['Minimum Android'])
df = pd.concat([df, df_min_android], axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,...,5.0 - 8.0,5.0 and up,5.1 and up,6.0 - 7.1.1,6.0 and up,7.0,7.0 and up,7.1 and up,8.0 and up,Varies with device
0,385470,WhatsApp Messenger,com.whatsapp,Communication,4.0,138557570.0,"5,000,000,000+",5000000000.0,6265637751,1,...,0,0,0,0,0,0,0,0,0,1
1,304824,Instagram,com.instagram.android,Social,3.8,120206190.0,"1,000,000,000+",1000000000.0,3559871277,1,...,0,0,0,0,0,0,0,0,0,1
2,2222701,Facebook,com.facebook.katana,Social,2.3,117850066.0,"5,000,000,000+",5000000000.0,6782619635,1,...,0,0,0,0,0,0,0,0,0,1
3,881403,YouTube,com.google.android.youtube,Video Players & Editors,4.4,112440547.0,"5,000,000,000+",5000000000.0,9766230924,1,...,0,0,0,0,0,0,0,0,0,1
4,244319,Garena Free Fire - Rampage,com.dts.freefireth,Action,4.2,89177097.0,"500,000,000+",500000000.0,976536041,1,...,0,0,0,0,0,0,0,0,0,0


In [64]:
columns_to_drop = ['App Name', 'App Id', 'Category', 'Installs', 'Minimum Android', 'Content Rating']

for x in columns_to_drop:
    df.drop([x], axis=1, inplace=True)
    
df.head()

Unnamed: 0.1,Unnamed: 0,Rating,Rating Count,Minimum Installs,Maximum Installs,Free,Price,Size,Developer Website,Privacy Policy,...,5.0 - 8.0,5.0 and up,5.1 and up,6.0 - 7.1.1,6.0 and up,7.0,7.0 and up,7.1 and up,8.0 and up,Varies with device
0,385470,4.0,138557570.0,5000000000.0,6265637751,1,0.0,19.2,1,1,...,0,0,0,0,0,0,0,0,0,1
1,304824,3.8,120206190.0,1000000000.0,3559871277,1,0.0,19.2,1,1,...,0,0,0,0,0,0,0,0,0,1
2,2222701,2.3,117850066.0,5000000000.0,6782619635,1,0.0,19.2,1,1,...,0,0,0,0,0,0,0,0,0,1
3,881403,4.4,112440547.0,5000000000.0,9766230924,1,0.0,19.2,1,1,...,0,0,0,0,0,0,0,0,0,1
4,244319,4.2,89177097.0,500000000.0,976536041,1,0.0,19.2,1,1,...,0,0,0,0,0,0,0,0,0,0


In [65]:
ids = df["Unnamed: 0"]

df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0,Rating,Rating Count,Minimum Installs,Maximum Installs,Free,Price,Size,Developer Website,Privacy Policy,Ad Supported,...,5.0 - 8.0,5.0 and up,5.1 and up,6.0 - 7.1.1,6.0 and up,7.0,7.0 and up,7.1 and up,8.0 and up,Varies with device
0,4.0,138557570.0,5000000000.0,6265637751,1,0.0,19.2,1,1,0,...,0,0,0,0,0,0,0,0,0,1
1,3.8,120206190.0,1000000000.0,3559871277,1,0.0,19.2,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2,2.3,117850066.0,5000000000.0,6782619635,1,0.0,19.2,1,1,1,...,0,0,0,0,0,0,0,0,0,1
3,4.4,112440547.0,5000000000.0,9766230924,1,0.0,19.2,1,1,1,...,0,0,0,0,0,0,0,0,0,1
4,4.2,89177097.0,500000000.0,976536041,1,0.0,19.2,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
cols_to_norm = ['Rating','Rating Count', 'Maximum Installs', 'Minimum Installs', 'Price', 'Size']
df[cols_to_norm] = df[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [67]:
df.head()

Unnamed: 0,Rating,Rating Count,Minimum Installs,Maximum Installs,Free,Price,Size,Developer Website,Privacy Policy,Ad Supported,...,5.0 - 8.0,5.0 and up,5.1 and up,6.0 - 7.1.1,6.0 and up,7.0,7.0 and up,7.1 and up,8.0 and up,Varies with device
0,0.75,1.0,0.5,0.519641,1,0.0,0.012799,1,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0.7,0.867554,0.1,0.295238,1,0.0,0.012799,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2,0.325,0.850549,0.5,0.562517,1,0.0,0.012799,1,1,1,...,0,0,0,0,0,0,0,0,0,1
3,0.85,0.811508,0.5,0.809963,1,0.0,0.012799,1,1,1,...,0,0,0,0,0,0,0,0,0,1
4,0.8,0.64361,0.05,0.080989,1,0.0,0.012799,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
from sklearn.metrics.pairwise import cosine_similarity
cos_sim_data = pd.DataFrame(cosine_similarity(df))

cos_sim_data
    

MemoryError: Unable to allocate 671. GiB for an array with shape (300000, 300000) and data type float64