In [7]:
import pandas as pd
import numpy as np

In [8]:
df_raw = pd.read_csv('cleaned_top_5k.csv')
df_raw.columns

Index(['Unnamed: 0', 'App Name', 'App Id', 'Category', 'Rating',
       'Rating Count', 'Installs', 'Minimum Installs', 'Maximum Installs',
       'Free', 'Price', 'Currency', 'Size', 'Minimum Android',
       'Developer Website', 'Released', 'Last Updated', 'Content Rating',
       'Privacy Policy', 'Ad Supported', 'In App Purchases', 'Editors Choice'],
      dtype='object')

In [9]:
df = df_raw.copy()

In [10]:
# converting categorical data into numeric form 
binary_columns = ['Ad Supported', 'In App Purchases', 'Editors Choice', 'Free']

for x in binary_columns:
    df.loc[df[x] == True, x] = 1
    df.loc[df[x] == False, x] = 0

In [11]:
# drop useless columns
drop_columns = ['Released', 'Last Updated', 'Currency']

for x in drop_columns:
    df.drop([x], axis=1, inplace=True)

In [12]:
# create one hot encoding for category
df_categories = pd.get_dummies(df['Category'])
df_categories

Unnamed: 0,Action,Adventure,Arcade,Art & Design,Auto & Vehicles,Beauty,Board,Books & Reference,Business,Card,...,Simulation,Social,Sports,Strategy,Tools,Travel & Local,Trivia,Video Players & Editors,Weather,Word
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df = pd.concat([df, df_categories], axis=1)

In [14]:
# one hot encoding for content rating
df_content_rating = pd.get_dummies(df['Content Rating'])
df = pd.concat([df, df_content_rating], axis=1)

In [15]:
# one hot encoding for min android verison
df_min_android = pd.get_dummies(df['Minimum Android'])
df = pd.concat([df, df_min_android], axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,...,4.4 and up,4.4W and up,5.0 - 8.0,5.0 and up,5.1 and up,6.0 and up,7.0 and up,7.1 and up,8.0 and up,Varies with device
0,385470,WhatsApp Messenger,com.whatsapp,Communication,4.0,138557570.0,"5,000,000,000+",5000000000.0,6265637751,1,...,0,0,0,0,0,0,0,0,0,1
1,304824,Instagram,com.instagram.android,Social,3.8,120206190.0,"1,000,000,000+",1000000000.0,3559871277,1,...,0,0,0,0,0,0,0,0,0,1
2,2222701,Facebook,com.facebook.katana,Social,2.3,117850066.0,"5,000,000,000+",5000000000.0,6782619635,1,...,0,0,0,0,0,0,0,0,0,1
3,881403,YouTube,com.google.android.youtube,Video Players & Editors,4.4,112440547.0,"5,000,000,000+",5000000000.0,9766230924,1,...,0,0,0,0,0,0,0,0,0,1
4,244319,Garena Free Fire - Rampage,com.dts.freefireth,Action,4.2,89177097.0,"500,000,000+",500000000.0,976536041,1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
columns_to_drop = ['App Name', 'App Id', 'Category', 'Installs', 'Minimum Android', 'Content Rating']

for x in columns_to_drop:
    df.drop([x], axis=1, inplace=True)
    
df.head()

Unnamed: 0.1,Unnamed: 0,Rating,Rating Count,Minimum Installs,Maximum Installs,Free,Price,Size,Developer Website,Privacy Policy,...,4.4 and up,4.4W and up,5.0 - 8.0,5.0 and up,5.1 and up,6.0 and up,7.0 and up,7.1 and up,8.0 and up,Varies with device
0,385470,4.0,138557570.0,5000000000.0,6265637751,1,0.0,19.2,1,1,...,0,0,0,0,0,0,0,0,0,1
1,304824,3.8,120206190.0,1000000000.0,3559871277,1,0.0,19.2,1,1,...,0,0,0,0,0,0,0,0,0,1
2,2222701,2.3,117850066.0,5000000000.0,6782619635,1,0.0,19.2,1,1,...,0,0,0,0,0,0,0,0,0,1
3,881403,4.4,112440547.0,5000000000.0,9766230924,1,0.0,19.2,1,1,...,0,0,0,0,0,0,0,0,0,1
4,244319,4.2,89177097.0,500000000.0,976536041,1,0.0,19.2,1,1,...,0,0,0,0,0,0,0,0,0,0


In [17]:
ids = df["Unnamed: 0"]

df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0,Rating,Rating Count,Minimum Installs,Maximum Installs,Free,Price,Size,Developer Website,Privacy Policy,Ad Supported,...,4.4 and up,4.4W and up,5.0 - 8.0,5.0 and up,5.1 and up,6.0 and up,7.0 and up,7.1 and up,8.0 and up,Varies with device
0,4.0,138557570.0,5000000000.0,6265637751,1,0.0,19.2,1,1,0,...,0,0,0,0,0,0,0,0,0,1
1,3.8,120206190.0,1000000000.0,3559871277,1,0.0,19.2,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2,2.3,117850066.0,5000000000.0,6782619635,1,0.0,19.2,1,1,1,...,0,0,0,0,0,0,0,0,0,1
3,4.4,112440547.0,5000000000.0,9766230924,1,0.0,19.2,1,1,1,...,0,0,0,0,0,0,0,0,0,1
4,4.2,89177097.0,500000000.0,976536041,1,0.0,19.2,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
cols_to_norm = ['Rating','Rating Count', 'Maximum Installs', 'Minimum Installs', 'Price', 'Size']
df[cols_to_norm] = df[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [19]:
df.head()

Unnamed: 0,Rating,Rating Count,Minimum Installs,Maximum Installs,Free,Price,Size,Developer Website,Privacy Policy,Ad Supported,...,4.4 and up,4.4W and up,5.0 - 8.0,5.0 and up,5.1 and up,6.0 and up,7.0 and up,7.1 and up,8.0 and up,Varies with device
0,0.742857,1.0,0.5,0.519641,1,0.0,0.017313,1,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0.685714,0.867405,0.1,0.295238,1,0.0,0.017313,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2,0.257143,0.850381,0.5,0.562517,1,0.0,0.017313,1,1,1,...,0,0,0,0,0,0,0,0,0,1
3,0.857143,0.811295,0.5,0.809963,1,0.0,0.017313,1,1,1,...,0,0,0,0,0,0,0,0,0,1
4,0.8,0.643208,0.05,0.080989,1,0.0,0.017313,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
from sklearn.metrics.pairwise import cosine_similarity
cos_sim_data = pd.DataFrame(cosine_similarity(df))

cos_sim_data
    

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,1.000000,0.680085,0.606102,0.666219,0.585036,0.934860,0.566943,0.553154,0.449072,0.786280,...,0.515776,0.520989,0.673039,0.520881,0.519027,0.409781,0.436498,0.588761,0.406138,0.594868
1,0.680085,1.000000,0.929181,0.773850,0.634433,0.732417,0.621298,0.822254,0.823702,0.577199,...,0.585865,0.588578,0.625487,0.588499,0.587647,0.798992,0.623060,0.424767,0.693568,0.433074
2,0.606102,0.929181,1.000000,0.816529,0.523822,0.660303,0.505003,0.716868,0.832624,0.599399,...,0.581084,0.577595,0.613817,0.577466,0.579096,0.797285,0.606304,0.408443,0.690373,0.403254
3,0.666219,0.773850,0.816529,1.000000,0.468220,0.611481,0.454986,0.663332,0.673878,0.678707,...,0.631063,0.525488,0.558444,0.525359,0.522875,0.637234,0.559836,0.462768,0.634499,0.477002
4,0.585036,0.634433,0.523822,0.468220,1.000000,0.655535,0.662628,0.655564,0.544329,0.483020,...,0.520570,0.526657,0.559690,0.526624,0.524391,0.639794,0.443218,0.463456,0.524400,0.476101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.409781,0.798992,0.797285,0.637234,0.639794,0.515399,0.538806,0.653509,0.880634,0.471741,...,0.651455,0.657474,0.577369,0.657323,0.655219,1.000000,0.579275,0.478483,0.770021,0.494126
4996,0.436498,0.623060,0.606304,0.559836,0.443218,0.546982,0.691060,0.573354,0.574325,0.503807,...,0.568380,0.577599,0.613818,0.577537,0.574092,0.579275,1.000000,0.370270,0.574101,0.394564
4997,0.588761,0.424767,0.408443,0.462768,0.463456,0.581525,0.604989,0.341105,0.606951,0.669342,...,0.605077,0.610236,0.648532,0.610299,0.608411,0.478483,0.370270,1.000000,0.608400,0.845221
4998,0.406138,0.693568,0.690373,0.634499,0.524400,0.512705,0.648636,0.650814,0.879765,0.466273,...,0.649755,0.654461,0.573211,0.654514,0.652825,0.770021,0.574101,0.608400,1.000000,0.487721


In [21]:
df_test = df.head(5000)
df_validation = df_raw.head(5000)

df_test.set_index(np.arange(0,5000), inplace=True)
df_validation.set_index(np.arange(0,5000), inplace=True)

df_test


Unnamed: 0,Rating,Rating Count,Minimum Installs,Maximum Installs,Free,Price,Size,Developer Website,Privacy Policy,Ad Supported,...,4.4 and up,4.4W and up,5.0 - 8.0,5.0 and up,5.1 and up,6.0 and up,7.0 and up,7.1 and up,8.0 and up,Varies with device
0,0.742857,1.000000e+00,0.5000,0.519641,1,0.000000,0.017313,1,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0.685714,8.674045e-01,0.1000,0.295238,1,0.000000,0.017313,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2,0.257143,8.503807e-01,0.5000,0.562517,1,0.000000,0.017313,1,1,1,...,0,0,0,0,0,0,0,0,0,1
3,0.857143,8.112949e-01,0.5000,0.809963,1,0.000000,0.017313,1,1,1,...,0,0,0,0,0,0,0,0,0,1
4,0.800000,6.432080e-01,0.0500,0.080989,1,0.000000,0.017313,1,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.885714,1.502876e-06,0.0001,0.000142,1,0.000000,0.006403,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4996,0.971429,1.192186e-06,0.0001,0.000131,0,0.265688,0.013495,1,1,1,...,0,0,0,0,0,0,0,0,0,1
4997,0.685714,1.062129e-06,0.0010,0.001337,1,0.000000,0.026224,1,1,0,...,1,0,0,0,0,0,0,0,0,0
4998,0.800000,6.358323e-07,0.0010,0.001153,1,0.000000,0.030770,1,1,1,...,1,0,0,0,0,0,0,0,0,0


In [22]:
def recommend(index):
    index_recomm = cos_sim_data.loc[index].sort_values(ascending=False).index.tolist()[1:6]
    
    print(index_recomm)
    app_recomm =  df_validation['App Name'].loc[index_recomm].values
    result = {'Applications':app_recomm,'Index':index_recomm}
    
    return result

In [23]:
recommend(23)

[4768, 617, 1151, 3399, 499]


{'Applications': array(['Stick Fight: The Game Mobile', 'Soul Knight',
        'Royal Revolt 2: Tower Defense RTS & Castle Builder',
        'Tanks A Lot! - Realtime Multiplayer Battle Arena', 'Archero'],
       dtype=object),
 'Index': [4768, 617, 1151, 3399, 499]}

In [24]:
df_validation.head(40)

Unnamed: 0.1,Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,...,Size,Minimum Android,Developer Website,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice
0,385470,WhatsApp Messenger,com.whatsapp,Communication,4.0,138557570.0,"5,000,000,000+",5000000000.0,6265637751,True,...,19.2,Varies with device,1,"Oct 18, 2010","Jun 09, 2021",Everyone,1,False,False,True
1,304824,Instagram,com.instagram.android,Social,3.8,120206190.0,"1,000,000,000+",1000000000.0,3559871277,True,...,19.2,Varies with device,1,"Apr 3, 2012","Jun 14, 2021",Teen,1,True,True,True
2,2222701,Facebook,com.facebook.katana,Social,2.3,117850066.0,"5,000,000,000+",5000000000.0,6782619635,True,...,19.2,Varies with device,1,"Jun 16, 2021","Jun 16, 2021",Teen,1,True,True,False
3,881403,YouTube,com.google.android.youtube,Video Players & Editors,4.4,112440547.0,"5,000,000,000+",5000000000.0,9766230924,True,...,19.2,Varies with device,1,"Oct 20, 2010","Jun 16, 2021",Teen,1,True,False,False
4,244319,Garena Free Fire - Rampage,com.dts.freefireth,Action,4.2,89177097.0,"500,000,000+",500000000.0,976536041,True,...,19.2,4.1 and up,1,"Dec 7, 2017","Jun 04, 2021",Mature 17+,1,False,True,True
5,2095852,Messenger – Text and Video Chat for Free,com.facebook.orca,Communication,4.0,78563229.0,"5,000,000,000+",5000000000.0,5054312355,True,...,19.2,Varies with device,1,"Jan 30, 2014","Jun 14, 2021",Everyone,1,False,True,True
6,423997,Clash of Clans,com.supercell.clashofclans,Strategy,4.5,56025424.0,"500,000,000+",500000000.0,643789632,True,...,182.0,4.4 and up,1,"Sep 30, 2013","Jun 09, 2021",Everyone 10+,1,False,True,True
7,58082,PUBG MOBILE - Traverse,com.tencent.ig,Action,4.3,37479011.0,"500,000,000+",500000000.0,505818718,True,...,19.2,Varies with device,0,"Mar 19, 2018","May 10, 2021",Teen,1,True,True,True
8,65037,TikTok,com.zhiliaoapp.musically,Social,4.4,36446381.0,"1,000,000,000+",1000000000.0,1645811582,True,...,89.0,4.4 and up,1,"Jul 9, 2015","Jun 15, 2021",Teen,1,True,True,False
9,1830962,Google Photos,com.google.android.apps.photos,Photography,4.5,35369236.0,"5,000,000,000+",5000000000.0,5754179589,True,...,19.2,Varies with device,1,"May 28, 2015","Jun 14, 2021",Everyone,1,False,False,False


In [25]:
from sklearn.metrics.pairwise import euclidean_distances
euc_distance_data = pd.DataFrame(euclidean_distances(df))

euc_distance_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.000000,2.496238,2.694465,2.476481,2.743261,1.094531,2.796035,2.823370,3.141373,1.891583,...,2.918662,2.920848,2.351837,2.921610,2.919358,3.246583,3.105260,2.553621,3.243652,2.564192
1,2.496238,0.000000,1.189706,2.111274,2.666782,2.295465,2.708864,1.855896,1.850642,2.758416,...,2.801688,2.806855,2.622633,2.807470,2.804053,1.972199,2.646071,3.138255,2.421140,3.151929
2,2.694465,1.189706,0.000000,1.850070,2.961753,2.517348,3.013016,2.266549,1.746845,2.599527,...,2.737917,2.765458,2.578283,2.766272,2.753827,1.919513,2.621094,3.077470,2.362658,3.130900
3,2.476481,2.111274,1.850070,0.000000,3.124967,2.688064,3.156545,2.467087,2.432916,2.328440,...,2.565480,2.926083,2.749865,2.926904,2.926846,2.562195,2.765428,2.930962,2.562190,2.929125
4,2.743261,2.666782,2.961753,3.124967,0.000000,2.515158,2.467265,2.478280,2.856227,2.918561,...,2.903416,2.902764,2.725031,2.903297,2.902282,2.535785,3.085893,2.904411,2.902184,2.907909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,3.246583,1.972199,1.919513,2.562195,2.535785,2.960906,2.862436,2.465835,1.450404,2.922731,...,2.455506,2.449662,2.645928,2.450575,2.451256,0.000000,2.660449,2.835559,2.001985,2.830679
4996,3.105260,2.646071,2.621094,2.765428,3.085893,2.805226,2.295489,2.677307,2.680579,2.760471,...,2.671466,2.661541,2.466509,2.662189,2.664737,2.660449,0.000000,3.025291,2.664635,3.012473
4997,2.553621,3.138255,3.077470,2.930962,2.904411,2.597668,2.495479,3.177619,2.475719,2.140159,...,2.449673,2.455607,2.242647,2.455937,2.452209,2.835559,3.025291,0.000000,2.452159,1.443784
4998,3.243652,2.421140,2.362658,2.562190,2.902184,2.957725,2.488682,2.465316,1.450048,2.923680,...,2.451026,2.450332,2.646403,2.450529,2.449518,2.001985,2.664635,2.452159,0.000000,2.834043


In [33]:
# supported metrics are the following
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances

euc_distance_data = pd.DataFrame(euclidean_distances(df))
cos_sim_data = pd.DataFrame(cosine_similarity(df))
man_distance_data = pd.DataFrame(manhattan_distances(df))

def recommend(index, metric="Cosine", number_of_recommendations=5):
    print("Recommendations using metric: " + metric)
    
    if metric == "Cosine":
        index_recomm = cos_sim_data.loc[index].sort_values(ascending=False).index.tolist()[1:number_of_recommendations + 1]
    elif metric == "Euclidean":
        index_recomm = euc_distance_data.loc[index].sort_values(ascending=False).index.tolist()[1:number_of_recommendations + 1]
    elif metric == "Manhattan":
        index_recomm = man_distance_data.loc[index].sort_values(ascending=False).index.tolist()[1:number_of_recommendations + 1]
    
    app_recomm =  df_validation['App Name'].loc[index_recomm].values
    result = {'Applications':app_recomm,'Index':index_recomm}
    
    return result

In [29]:
recommend_euc_distance(39)

[4901, 2970, 10, 2439, 2593]


{'Applications': array(['BusyBox', 'Personal stickers for WhatsApp',
        'Google Play services', 'Nova Launcher Prime', 'Terraria'],
       dtype=object),
 'Index': [4901, 2970, 10, 2439, 2593]}

In [40]:
recommend(15, "Euclidean")

Recommendations using metric: Euclidean


{'Applications': array(['BusyBox', 'Samsung Calculator', 'Grand Theft Auto: San Andreas',
        'Google Play services', 'Nova Launcher Prime'], dtype=object),
 'Index': [4901, 1877, 1338, 10, 2439]}