In [171]:
import random

import numpy as np
import pandas as pd
from surprise import KNNWithMeans
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

In [170]:
# Input files: the user/app ratings table and the app metadata table.
RATINGS_CSV = "user-item-100k-cleaned.csv"
APP_METADATA_CSV = "cleaned_top_5k.csv"

df = pd.read_csv(RATINGS_CSV)
df_raw = pd.read_csv(APP_METADATA_CSV)

# Print column names, dtypes and non-null counts of the ratings table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110187 entries, 0 to 110186
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   Unnamed: 0  110187 non-null  int64
 1   user_id     110187 non-null  int64
 2   app_id      110187 non-null  int64
 3   rating      110187 non-null  int64
dtypes: int64(4)
memory usage: 3.4 MB


In [172]:
# Preview the first rows of the ratings table `df`
# (one row per rating, with user_id, app_id and rating columns).
df.head()

Unnamed: 0.1,Unnamed: 0,user_id,app_id,rating
0,0,1,538,1
1,1,1,137,1
2,2,1,3,5
3,3,1,0,2
4,4,1,4,4


In [173]:
# Preview the first rows of the app metadata table `df_raw`.
df_raw.head()

Unnamed: 0.1,Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,...,Size,Minimum Android,Developer Website,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice
0,385470,WhatsApp Messenger,com.whatsapp,Communication,4.0,138557570.0,"5,000,000,000+",5000000000.0,6265637751,True,...,19.2,Varies with device,1,"Oct 18, 2010","Jun 09, 2021",Everyone,1,False,False,True
1,304824,Instagram,com.instagram.android,Social,3.8,120206190.0,"1,000,000,000+",1000000000.0,3559871277,True,...,19.2,Varies with device,1,"Apr 3, 2012","Jun 14, 2021",Teen,1,True,True,True
2,2222701,Facebook,com.facebook.katana,Social,2.3,117850066.0,"5,000,000,000+",5000000000.0,6782619635,True,...,19.2,Varies with device,1,"Jun 16, 2021","Jun 16, 2021",Teen,1,True,True,False
3,881403,YouTube,com.google.android.youtube,Video Players & Editors,4.4,112440547.0,"5,000,000,000+",5000000000.0,9766230924,True,...,19.2,Varies with device,1,"Oct 20, 2010","Jun 16, 2021",Teen,1,True,False,False
4,244319,Garena Free Fire - Rampage,com.dts.freefireth,Action,4.2,89177097.0,"500,000,000+",500000000.0,976536041,True,...,19.2,4.1 and up,1,"Dec 7, 2017","Jun 04, 2021",Mature 17+,1,False,True,True


In [104]:
# Wrap the pandas ratings table in a surprise Dataset.
# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# load_from_df needs exactly these three columns, in this order:
# user id, item id, rating.
ratings_frame = df.loc[:, ["user_id", "app_id", "rating"]]
data = Dataset.load_from_df(ratings_frame, reader)

In [174]:
# Tune the SVD hyper-parameters with a 3-fold cross-validated grid search.
# Every combination of the values below is tried (2 x 2 x 2 = 8 settings).
param_grid = dict(
    n_epochs=[5, 10],
    lr_all=[0.002, 0.005],
    reg_all=[0.4, 0.6],
)

gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

# Report the best average RMSE, then the parameter combination that achieved it.
print(gs.best_score["rmse"], gs.best_params["rmse"], sep="\n")

1.2960829205499438
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}


In [176]:
# Create a model with the best parameters found by the grid search.
# The parameters are read from `gs` rather than copied by hand from the printed
# output: no random seed is set for the search above, so the winning
# combination can change between runs and hard-coded values would go stale.
best_params = gs.best_params["rmse"]
svd = SVD(verbose=True, **best_params)

# Re-estimate the error of that configuration with 3-fold cross-validation.
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.2952  1.2951  1.2985  1.2963  0.0016  
MAE (testset)     1.1228  1.1246  1.1273  1.1249  0.0018  
Fit time          0.27    0.31    0.32    0.30    0.02    
Test time         0.22    0.26    0.24    0.24    0.02    


{'test_rmse': array([1.29522845, 1.29511907, 1.29851853]),
 'test_mae': array([1.12284659, 1.12464417, 1.1272761 ]),
 'fit_time': (0.2651538848876953, 0.3122992515563965, 0.31542515754699707),
 'test_time': (0.21659278869628906, 0.2640516757965088, 0.2353212833404541)}

In [177]:
# Refit the model on a trainset built from all of `data` (no hold-out split),
# so later predictions use every available rating.
trainset = data.build_full_trainset()
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1d06f3b6ac0>

In [116]:
# Sanity check: predicted rating of user 1 for app 100 (the `est` field of the result).
svd.predict(uid=1, iid=100)

Prediction(uid=1, iid=100, r_ui=None, est=3.4710672420146698, details={'was_impossible': False})

In [117]:
def predict_rating(user_id, app_id, model):
    """Return the rating `model` estimates `user_id` would give `app_id`.

    `model` must expose ``predict(uid=..., iid=...)`` and return an object
    with an ``est`` attribute; that attribute is the value returned here.
    """
    return model.predict(uid=user_id, iid=app_id).est

In [124]:
def get_recommendations_for_user(user_id, number_of_recommendations=5, threshold=3.7,
                                 model=None, n_apps=None):
    """Pick random apps whose predicted rating for `user_id` exceeds `threshold`.

    App ids ``0 .. n_apps - 1`` are visited in a random order and the first
    `number_of_recommendations` ids with a predicted rating strictly above
    `threshold` are returned.

    Parameters
    ----------
    user_id : user identifier passed to the model as ``uid``.
    number_of_recommendations : int, how many app ids to return.
    threshold : float, minimum (exclusive) predicted rating.
    model : optional object exposing ``predict(uid=..., iid=...)`` whose result
        has an ``est`` attribute. Defaults to the notebook-level ``svd``.
    n_apps : optional int, number of candidate app ids. Defaults to the number
        of rows in the notebook-level ``df_raw`` so that every returned id is a
        valid row position in that table.

    Returns
    -------
    list of int
        Exactly `number_of_recommendations` distinct app ids, or an empty list
        (after printing a hint) when fewer candidates clear the threshold.

    Notes
    -----
    The order comes from NumPy's global random state; call ``np.random.seed``
    beforehand for reproducible output.
    """
    if model is None:
        model = svd
    if n_apps is None:
        n_apps = len(df_raw)

    recommendations = []

    # A permutation visits every id exactly once, so no duplicate check is needed.
    for app_id in np.random.permutation(n_apps):
        app_id = int(app_id)  # plain Python int rather than a NumPy scalar
        if model.predict(uid=user_id, iid=app_id).est > threshold:
            recommendations.append(app_id)
            if len(recommendations) == number_of_recommendations:
                return recommendations

    print("You need to lower the threshold...")
    return []
        

In [125]:
def get_app_metadata(app_id):
    """Return the row of `df_raw` at integer position `app_id`.

    The lookup is positional (``iloc``): `app_id` is used as a row offset
    into the app metadata table, not as an index label.
    """
    metadata_row = df_raw.iloc[app_id, :]
    return metadata_row



In [127]:
# Example: five app ids for user 1 with a predicted rating above the default threshold (3.7).
get_recommendations_for_user(1)


[2624, 303, 4334, 1238, 1694]

In [132]:
def generate_recommendation_dataframe(ids):
    """Build a DataFrame holding one app metadata row per id in `ids`.

    Each row is fetched with `get_app_metadata`; rows keep the order of `ids`.
    """
    metadata_rows = [get_app_metadata(app_id) for app_id in ids]
    return pd.DataFrame(metadata_rows)
    

In [133]:
# Same request, rendered as a table of app metadata. Candidate ids are shuffled
# on every call, so the recommended apps may differ from run to run.
generate_recommendation_dataframe(get_recommendations_for_user(1))

Unnamed: 0.1,Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,...,Size,Minimum Android,Developer Website,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice
126,1921617,"Canva: Graphic Design, Video Collage, Logo Maker",com.canva.editor,Art & Design,4.7,5213141.0,"100,000,000+",100000000.0,134619454,True,...,32.0,5.0 and up,1,"Nov 27, 2017","Jun 16, 2021",Everyone,1,False,True,True
2932,1831211,Buddy.ai: English for kids,ai.mybuddy.talkingflashcards_new,Education,4.7,285745.0,"5,000,000+",5000000.0,7350905,True,...,144.0,7.0 and up,1,"Aug 20, 2019","Jun 08, 2021",Everyone,1,False,True,False
2093,1764789,Serasa,br.com.serasaexperian.consumidor,Finance,4.4,398743.0,"10,000,000+",10000000.0,30521607,True,...,61.0,5.0 and up,1,"Mar 20, 2019","Jun 10, 2021",Everyone,1,False,False,False
4382,1573155,냥코 대전쟁,jp.co.ponos.battlecatskr,Casual,4.1,181364.0,"5,000,000+",5000000.0,8047321,True,...,146.0,5.0 and up,1,"May 20, 2021","May 20, 2021",Everyone 10+,1,True,True,False
1065,1365443,"Viki: Stream Asian Drama, Movies and TV Shows",com.viki.android,Entertainment,4.2,785834.0,"10,000,000+",10000000.0,41903321,True,...,19.2,Varies with device,1,"Mar 4, 2012","Jun 14, 2021",Teen,1,True,True,True


In [166]:
class AppRecommenderSystem:
    """Recommend apps to users from a rating-prediction model.

    Parameters
    ----------
    ratings_dataset : pandas.DataFrame
        Known ratings; `get_user_ratings` reads its ``user_id``, ``app_id``
        and ``rating`` columns.
    app_metadata : pandas.DataFrame
        App metadata table. App ids are used as integer row positions into
        this table, and the column at position 1 is read as the app's name.
    model : object
        Anything exposing ``predict(uid=..., iid=...)`` whose result has an
        ``est`` attribute.
    """

    # Positional index of the column in `app_metadata` that is reported by
    # `get_user_ratings` (the app name in this notebook's data).
    _APP_NAME_COLUMN = 1

    def __init__(self, ratings_dataset, app_metadata, model):
        self._ratings = ratings_dataset
        self._app_metadata = app_metadata
        self._model = model

    def _predict_rating(self, user_id, app_id):
        """Return the model's estimated rating of `app_id` by `user_id`."""
        prediction = self._model.predict(uid=user_id, iid=app_id)
        return prediction.est

    def _get_app_metadata(self, app_id):
        """Return the metadata row at integer position `app_id`."""
        return self._app_metadata.iloc[app_id, :]

    def _generate_recommendations(self, user_id, number_of_recommendations=5, threshold=3.7):
        """Return app ids whose predicted rating for `user_id` exceeds `threshold`.

        Candidate ids are the row positions of this instance's metadata table,
        visited in random order (NumPy global random state). Returns exactly
        `number_of_recommendations` ids, or an empty list after printing a
        hint when fewer candidates clear the threshold.
        """
        # Size the candidate pool from this instance's own metadata table, not
        # from a notebook-level global, so the class works with any table.
        n_apps = self._app_metadata.shape[0]

        recommendations = []

        # A permutation visits every id exactly once, so no duplicate check is needed.
        for app_id in np.random.permutation(n_apps):
            app_id = int(app_id)  # plain Python int rather than a NumPy scalar
            if self._predict_rating(user_id, app_id) > threshold:
                recommendations.append(app_id)
                if len(recommendations) == number_of_recommendations:
                    return recommendations

        print("You need to lower the threshold...")
        return []

    def _generate_recommendations_dataframe(self, ids):
        """Build a DataFrame with one metadata row per app id in `ids`."""
        apps = [self._get_app_metadata(app_id) for app_id in ids]
        return pd.DataFrame(apps)

    def recommend(self, user_id, number_of_recommendations=5, threshold=3.7):
        """Return a DataFrame of metadata for apps recommended to `user_id`.

        The frame is empty when fewer than `number_of_recommendations` apps
        have a predicted rating above `threshold`.
        """
        app_ids = self._generate_recommendations(user_id, number_of_recommendations, threshold)
        return self._generate_recommendations_dataframe(app_ids)

    def get_user_ratings(self, user_id):
        """Return the apps `user_id` has rated together with the given ratings.

        The result has a column labelled ``0`` with the value found at column
        position 1 of the metadata row for each rated app, and a
        ``users_given_rating`` column with the corresponding rating.
        """
        user_ratings = self._ratings[self._ratings['user_id'] == user_id]

        # One positional lookup for all rated apps instead of a Python loop.
        app_positions = user_ratings['app_id'].to_numpy()
        app_names = self._app_metadata.iloc[app_positions, self._APP_NAME_COLUMN].tolist()

        applications = pd.DataFrame(app_names)
        applications["users_given_rating"] = user_ratings['rating'].to_numpy()

        return applications
        
        

In [167]:
# Bundle the ratings table, the app metadata table and the fitted SVD model.
app_recommender = AppRecommenderSystem(df, df_raw, svd)

In [168]:
# Two recommendations for user 1 with a predicted rating above 3.7.
app_recommender.recommend(1, 2, 3.7)

Unnamed: 0.1,Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,...,Size,Minimum Android,Developer Website,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice
3788,1888200,Wort Guru,com.wordgames.wordconnect.de,Word,4.6,215439.0,"5,000,000+",5000000.0,7760932,True,...,81.0,4.4 and up,1,"Apr 26, 2017","May 24, 2021",Teen,1,True,True,False
1525,709861,Block! Hexa Puzzle™,com.bitmango.go.blockhexapuzzle,Puzzle,3.9,546861.0,"50,000,000+",50000000.0,89941865,True,...,69.0,4.4 and up,1,"Apr 25, 2016","May 03, 2021",Everyone,1,True,True,False


In [169]:
# Apps user 1 has already rated, alongside the ratings they gave.
app_recommender.get_user_ratings(1)

Unnamed: 0,0,users_given_rating
0,OLX,1
1,Temple Run,1
2,YouTube,5
3,WhatsApp Messenger,2
4,Garena Free Fire - Rampage,4
5,Facebook,4
6,8 Ball Pool,5
7,TikFans - Boost Followers and Likes for Tik Tok,5
8,eFootball PES 2021,4
9,Spotify: Listen to podcasts & find music you love,5
