In [33]:
import numpy as np
import pandas as pd
import random
from surprise import KNNWithMeans
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV, cross_validate

In [34]:
# Load the two cleaned rating tables (100k and 1M) and the top-5k app table,
# then summarise the smaller rating table.
df, df_1m, df_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        "user-item-100k-cleaned.csv",
        "user-item-1m-cleaned.csv",
        "cleaned_top_5k.csv",
    )
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110187 entries, 0 to 110186
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   Unnamed: 0  110187 non-null  int64
 1   user_id     110187 non-null  int64
 2   app_id      110187 non-null  int64
 3   rating      110187 non-null  int64
dtypes: int64(4)
memory usage: 3.4 MB


In [35]:
# Preview the first five rows of the frame loaded from user-item-100k-cleaned.csv
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,user_id,app_id,rating
0,0,1,538,1
1,1,1,137,1
2,2,1,3,5
3,3,1,0,2
4,4,1,4,4


In [36]:
# Preview the first five rows of the frame loaded from cleaned_top_5k.csv
df_raw.head(n=5)

Unnamed: 0.1,Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,...,Size,Minimum Android,Developer Website,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice
0,385470,WhatsApp Messenger,com.whatsapp,Communication,4.0,138557570.0,"5,000,000,000+",5000000000.0,6265637751,True,...,19.2,Varies with device,1,"Oct 18, 2010","Jun 09, 2021",Everyone,1,False,False,True
1,304824,Instagram,com.instagram.android,Social,3.8,120206190.0,"1,000,000,000+",1000000000.0,3559871277,True,...,19.2,Varies with device,1,"Apr 3, 2012","Jun 14, 2021",Teen,1,True,True,True
2,2222701,Facebook,com.facebook.katana,Social,2.3,117850066.0,"5,000,000,000+",5000000000.0,6782619635,True,...,19.2,Varies with device,1,"Jun 16, 2021","Jun 16, 2021",Teen,1,True,True,False
3,881403,YouTube,com.google.android.youtube,Video Players & Editors,4.4,112440547.0,"5,000,000,000+",5000000000.0,9766230924,True,...,19.2,Varies with device,1,"Oct 20, 2010","Jun 16, 2021",Teen,1,True,False,False
4,244319,Garena Free Fire - Rampage,com.dts.freefireth,Action,4.2,89177097.0,"500,000,000+",500000000.0,976536041,True,...,19.2,4.1 and up,1,"Dec 7, 2017","Jun 04, 2021",Mature 17+,1,False,True,True


In [37]:
# Hand the 1M ratings frame over to surprise.
# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# load_from_df expects exactly these three columns, in this order:
# user id, item id, rating.
data = Dataset.load_from_df(
    df=df_1m.loc[:, ["user_id", "app_id", "rating"]],
    reader=reader,
)

In [38]:
# 3-fold grid search over a small SVD hyper-parameter space; the best
# RMSE and the parameter combination that achieved it are printed below.

param_grid = dict(
    n_epochs=[5, 10],
    lr_all=[0.002, 0.005],
    reg_all=[0.4, 0.6],
)
gs = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=["rmse", "mae"],
    cv=3,
)

gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

1.2946199599470978
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}


In [39]:
# create a model with the best parameters found in the previous step

# Take the winning combination straight from the grid search instead of
# re-typing its values, so this cell cannot drift out of sync when the
# parameter grid or the data changes.
svd = SVD(verbose=True, **gs.best_params["rmse"])
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.2940  1.2944  1.2955  1.2946  0.0006  
MAE (testset)     1.1233  1.1240  1.1256  1.1243  0.0010  
Fit time          2.66    2.95    4.09    3.23    0.62    
Test time         3.27    3.35    3.65    3.42    0.16    


{'test_rmse': array([1.29399184, 1.29436417, 1.29550154]),
 'test_mae': array([1.12327059, 1.12402081, 1.12556565]),
 'fit_time': (2.6637558937072754, 2.9457311630249023, 4.0902204513549805),
 'test_time': (3.270613193511963, 3.3521056175231934, 3.648163080215454)}

In [40]:
# Refit the model on a trainset built from all of `data` (no held-out fold).
trainset = data.build_full_trainset()
# `fit` is the last expression of the cell, so its return value is displayed.
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x24f12d24460>

In [71]:
class AppRecommenderSystem:
    """Recommend apps to a user based on a rating-prediction model.

    App ids are treated as zero-based row positions of ``app_metadata``:
    every metadata lookup goes through ``DataFrame.iloc`` and the candidate
    ids scored for a user are ``0 .. len(app_metadata) - 1``.
    """

    def __init__(self, ratings_dataset, app_metadata, model):
        """Store the collaborators used by the recommender.

        Parameters
        ----------
        ratings_dataset : pandas.DataFrame
            Ratings table; its ``user_id``, ``app_id`` and ``rating`` columns
            are read by :meth:`get_user_ratings`.
        app_metadata : pandas.DataFrame
            One row per app, addressed by position. The column at position 1
            is read as the app's display value by :meth:`get_user_ratings`.
        model
            Object exposing ``predict(uid=..., iid=...)`` whose result has an
            ``est`` attribute.
        """
        self._ratings = ratings_dataset
        self._app_metadata = app_metadata
        self._model = model

    def _candidate_app_ids(self):
        """Return a fresh list of every app id (row position) in the metadata."""
        # Derived from the injected metadata rather than a notebook-level
        # global, so the class works with whatever frame it was given.
        return list(range(len(self._app_metadata)))

    def _predict_rating(self, user_id, app_id):
        """Return the model's estimated rating of ``app_id`` for ``user_id``."""
        prediction = self._model.predict(uid=user_id, iid=app_id)
        return prediction.est

    def _get_app_metadata(self, app_id):
        """Return the metadata row at position ``app_id`` as a Series."""
        return self._app_metadata.iloc[app_id, :]

    def _generate_recommendations(self, user_id, number_of_recommendations=5, threshold=3.7):
        """Pick random apps whose predicted rating exceeds ``threshold``.

        Candidates are visited in random order (using the ``random`` module)
        and collection stops as soon as ``number_of_recommendations`` ids have
        been found.

        Returns
        -------
        list
            Exactly ``number_of_recommendations`` app ids, or an empty list
            when fewer candidates pass the threshold (a hint to lower the
            threshold is printed in that case).
        """
        if number_of_recommendations <= 0:
            # Nothing was asked for; skip scoring and the misleading hint.
            return []

        app_ids = self._candidate_app_ids()
        # shuffle the ids to introduce some randomness
        random.shuffle(app_ids)

        recommendations = []
        for app_id in app_ids:
            if self._predict_rating(user_id, app_id) > threshold:
                # Candidate ids are unique, so no duplicate check is needed.
                recommendations.append(app_id)
                if len(recommendations) == number_of_recommendations:
                    return recommendations

        print("You need to lower the threshold...")
        return []

    def _generate_top_n_recommendations(self, user_id, number_of_recommendations):
        """Score every app for ``user_id`` and keep the highest predictions.

        Returns
        -------
        pandas.DataFrame
            Columns ``'Predicted Rating'`` and ``'App Id'``, sorted by
            predicted rating in descending order and truncated to
            ``number_of_recommendations`` rows.
        """
        app_ids = self._candidate_app_ids()

        recommendations = pd.DataFrame({
            'Predicted Rating': [self._predict_rating(user_id, app_id) for app_id in app_ids],
            'App Id': app_ids,
        })

        return recommendations.sort_values('Predicted Rating', ascending=False).head(number_of_recommendations)

    def _generate_recommendations_dataframe(self, ids):
        """Return the metadata rows for ``ids`` (row positions), in that order."""
        # A single positional selection keeps the metadata columns even when
        # ``ids`` is empty.
        return self._app_metadata.iloc[list(ids)]

    def recommend_with_threshold(self, user_id, number_of_recommendations=5, threshold=3.7):
        """Return metadata for random apps predicted above ``threshold``.

        The result is empty when fewer than ``number_of_recommendations`` apps
        pass the threshold.
        """
        app_ids = self._generate_recommendations(user_id, number_of_recommendations, threshold)
        return self._generate_recommendations_dataframe(app_ids)

    def recommend(self, user_id, number_of_recommendations):
        """Return metadata for the apps with the highest predicted ratings.

        The table of predicted ratings is printed before the metadata frame
        is returned.
        """
        result = self._generate_top_n_recommendations(user_id, number_of_recommendations)

        print(result)

        return self._generate_recommendations_dataframe(list(result['App Id']))

    def get_user_ratings(self, user_id):
        """Return the apps ``user_id`` has rated, with the ratings given.

        Returns
        -------
        pandas.DataFrame
            Column ``0`` holds the value found at column position 1 of the
            metadata row of each rated app; ``users_given_rating`` holds the
            matching rating.
        """
        user_ratings = self._ratings[self._ratings['user_id'] == user_id]

        # Look up all rated apps in one positional selection.
        rated_positions = user_ratings['app_id'].to_numpy()
        apps = self._app_metadata.iloc[rated_positions, 1].tolist()

        applications = pd.DataFrame(apps)
        applications["users_given_rating"] = user_ratings['rating'].values

        return applications
        
        

In [72]:
# Wire the 1M ratings table, the app table and the fitted SVD model together.
app_recommender = AppRecommenderSystem(
    ratings_dataset=df_1m,
    app_metadata=df_raw,
    model=svd,
)

In [80]:
# Five highest-predicted apps for user 10
app_recommender.recommend(user_id=10, number_of_recommendations=5)

      Predicted Rating  App Id
3822          3.815601    3822
2005          3.793054    2005
3941          3.776237    3941
1674          3.771148    1674
4590          3.768556    4590


Unnamed: 0.1,Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,...,Size,Minimum Android,Developer Website,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice
3822,2220011,ドラゴンボールZ ドッカンバトル,com.bandainamcogames.dbzdokkan,Action,4.2,213130.0,"1,000,000+",1000000.0,4886283,True,...,19.2,4.4 and up,1,"May 12, 2021","May 12, 2021",Teen,1,True,False,False
2005,1367348,POKO - Play With New Friends,com.huya.pokogame,Social,4.4,418522.0,"10,000,000+",10000000.0,25098860,True,...,138.0,5.0 and up,1,"Jul 24, 2019","Jan 27, 2021",Teen,1,False,True,False
3941,1892648,Contacts+,com.contapps.android,Communication,3.9,206428.0,"10,000,000+",10000000.0,24056306,True,...,13.0,5.0 and up,1,"Aug 5, 2010","Jun 06, 2021",Everyone,1,True,True,False
1674,1898150,Grim Soul: Dark Fantasy Survival,fantasy.survival.game.rpg,Role Playing,4.4,496622.0,"10,000,000+",10000000.0,16366095,True,...,136.0,4.4 and up,1,"Feb 16, 2018","Jun 08, 2021",Teen,1,False,True,False
4590,1432773,ONE PUNCH MAN: The Strongest (Authorized),com.onepunchman.ggplay.sea,Role Playing,4.1,171847.0,"5,000,000+",5000000.0,6220920,True,...,36.0,4.1 and up,1,"Jun 04, 2021","Jun 04, 2021",Teen,1,False,False,False


In [82]:
# Apps user 10 has already rated, for comparison with the recommendations
app_recommender.get_user_ratings(user_id=10)

Unnamed: 0,0,users_given_rating
0,Shopee 6.6 & 7.7 Mid-Year Sale,4
1,PewDiePie's Tuber Simulator,5
2,WhatsApp Messenger,5
3,"Safe Security - Antivirus, Booster, Phone Cle...",1
4,Amber Weather,2
5,PUBG MOBILE - Traverse,3
6,Video Editor APP - VivaCut,3
7,Garena Free Fire - Rampage,4
