Задача: написать рекомендательная модель, на данных, представленных в датасете с сайта игр. Необходимо, чтобы модель, предлагала пользователям игры, основываясь на их оценках. 

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('reviews_Video_Games.csv', sep=';')
df['reviewTime'] = pd.to_datetime(df['reviewTime'])
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,reviewTime
0,0,A2HD75EMZR8QLN,700099867,123,"[8, 12]",Installing the game was a struggle (because of...,1.0,Pay to unlock content? I don't think so.,2012-07-09
1,1,A3UR8NLLY1ZHCX,700099867,"Alejandro Henao ""Electronic Junky""","[0, 0]",If you like rally cars get this game you will ...,4.0,Good rally game,2013-06-30
2,2,A1INA0F5CWW3J4,700099867,"Amazon Shopper ""Mr.Repsol""","[0, 0]",1st shipment received a book instead of the ga...,1.0,Wrong key,2014-06-28
3,3,A1DLMTOTHQ4AST,700099867,ampgreen,"[7, 10]","I got this version instead of the PS3 version,...",,"awesome game, if it did not crash frequently !!",2011-09-14
4,4,A361M14PU2GUEG,700099867,"Angry Ryan ""Ryan A. Forrest""","[2, 2]",I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,DIRT 3,2011-06-14


<br>•	reviewerID - уникальный ID пользователя например A2SUAM1J3GNN3B
<br>•	asin - уникальный ID игры например 0000013714
<br>•	reviewerName - имя ревьюэра
<br>•	helpful – признак насколько полезен рейтинг
<br>•	reviewText – текст ревью
<br>•	overall – рейтинг
<br>•	summary – краткое суммари по ревью
<br>•	reviewTime – время ревью

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185206 entries, 0 to 185205
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Unnamed: 0    185206 non-null  int64         
 1   reviewerID    185206 non-null  object        
 2   asin          185206 non-null  object        
 3   reviewerName  182939 non-null  object        
 4   helpful       185206 non-null  object        
 5   reviewText    185166 non-null  object        
 6   overall       167628 non-null  float64       
 7   summary       185198 non-null  object        
 8   reviewTime    185206 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(6)
memory usage: 12.7+ MB


Для начала нужно восстановить пропущенные оценки пользователей. Воспользуемся их текстом и оценкой, для обучения модели. (остальные признаки на первый взгляд не очень полезны)

In [7]:
# df_ = df.drop(['Unnamed: 0', 'reviewerName', 'reviewText', 'helpful', 'summary', 'reviewTime'], axis=1)
# df_.head()

Unnamed: 0,reviewerID,asin,overall
0,A2HD75EMZR8QLN,700099867,1.0
1,A3UR8NLLY1ZHCX,700099867,4.0
2,A1INA0F5CWW3J4,700099867,1.0
3,A1DLMTOTHQ4AST,700099867,
4,A361M14PU2GUEG,700099867,4.0


In [8]:
nan = df.loc[df['overall'].isnull()]

In [9]:
list_of_nan= list(nan['Unnamed: 0'])

In [10]:
df_wo_nan = df.dropna()

In [11]:
#X, y = df['reviewText'], df_wo_nan['overall']
X_train, y_train = df_wo_nan['reviewText'][:150000], df_wo_nan['overall'][:150000]
X_train_t, y_train_t = df_wo_nan['reviewText'][150000:], df_wo_nan['overall'][150000:]
X_test, y_test = nan['reviewText'], nan['overall']

In [12]:
cv = CountVectorizer()
logit = LogisticRegression(n_jobs=-5, random_state=42)

In [13]:
cv.fit(X_train, y_train)

CountVectorizer()

In [14]:
X_tr = cv.transform(X_train)
x_te = cv.transform(X_test.values.astype('U'))

In [17]:
logit.fit(X_tr, y_train)

LogisticRegression(n_jobs=-5, random_state=42)

In [18]:
pred = logit.predict((x_te))

In [19]:
pred_t = logit.predict(cv.transform(X_train_t))

In [20]:
print("score:", round(accuracy_score(y_train_t, pred_t),2))

score: 0.63


LogisticRegression 0.63. Слабовато.

In [21]:
cv.vocabulary_.get(u'algorithm')

11856

In [103]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

In [106]:
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3,random_state=42)),])

In [112]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'clf__alpha': (1e-2, 1e-3),}

In [98]:
from xgboost import XGBClassifier

In [130]:
model = XGBClassifier(max_depth=10, random_state=42, n_estimators=10, n_jobs=-1)
model.fit(X_tr, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=10, n_jobs=-1,
              nthread=None, objective='multi:softprob', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [132]:
predict_xgb = model.predict(X_train_t_tr)

In [134]:
print("score:", round(accuracy_score(y_train_t, predict_xgb),2))

score: 0.59


Logit лучше предсказывает, чем XGB. Возможно не настроены гиперпараметры, но перебором по сетке параметров показало, что это лучший результат текущий.

In [22]:
nan_w_pred = nan.copy()
nan_w_pred['overall'] = pred
nan_w_pred.head()

Unnamed: 0.1,Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,reviewTime
3,3,A1DLMTOTHQ4AST,700099867,ampgreen,"[7, 10]","I got this version instead of the PS3 version,...",1.0,"awesome game, if it did not crash frequently !!",2011-09-14
8,9,A2JLT2WY0F2HVI,700099867,D. Sweetapple,"[1, 1]",I still haven't figured this one out. Did ever...,5.0,Couldn't get this one to work,2014-02-08
19,25,A37M0B3NHDHN9V,6050036071,Fernando,"[0, 0]","Works good, however is not ""like a new"" with a...",5.0,Works good!,2012-04-10
25,34,AZGQXYB4TAODL,7100027950,FightForTheLost,"[0, 6]",The greatest bait and switch betrayal of a fan...,1.0,The most hated videogame of all time and great...,2014-02-21
31,41,A2ZYJOZO6BPV6K,7293000936,"H. Clark ""vivchick""","[1, 1]",This product is very good and works fine. Buy ...,5.0,For the price its great.,2011-09-05


In [23]:
total = df_wo_nan.merge(nan_w_pred, how='outer')
total

Unnamed: 0.1,Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,reviewTime
0,0,A2HD75EMZR8QLN,0700099867,123,"[8, 12]",Installing the game was a struggle (because of...,1.0,Pay to unlock content? I don't think so.,2012-07-09
1,1,A3UR8NLLY1ZHCX,0700099867,"Alejandro Henao ""Electronic Junky""","[0, 0]",If you like rally cars get this game you will ...,4.0,Good rally game,2013-06-30
2,2,A1INA0F5CWW3J4,0700099867,"Amazon Shopper ""Mr.Repsol""","[0, 0]",1st shipment received a book instead of the ga...,1.0,Wrong key,2014-06-28
3,4,A361M14PU2GUEG,0700099867,"Angry Ryan ""Ryan A. Forrest""","[2, 2]",I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,DIRT 3,2011-06-14
4,5,A2UTRVO4FDCBH6,0700099867,A.R.G.,"[0, 0]","Overall this is a well done racing game, with ...",4.0,"Good racing game, terrible Windows Live Requir...",2013-05-11
...,...,...,...,...,...,...,...,...,...
183101,231693,A244FWBXEBGG10,B00JM3R6M6,Raven,"[1, 3]",Had Fun With It But Starting To Get Boring,5.0,Fun But Gets Boring When Unlock Everything,2014-07-12
183102,231703,ATKKCGDROU0X0,B00JQ8YH6A,Jpaul,"[2, 6]",As far as Nancy Drew games go this one is very...,3.0,Not a typical mystery but still good,2014-06-04
183103,231741,A3TAS1AG6FMBQW,B00JXW6GE0,"Jeri Zerr ""formerly a Road Warrior of the Nth...","[0, 0]",The KLORTS wired gaming mouse is such a freaki...,5.0,The KLORTS wired gaming mouse is such a freaki...,2014-07-16
183104,231754,A3E89VW74Z8TK8,B00KAI3KW2,FallenSphinx,"[2, 9]",With the unneeded kinect out of the way this c...,5.0,The Future,2014-06-18


In [24]:
new_total = total.drop(['Unnamed: 0', 'reviewerName', 'reviewText', 'helpful', 'summary', 'reviewTime'], axis=1)
new_total.head()

Unnamed: 0,reviewerID,asin,overall
0,A2HD75EMZR8QLN,700099867,1.0
1,A3UR8NLLY1ZHCX,700099867,4.0
2,A1INA0F5CWW3J4,700099867,1.0
3,A361M14PU2GUEG,700099867,4.0
4,A2UTRVO4FDCBH6,700099867,4.0


Восстановили и объеденили со целым датасетом. Далее займемся рекомендациями. Воспользуемся LightFM. Можно и другими методами (кор Пирсона и тд), но библиотека хорошая и не надо pivot_table делать (компьютер старый, 3 раза зависал).

In [25]:
#!pip install lightfm
from lightfm import LightFM

In [26]:
#X, y = new_total[:150000], new_total[150000:]
X = new_total[:]

In [27]:
from sklearn.preprocessing import LabelEncoder
from lightfm.data import Dataset

In [29]:
le = LabelEncoder()
X_new = X.copy()
le.fit(X_new.reviewerID)
X_new['reviewerID'] = le.transform(X.reviewerID)

Нужно создать датасет sparse matrix для модели.

In [30]:
dataset = Dataset()

In [31]:
dataset.fit(users = (row['reviewerID'] for index, row in X.iterrows()),
            items = (row['asin'] for index, row in X.iterrows()))

In [32]:
(interactions, weights) = dataset.build_interactions((row['reviewerID'], row['asin'], row['overall']) for index, row in X.iterrows())

In [33]:
model_cl = LightFM(no_components=20, loss='warp')

In [34]:
model_cl.fit(interactions,
             user_features=None,
             item_features=None, 
             sample_weight=None, 
             epochs=30, 
             num_threads=4)

<lightfm.lightfm.LightFM at 0x273b5d17d88>

Модель обучена, делаем предсказания.

In [312]:
def sample_recommendation(model, data, user_ids, le): 
#на подаем(модель, данные df после Label enc, списки пользователей (string или int(id)), Label Enc)
    #number of users in training data
    n_users, n_items = data['reviewerID'].shape[0], data['asin'].shape[0]
    ids_ = []
    #проверка входа
    if isinstance(user_ids[0], int):
        flag = 0
        pass
    else:
        for user in range(len(user_ids)):
            user_ids[user] = int(le.transform([user_ids[user]]))
            ids = data.loc[data['reviewerID'] == user_ids[user]]
#             print(ids) # каким играм отметки ставил уже
            to_list = list(ids.index)
            ids_.append(to_list[0])
        flag = 1
    #generate recommendations for each user we input
    num_items = len(set(list(data['asin'])))
    if (flag == 1):     
        for user_id in ids_:
            scores = model.predict(user_id, np.arange(num_items))
            #rank them in order of most liked to least
            top_items = data['asin'][np.argsort(-scores)]

            #print out the results 
            print("For user %s" % le.inverse_transform([data['reviewerID'][user_id]]) + ' model recommended next games:')

            for x in top_items[:3]:
                print("        %s" % x)            
    elif (flag == 0):
        for user_id in user_ids:
            scores = model.predict(user_id, np.arange(num_items))
            #rank them in order of most liked to least
            top_items = data['asin'][np.argsort(-scores)]

            #print out the results
            print("For user %s" % user_id + ' model recommended next games:')

            for x in top_items[:3]:
                print("        %s" % x)

# sample_recommendation(model_cl, X_new, [0, 1, 2], le)
sample_recommendation(model_cl, X_new, ['A2HD75EMZR8QLN', 'A3UR8NLLY1ZHCX', 'A1INA0F5CWW3J4'], le)

For user 0 model recommended nex games:
        B00000K2R4
        B00000K4MC
        B00000INR2
For user 1 model recommended nex games:
        B000034DC2
        B00004T72D
        B00004RBQX
For user 2 model recommended nex games:
        B00004T72D
        B0000488VP
        B00002SVMG


На выходе получаем ID игры для пользователя. Использовал для предсказания пропущенных строк оценок Логистическую регрессию. Для рекомендаций: LigthFM.  
Что можно улучшить: использовать другую модель предсказания пропущенных оценок (0.63 ну очень смешно дял серьезной задачи). Для LigthFM добрать и использовать весь датасет для создания user_features и item_features, чтобы улучшить качество. 