In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
import seaborn as sns
import scipy
import matplotlib.pyplot as plt
import scikitplot as skplt
from wordcloud import WordCloud

In [3]:
from bs4 import BeautifulSoup
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [5]:
# Download the BERT tokenization helper so that `import tokenization` works below.
# NOTE(review): the URL points at the moving `master` branch, so the fetched file can
# change or disappear between runs — consider pinning it to a specific commit.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [6]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import tokenization

In [7]:
import os
import re
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/steam-recommendation-nlp-dataset/game_overview.csv
/kaggle/input/steam-recommendation-nlp-dataset/test.csv
/kaggle/input/steam-recommendation-nlp-dataset/train.csv
/kaggle/input/submission/submission1.csv


In [8]:
# Load the labelled training reviews, the per-game metadata and the unlabelled test reviews.
df = pd.read_csv('/kaggle/input/steam-recommendation-nlp-dataset/train.csv')
game_df = pd.read_csv('/kaggle/input/steam-recommendation-nlp-dataset/game_overview.csv')
testing_df = pd.read_csv('/kaggle/input/steam-recommendation-nlp-dataset/test.csv')

In [9]:
def rep(text):
    """Replace each run of four heart characters in ``text`` with fixed negative wording.

    Every non-overlapping occurrence of '♥♥♥♥' is substituted with the phrase
    "worst bad horrible game"; the rest of the string is returned unchanged.
    """
    return re.sub('♥♥♥♥', "worst bad horrible game", text)

# Apply the heart-character substitution to the review text of both frames.
for frame in (df, testing_df):
    frame['user_review'] = frame['user_review'].apply(rep)

In [10]:
testing_df.head(5)

Unnamed: 0,review_id,title,year,user_review
0,1603,Counter-Strike: Global Offensive,2015.0,"Nice graphics, new maps, weapons and models. B..."
1,1604,Counter-Strike: Global Offensive,2018.0,I would not recommend getting into this at its...
2,1605,Counter-Strike: Global Offensive,2018.0,Edit 11/12/18I have tried playing CS:GO recent...
3,1606,Counter-Strike: Global Offensive,2015.0,The game is great. But the community is the wo...
4,1607,Counter-Strike: Global Offensive,2015.0,I thank TrulyRazor for buying this for me a lo...


In [11]:
def low(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# Lower-case the review text of both frames.
for frame in (df, testing_df):
    frame['user_review'] = frame['user_review'].apply(low)

testing_df.head(5)

Unnamed: 0,review_id,title,year,user_review
0,1603,Counter-Strike: Global Offensive,2015.0,"nice graphics, new maps, weapons and models. b..."
1,1604,Counter-Strike: Global Offensive,2018.0,i would not recommend getting into this at its...
2,1605,Counter-Strike: Global Offensive,2018.0,edit 11/12/18i have tried playing cs:go recent...
3,1606,Counter-Strike: Global Offensive,2015.0,the game is great. but the community is the wo...
4,1607,Counter-Strike: Global Offensive,2015.0,i thank trulyrazor for buying this for me a lo...


In [12]:
def asc(text):
    """Blank out everything in ``text`` that is not an ASCII letter.

    Each character outside a-z / A-Z (digits, punctuation, whitespace, other
    symbols) is replaced by a single space; letters are left untouched.
    """
    return re.sub('[^a-zA-Z]', " ", text)

# Replace every non-letter character with a space in the review text of both frames.
for frame in (df, testing_df):
    frame['user_review'] = frame['user_review'].apply(asc)


testing_df.head(5)

Unnamed: 0,review_id,title,year,user_review
0,1603,Counter-Strike: Global Offensive,2015.0,nice graphics new maps weapons and models b...
1,1604,Counter-Strike: Global Offensive,2018.0,i would not recommend getting into this at its...
2,1605,Counter-Strike: Global Offensive,2018.0,edit i have tried playing cs go recent...
3,1606,Counter-Strike: Global Offensive,2015.0,the game is great but the community is the wo...
4,1607,Counter-Strike: Global Offensive,2015.0,i thank trulyrazor for buying this for me a lo...


In [13]:
# def fstop(text):
#     text = re.sub('.',' . ',text)
#     return text

# df['user_review']=df.user_review.apply(fstop)
# testing_df['user_review']=testing_df.user_review.apply(fstop)


# testing_df.head(5)

In [14]:
testing_df['user_review'].sample(5)

2016    before writting this review  i didn t realise ...
3069    this is a game of patience  upgrading the comp...
7711    product received for freeearly access review  ...
3634    this game is interesting  one of the first mmo...
6903    i really hope the guys who made this game read...
Name: user_review, dtype: object

In [15]:
# Keep only the review text in the test frame.
testing_df.drop(columns=["review_id", "title", "year"], inplace=True)

In [16]:
testing_df.head(5)

Unnamed: 0,user_review
0,nice graphics new maps weapons and models b...
1,i would not recommend getting into this at its...
2,edit i have tried playing cs go recent...
3,the game is great but the community is the wo...
4,i thank trulyrazor for buying this for me a lo...


In [17]:
# The review id is not used as a feature, so remove it from the training frame.
df.drop(columns=['review_id'], inplace=True)

In [18]:
# Left-join the game metadata onto every training review by game title.
result = df.merge(game_df, how='left', on='title')

In [19]:
# Remove the year column from the merged frame.
result.drop(columns=['year'], inplace=True)
result.tail(5)

Unnamed: 0,title,user_review,user_suggestion,developer,publisher,tags,overview
17489,EverQuest II,arguably the single greatest mmorp that exists...,1,Daybreak Game Company,Daybreak Game Company,"['Free to Play', 'Massively Multiplayer', 'RPG...",EverQuest® II is the epitome of massively mult...
17490,EverQuest II,an older game to be sure but has its own cha...,1,Daybreak Game Company,Daybreak Game Company,"['Free to Play', 'Massively Multiplayer', 'RPG...",EverQuest® II is the epitome of massively mult...
17491,EverQuest II,when i frist started playing everquest it wa...,1,Daybreak Game Company,Daybreak Game Company,"['Free to Play', 'Massively Multiplayer', 'RPG...",EverQuest® II is the epitome of massively mult...
17492,EverQuest II,cool game the only thing that really pisses m...,1,Daybreak Game Company,Daybreak Game Company,"['Free to Play', 'Massively Multiplayer', 'RPG...",EverQuest® II is the epitome of massively mult...
17493,EverQuest II,this game since i was a little kid always hav...,1,Daybreak Game Company,Daybreak Game Company,"['Free to Play', 'Massively Multiplayer', 'RPG...",EverQuest® II is the epitome of massively mult...


In [20]:
result.isnull().any()

title              False
user_review        False
user_suggestion    False
developer          False
publisher          False
tags               False
overview           False
dtype: bool

In [21]:
# Remove the game-level overview, developer and publisher columns from the merged frame.
result.drop(columns=["overview", "developer", "publisher"], inplace=True)

In [22]:
result.head(2)

Unnamed: 0,title,user_review,user_suggestion,tags
0,Spooky's Jump Scare Mansion,i m scared and hearing creepy voices so i ll...,1,"['Horror', 'Free to Play', 'Cute', 'First-Pers..."
1,Spooky's Jump Scare Mansion,best game more better than sam pepper s youtu...,1,"['Horror', 'Free to Play', 'Cute', 'First-Pers..."


In [23]:
testing_df.head(2)

Unnamed: 0,user_review
0,nice graphics new maps weapons and models b...
1,i would not recommend getting into this at its...


# Text Processing

In [24]:
def punctuation_removal(messy_str):
    """Return ``messy_str`` with every ``string.punctuation`` character removed."""
    return ''.join(filter(lambda ch: ch not in string.punctuation, messy_str))

In [25]:
# Strip punctuation characters from the review text of the training and test frames.
for frame in (result, testing_df):
    frame['user_review'] = frame['user_review'].apply(punctuation_removal)


In [None]:

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
stop = stopwords.words('english')

In [None]:
# Keep negation words out of the stop list so they survive stop-word removal.
# "don" is included because apostrophes were already turned into spaces by the
# earlier non-letter cleanup, so "don't" reaches the tokenizer as "don" + "t".
# Membership is checked first so re-running this cell does not raise ValueError.
for negation in ("not", "don't", "don"):
    if negation in stop:
        stop.remove(negation)

In [None]:
# Build the lookup list by stripping punctuation from every stop word,
# using the same helper that is applied to the reviews.
stop_words = [punctuation_removal(item) for item in stop]
# print(stop_words)

In [None]:
def stopwords_removal(messy_str):
    """Tokenize ``messy_str`` and return its lower-cased tokens that are not stop words.

    Tokens come from ``word_tokenize``; each one is lower-cased and kept only
    when it is absent from the module-level ``stop_words`` list. The result is
    a list of strings in their original order.
    """
    kept = []
    for token in word_tokenize(messy_str):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return kept

In [None]:
# Turn each review into a list of lower-cased tokens with stop words removed.
for frame in (result, testing_df):
    frame['user_review'] = frame['user_review'].apply(stopwords_removal)


In [None]:
result.head(2)



In [None]:
import re
def drop_numbers(list_text):
    """Join the tokens of ``list_text`` that contain no digit into one string.

    Args:
        list_text: iterable of string tokens.

    Returns:
        The digit-free tokens, in their original order, separated by single
        spaces (an empty string when nothing is kept).
    """
    # Raw string: a plain '\d' is an invalid escape sequence that newer Python
    # versions warn about.
    return ' '.join(token for token in list_text if not re.search(r'\d', token))
# Drop digit-bearing tokens and join each token list back into a single string.
for frame in (result, testing_df):
    frame['user_review'] = frame['user_review'].apply(drop_numbers)


In [None]:
result.head(2)

In [None]:
from textblob import TextBlob
example = "This game is scary and i would recommend it playing"
blob = TextBlob(example)
blob.sentiment

#Note that the subjectivity is still high as it is still an opinion and not a fact

In [43]:
# result_reviews = result['user_review'] 
# testing_reviews = testing_df['user_review'] 

# from tqdm import tqdm
# result_sentiments = []
# for review in tqdm(result_reviews):
#     blob = TextBlob(review)
#     result_sentiments += [blob.sentiment.polarity]
    
    
# from tqdm import tqdm
# testing_sentiments = []
# for review in tqdm(testing_reviews):
#     blob = TextBlob(review)
#     testing_sentiments += [blob.sentiment.polarity]
    
    
# result['sentiment'] = result_sentiments
# testing_df['sentiment'] =testing_sentiments




# loaded_submission = pd.read_csv('/kaggle/input/submission/submission1.csv')
# loaded_submission.tail()
# loaded_submission['sentiment'] = testing_df['sentiment']



# result.user_review[13978] = "good game"

# result.user_review[13978]

In [None]:
# result.sample(20)

# Correlation with ratings

Let's see whether the newly calculated sentiments correlate with the ratings. Our hypothesis is that positive reviews have higher polarity values, and vice versa.

In [42]:
# np.corrcoef(result["user_suggestion"], result["sentiment"])

We observe that there is indeed a positive, but not very strong, correlation (0.378) between the sentiment of the reviews and the ratings given. Although the correlation is weak, it is not close to 0, so sentiment is still a useful feature for making predictions.

# Visualizing the correlation

In [39]:
# sns.boxplot(x=np.array(result["user_suggestion"]),y=np.array(result["sentiment"]))
# plt.xlabel("Recommended")
# plt.ylabel("Sentiment")
# plt.title("Sentiment vs Recommended")
# plt.show()

# Improving the sentiment analysis
This was a very simple introduction to sentiment analysis. The calculated sentiments are only weakly correlated with the ratings, which suggests there is a lot of room for improvement. A few basic steps that can improve our sentiment calculation are:

- Pre-processing: clean up the text, apply stemming/lemmatization, and remove stop words.
- Remove neutral sentences: run sentiment analysis on the individual sentences of a review and remove the neutral ones, then re-run the sentiment analysis on the remaining polar sentences. This will give a more accurate sentiment score.

In [40]:
# result.drop(['title','tags'],axis=1,inplace=True)
# result.head(2)

In [29]:
# result.sentiment.min()

In [41]:
# result.sentiment.max()

In [30]:
# Separate the target column from the remaining columns of the merged frame.
y = result['user_suggestion']
X = result.drop(columns=['user_suggestion'])

In [70]:
from sklearn.model_selection import train_test_split
# Hold out 1% of the labelled rows for a final check; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.01,
    random_state=42,
)

In [71]:
print('\n train X: {} \n train y: {} \n Val X: {} \n val y: {}'.format((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape)))


 train X: (17319, 1) 
 train y: (17319,) 
 Val X: (175, 1) 
 val y: (175,)


# BERT

In [72]:
# Load the uncased BERT encoder (L-24, H-1024, A-16 per the module URL) from TensorFlow Hub.
# trainable=True is passed so the layer's weights can be updated during model.fit.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [73]:
# Read the vocabulary file path and the lower-casing flag from the loaded hub module,
# then build the tokenizer from them so tokenization uses the module's own settings.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [74]:
X_train_bert = X_train.user_review
X_test_bert = X_test.user_review



In [75]:
# from tqdm import tqdm
# train_x_sentiment = []
# for review in tqdm(X_train_bert):
#     blob = TextBlob(review)
#     train_x_sentiment += [blob.sentiment.polarity]

In [76]:
# from tqdm import tqdm
# test_x_sentiment = []
# for review in tqdm(X_test_bert):
#     blob = TextBlob(review)
#     test_x_sentiment += [blob.sentiment.polarity]

In [77]:
# X_train_bert

In [78]:
# X_train_bert.head(2)

In [79]:
def bert_encode(input_text, tokenizer, max_len = 512):
    """Encode texts into fixed-length token-id, mask and segment-id arrays.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    "[CLS]" / "[SEP]" and right-padded with zeros up to ``max_len``.

    Args:
        input_text: iterable of strings to encode.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``, both returning lists.
        max_len: length of every encoded row.

    Returns:
        Tuple ``(token_ids, mask, segment_ids)`` of NumPy arrays with one row
        per input text. The mask is 1 on real tokens and 0 on padding; the
        segment ids are all zeros.
    """
    body_budget = max_len - 2  # room left once "[CLS]" and "[SEP]" are added
    all_ids, all_masks, all_segments = [], [], []

    for raw_text in input_text:
        pieces = tokenizer.tokenize(raw_text)[:body_budget]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        used = len(sequence)
        padding = [0] * (max_len - used)

        all_ids.append(tokenizer.convert_tokens_to_ids(sequence) + padding)
        all_masks.append([1] * used + padding)
        all_segments.append([0] * max_len)

    return np.array(all_ids), np.array(all_masks), np.array(all_segments)

In [80]:
def build_model(bert_layer, max_len = 512):
    """Build and compile a binary classifier on top of ``bert_layer``.

    Args:
        bert_layer: callable Keras layer that accepts
            ``[word_ids, mask, segment_ids]`` and returns a pair whose second
            element is indexed below as ``[batch, position, feature]``.
        max_len: fixed sequence length of the three int32 inputs.

    Returns:
        A compiled ``Model`` with a single sigmoid output unit, binary
        cross-entropy loss and an accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_words_ids')
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Classify from the vector at sequence position 0 only.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated in
    # tf.keras optimizers.
    model.compile(Adam(learning_rate=2e-6), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [81]:
# X_train_bert.user_review.str.len()

In [82]:
# mean_len = X_train_bert.user_review.str.len().mean()

In [83]:
# mean_len

In [84]:
# X_test_bert.user_review.str.len().mean()

In [85]:
# X_train_bert = pd.Series(X_train_bert)
# X_test_bert =pd.Series(X_test_bert)

In [86]:
type(X_train_bert)

pandas.core.series.Series

In [87]:
# Encode the training and held-out reviews into (token ids, mask, segment ids) tuples.
train_input, test_input = (
    bert_encode(reviews.values, tokenizer, max_len=380)
    for reviews in (X_train_bert, X_test_bert)
)
train_labels = y_train.values

In [88]:
testing_input = bert_encode(testing_df.user_review, tokenizer, max_len=380)

In [89]:
type(X_test)

pandas.core.frame.DataFrame

In [90]:
print(type(train_input))
print(type(test_input))
print(type(train_labels))

<class 'tuple'>
<class 'tuple'>
<class 'numpy.ndarray'>


In [91]:
model = build_model(bert_layer, max_len=380)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_words_ids (InputLayer)    [(None, 380)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 380)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 380)]        0                                            
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      [(None, 1024), (None 335141889   input_words_ids[0][0]            
                                                                 input_mask[0][0]           

In [92]:
# jhbuh hgvvhb b  ub

In [95]:
# Fine-tune the model on the encoded training reviews for 3 epochs in batches of 5.
# validation_split=0.1 reserves 10% of the training arrays for validation.
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.1,
    epochs=3,
    batch_size=5
)

Train on 15587 samples, validate on 1732 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [96]:
# Score the held-out split. The sigmoid outputs are turned into 0/1 labels with
# the same vectorised round-and-cast used for the submission, instead of calling
# int() on each one-element array (deprecated in recent NumPy releases).
prediction = model.predict(test_input)
preds = prediction.round().astype(int).ravel()

from sklearn.metrics import accuracy_score
print("Accuracy: ", accuracy_score(y_test.values,preds))

Accuracy:  0.8914285714285715


# Experiment

In [102]:
# X

In [103]:
# super_train = bert_encode(X.user_review.values, tokenizer, max_len=380)

In [104]:
# super_prediction = model.predict(super_train)
# super_pred = []
# for x in super_prediction:
#     super_pred.append(int(x.round()))



In [105]:
# len(super_pred)

In [106]:
# from sklearn.metrics import accuracy_score
# print("Accuracy: ", accuracy_score(super_pred, y.values))

# now adding sentiment to it

In [107]:
# from tqdm import tqdm
# super_sentiment = []
# for review in tqdm(X.user_review.values):
#     blob = TextBlob(review)
#     super_sentiment += [blob.sentiment.polarity]

In [108]:
# def num(x):
#     if x >= 0:
#         return 1
#     elif x <= 0:
#         return -1

In [109]:
# super_dataset = pd.DataFrame(super_pred)
# super_dataset['sentiment']=super_sentiment
# super_dataset['Target']=y.values

In [110]:
# super_dataset.tail(10)

In [111]:
# super_dataset['sentiment']=super_dataset.sentiment.apply(num)

In [112]:
# super_dataset.sample(10)

In [113]:
# super_dataset.shape

In [114]:
# X_super= super_dataset.drop(['Target'],axis=1)
# y_super = super_dataset.Target

In [115]:
# from sklearn.model_selection import train_test_split

# X_train_super, X_test_super, y_train_super, y_test_super = train_test_split(X_super, y_super, test_size = 0.30, random_state = 0)

In [116]:
# print(X_train_super.shape)
# print(X_test_super.shape)

In [117]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()

# X_train_super = pd.DataFrame(scaler.fit_transform(X_train_super), columns = X_super.columns)

# X_test_super = pd.DataFrame(scaler.transform(X_test_super), columns = X_super.columns)

In [118]:
# X_train_super.head()

In [119]:
# from sklearn.metrics import accuracy_score

In [120]:
# from sklearn.linear_model import LogisticRegression

# clf = LogisticRegression(max_iter=200, )
# clf.fit(X_train_super, y_train_super)
# y_pred_super = clf.predict(X_test_super)

# print('LogisticRegression accuracy_score {0:0.4f}'. format(accuracy_score(y_test_super, y_pred_super)))
# clf.score(X_train_super, y_train_super)

In [121]:
# from xgboost import XGBClassifier

# clf= XGBClassifier(learning_rate=0.04, n_estimators=406, max_depth=5,
#                         min_child_weight=4, 
#                          seed=27)
# clf.fit(X_train_super, y_train_super)
# y_pred_super = clf.predict(X_test_super)

# print('XGBClassifier accuracy_score {0:0.4f}'. format(accuracy_score(y_test_super, y_pred_super)))
# clf.score(X_train_super, y_train_super)

In [122]:
# from sklearn.tree import DecisionTreeClassifier

# clf = DecisionTreeClassifier( max_depth=5)
# clf.fit(X_train_super, y_train_super)
# y_pred_super = clf.predict(X_test_super)

# print('DecisionTreeClassifier accuracy_score {0:0.4f}'. format(accuracy_score(y_test_super, y_pred_super)))
# clf.score(X_train_super, y_train_super)

In [123]:
# from catboost import CatBoostClassifier
# clf= CatBoostClassifier( iterations=100,
#                 learning_rate =0.05,
#                  max_depth=4,
#                  leaf_estimation_iterations=20)
# clf.fit(X_train_super, y_train_super)
# y_pred_super = clf.predict(X_test_super)

# print('CatBoostClassifier accuracy_score {0:0.4f}'. format(accuracy_score(y_test_super, y_pred_super)))
# clf.score(X_train_super, y_train_super)

In [124]:
# from sklearn.ensemble import GradientBoostingClassifier
# clf= GradientBoostingClassifier(                          
#     )
# clf.fit(X_train_super, y_train_super)
# y_pred_super = clf.predict(X_test_super)

# print('GradientBoostingClassifier accuracy_score {0:0.4f}'. format(accuracy_score(y_test_super, y_pred_super)))
# clf.score(X_train_super, y_train_super)

In [125]:
# from sklearn.ensemble import BaggingClassifier
# clf= BaggingClassifier(#base_estimator=base_cls,
#     n_estimators=400,
#    )
# clf.fit(X_train_super, y_train_super)
# y_pred_super = clf.predict(X_test_super)

# print('BaggingClassifier accuracy_score {0:0.4f}'. format(accuracy_score(y_test_super, y_pred_super)))
# clf.score(X_train_super, y_train_super)

In [126]:
# from sklearn.experimental import enable_hist_gradient_boosting
# from sklearn.ensemble import HistGradientBoostingClassifier

# clf= HistGradientBoostingClassifier( #loss='binary_crossentropy',
#     learning_rate=0.05,
# #     max_iter=6000,
#     max_depth=6,
# #     max_leaf_nodes=5,
#    )
# clf.fit(X_train_super, y_train_super)
# y_pred_super = clf.predict(X_test_super)

# print('HistGradientBoostingClassifier accuracy_score {0:0.4f}'. format(accuracy_score(y_test_super, y_pred_super)))
# clf.score(X_train_super, y_train_super)

In [127]:
# from lightgbm import LGBMClassifier

# clf= LGBMClassifier(# bagging_freq = 5,
#     bagging_fraction = 1,
#     boost_from_average= 'true',
#     boost =  'gbdt',
# #     feature_fraction =  0.01,
#     learning_rate =  0.05,
# #     max_depth =  15,  
# #     metric = log_loss,
#     min_data_in_leaf =  10,
#     min_sum_hessian_in_leaf =  0.05,
#     num_leaves =  14,
# #     num_threads =  8,
#     tree_learner =  'serial',
#     verbosity =  1
#                    )
# clf.fit(X_train_super, y_train_super)
# y_pred_super = clf.predict(X_test_super)

# print('LGBMClassifier accuracy_score {0:0.4f}'. format(accuracy_score(y_test_super, y_pred_super)))
# clf.score(X_train_super, y_train_super)

# Testing DataFrame Super_output

In [128]:
# submission_super=pd.DataFrame(submission)

In [129]:
# submission_super.drop("review_id",axis=1,inplace=True)

In [130]:
# submission_super.head()

In [131]:
# submission_super.shape

In [132]:
# from tqdm import tqdm
# submission_super_sentiment = []
# for review in tqdm(testing_df.user_review.values):
#     blob = TextBlob(review)
#     submission_super_sentiment += [blob.sentiment.polarity]

In [133]:
# len(submission_super_sentiment)

In [134]:
# submission_super['sentiment']=submission_super_sentiment

In [135]:
# submission_super.head()

In [136]:
# submission_super['sentiment']=submission_super.sentiment.apply(num)

In [137]:
# submission_super = pd.DataFrame(scaler.transform(submission_super), columns = submission_super.columns)

In [138]:
# submission_super.rename(columns = {'user_suggestion':0}, inplace = True) 
# submission_super.head()

# saving the super submission

In [139]:
# X_test_super

In [140]:
# super_predictions = clf.predict(submission_super)

In [141]:
# sss = pd.read_csv('/kaggle/input/steam-recommendation-nlp-dataset/test.csv')

In [142]:
# result_data_df = pd.DataFrame(sss.review_id)

In [143]:
# result_data_df['user_suggestion']=super_predictions

In [144]:
# result_data_df.head()

In [145]:
# result_data_df.to_csv('exp21.csv', index=False)

# Experiment End

# Saving the BERT output

In [146]:
# Predict on the competition test set and store the rounded 0/1 labels in a
# one-column frame (column label 0). The result is bound only to `pred_result`
# so the held-out predictions kept in `prediction` are not overwritten.
pred_result = model.predict(testing_input)
submission = pd.DataFrame(pred_result.round().astype(int))

In [147]:
# Re-read the raw test file to recover the review ids that were dropped from testing_df.
sub = pd.read_csv('/kaggle/input/steam-recommendation-nlp-dataset/test.csv')
submission = submission.assign(review_id=sub['review_id'])

In [148]:
# Put the id column first and give the prediction column its submission name.
submission = submission[['review_id', 0]].rename(columns={0: 'user_suggestion'})

In [149]:
submission.head(4)

Unnamed: 0,review_id,user_suggestion
0,1603,1
1,1604,0
2,1605,0
3,1606,0


In [152]:
submission.to_csv('submission_91.csv', index=False)