In [25]:
import pandas as pd
import numpy as np

import string

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection._split import train_test_split
from sklearn.feature_selection.univariate_selection import SelectPercentile
from sklearn.feature_selection import chi2

#algorithms
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn import svm
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble.weight_boosting import AdaBoostClassifier
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics.classification import f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix



nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amirh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
games = pd.read_json('https://raw.githubusercontent.com/sshmo/crawler/master/games.jl', lines=True)
games.head()

Unnamed: 0,name,genre,score,score_num,downloads,description
0,Lords Mobile: Kingdom Wars,Strategy,4.3,5946326,"100,000,000+",Are you ready for a REAL fight?\n\nThe true Em...
1,Fishdom,Puzzle,4.4,4565785,"100,000,000+",Never Fishdomed before? Take a deep breath and...
2,State of Survival: Survive the Zombie Apocalypse,Strategy,4.4,1522191,"10,000,000+","""It's been six months since the zombie apocaly..."
3,Genshin Impact,Adventure,4.5,1060121,"10,000,000+","Step into Teyvat, a vast world teeming with li..."
4,Gardenscapes,Casual,4.4,10246959,"100,000,000+",Welcome to Gardenscapes—the first hit from Pla...


In [3]:
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         199 non-null    object 
 1   genre        199 non-null    object 
 2   score        199 non-null    float64
 3   score_num    199 non-null    object 
 4   downloads    199 non-null    object 
 5   description  199 non-null    object 
dtypes: float64(1), object(5)
memory usage: 9.5+ KB


In [4]:
games = games[[ "description", "genre"]]
games.head()

Unnamed: 0,description,genre
0,Are you ready for a REAL fight?\n\nThe true Em...,Strategy
1,Never Fishdomed before? Take a deep breath and...,Puzzle
2,"""It's been six months since the zombie apocaly...",Strategy
3,"Step into Teyvat, a vast world teeming with li...",Adventure
4,Welcome to Gardenscapes—the first hit from Pla...,Casual


In [5]:
# text preprocessing

for index, row in games.iterrows():
    text = row['description']
    # 1.lowercase
    text_lower = text.lower()
    ## 2.Removing Punctuation and unicode chars
    text_lower_unicode = "".join([char for char in text_lower if char not in string.punctuation])
    text_lower_unicode = text_lower_unicode.encode('ascii', 'ignore').decode()
    ### 3.Tokenization
    text_lower_unicode_tokenized = word_tokenize(text_lower_unicode)
    #### 4.Stopword Filtering
    text_lower_unicode_tokenized_filtered = [w for w in text_lower_unicode_tokenized if not w in stopwords.words('english')]
    ##### 5.Stemming
    porter = PorterStemmer()
    text_lower_unicode_tokenized_filtered_stemming = [porter.stem(w) for w in text_lower_unicode_tokenized_filtered]
    ###### 6.add to DataFrame
    games.loc[index] = {'description': ' '.join(text_lower_unicode_tokenized_filtered_stemming), 'genre': row['genre']}
    
games.head()

Unnamed: 0,description,genre
0,readi real fight true emperor fallen need real...,Strategy
1,never fishdom take deep breath dive underwat w...,Puzzle
2,six month sinc zombi apocalyps began viru infe...,Strategy
3,step teyvat vast world teem life flow element ...,Adventure
4,welcom gardenscapesth first hit playrix scape ...,Casual


In [6]:
# vectorizer

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
X = vectorizer.fit(games['description']).transform(games['description'])

X.shape

(199, 5548)

In [7]:
# label encoder

le = preprocessing.LabelEncoder()
y = le.fit(games['genre']).transform(games['genre'])
np.unique(games['genre'])

y.shape

(199,)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [9]:
# Feature Elimination

#ch2 = SelectPercentile(chi2, 80)
#X_train = ch2.fit_transform(X_train, y_train)
#X_test = ch2.transform(X_test)

In [10]:
#np.random.seed(42)
# algoritms

# 1.SGDClassifier max : 74 --> seed : 23
np.random.seed(23)

sgd = SGDClassifier()
sgd.fit(X_train, y_train)
score = sgd.score(X_test, y_test)
print('SDG score: ' + str(score))

SDG score: 0.58


In [11]:
## 2.SVC max : 0.3 --> seed : *

svmc = svm.SVC()
svmc.fit(X_train, y_train)
score = svmc.score(X_test, y_test)
print('svm score: ' + str(score))

svm score: 0.34


In [12]:
## 2.1SVC max : 0.72 --> seed : *

svmlc = svm.SVC(kernel='linear', C=1.2)
svmlc.fit(X_train, y_train)
score = svmlc.score(X_test, y_test)
print('svm linear score: ' + str(score))

svm linear score: 0.56


In [13]:
### 3.KNNClassifier max : 0.68 --> seed : *

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
score = knn.score(X_test, y_test)
print('knn score: ' + str(score))

knn score: 0.58


In [14]:
### 3.1KNNClassifier max : 0.66 --> seed : *

knn2 = KNeighborsClassifier(n_neighbors=6, weights='distance')
knn2.fit(X_train, y_train)
score = knn2.score(X_test, y_test)
print('knn2 score: ' + str(score))

knn2 score: 0.62


In [15]:
#### 4.MultinomialNB max : 0.3 --> seed : *

mnnb = MultinomialNB()
mnnb.fit(X_train, y_train)
score = mnnb.score(X_test, y_test)
print('mnnb score: ' + str(score))

mnnb score: 0.36


In [16]:
##### 5.AdaBoost Classsifier max : 0.3 --> seed : 23
np.random.seed(23)

abc = AdaBoostClassifier()
abc.fit(X_train, y_train)
score = abc.score(X_test, y_test)
print('abc score: ' + str(score))

abc score: 0.26


In [17]:
##### 5.1AdaBoost Classsifier max : 0.3 --> seed : 23
np.random.seed(23)

abc2 = AdaBoostClassifier(n_estimators=100)
abc2.fit(X_train, y_train)
score = abc2.score(X_test, y_test)
print('abc2 score: ' + str(score))

abc2 score: 0.24


In [18]:
###### 6.RandomForest Classifier max : 0.62 --> seed : 12
np.random.seed(12)

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
score = rfc.score(X_test, y_test)
print('rfc score: ' + str(score))

rfc score: 0.5


In [19]:
###### 6.1RandomForest Classifier max : 0.72 --> seed : 93
np.random.seed(93)

rfc2 = RandomForestClassifier(n_estimators=163)
rfc2.fit(X_train, y_train)
score = rfc2.score(X_test, y_test)
print('rfc2 score: ' + str(score))

rfc2 score: 0.48


In [20]:
####### 7.Voting Classifier max : 0.7 --> seed : 41
np.random.seed(41)

vcls = VotingClassifier(estimators=[('randomforest', rfc2), ('naivebayes', mnnb), ('knn', knn), ('svm', svmlc)])
vcls.fit(X_train, y_train)
score = vcls.score(X_test, y_test)
print('vcls score: ' + str(score))

vcls score: 0.6


In [21]:
######## 8.MultinomialNB

mnnb = MultinomialNB(alpha=0.01, fit_prior=True)
mnnb.fit(X_train, y_train)
score = mnnb.score(X_test, y_test)
print('mnnb score: ' + str(score))

mnnb score: 0.64


In [32]:
mnnb_predict = mnnb.predict(X_test)
mnnb_recall = recall_score(y_test, mnnb_predict, average='macro')
mnnb_precision = precision_score(y_test, mnnb_predict, average='macro')
mnnb_f1 = f1_score(y_test, mnnb_predict, average='macro')
mnnb_conf = confusion_matrix(y_test, mnnb_predict)
print("mnnb recal: " + str(mnnb_recall))
print("mnnb precision: " + str(mnnb_precision))
print("mnnb f1: " + str(mnnb_f1))
print("mnnb confusion matrix: \n" + str(mnnb_conf))

mnnb recal: 0.563860498475883
mnnb precision: 0.5583826429980276
mnnb f1: 0.5358551704705551
mnnb confusion matrix: 
[[2 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 2 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 3 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 2 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 3 0 0 1 0 0]
 [1 0 0 0 0 0 0 0 1 0 0 0 0]
 [2 0 0 0 0 0 0 0 0 5 0 0 4]
 [0 0 0 0 0 0 1 0 0 0 2 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 3 0]
 [2 0 0 0 0 0 1 0 1 0 0 0 9]]


In [33]:
vcls_predict = vcls.predict(X_test)
vcls_recall = recall_score(y_test, vcls_predict, average='macro')
vcls_precision = precision_score(y_test, vcls_predict, average='macro')
vcls_f1 = f1_score(y_test, vcls_predict, average='macro')
vcls_conf = confusion_matrix(y_test, vcls_predict)
print("vcls recal: " + str(vcls_recall))
print("vcls precision: " + str(vcls_precision))
print("vcls f1: " + str(vcls_f1))
print("vcls confusion matrix: \n" + str(vcls_conf))

vcls recal: 0.5168220668220668
vcls precision: 0.5986111111111111
vcls f1: 0.49419002050581
vcls confusion matrix: 
[[ 0  0  0  0  0  0  0  0  0  0  0  2]
 [ 0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  1]
 [ 0  1  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  3  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  3  0  0  0  0  0  0]
 [ 0  0  0  0  0  3  2  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  1  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  6  0  0  4]
 [ 0  0  0  0  0  1  0  0  0  1  0  1]
 [ 0  0  0  0  0  2  0  0  2  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0 12]]


In [34]:
knn_predict = knn.predict(X_test)
knn_recall = recall_score(y_test, knn_predict, average='macro')
knn_precision = precision_score(y_test, knn_predict, average='macro')
knn_f1 = f1_score(y_test, knn_predict, average='macro')
knn_conf = confusion_matrix(y_test, knn_predict)
print("knn recal: " + str(knn_recall))
print("knn precision: " + str(knn_precision))
print("knn f1: " + str(knn_f1))
print("svmlc confusion matrix: \n" + str(knn_conf))

knn recal: 0.5259817105970952
knn precision: 0.4756410256410256
knn f1: 0.48369963369963376
svmlc confusion matrix: 
[[0 0 0 0 0 0 0 0 0 0 0 0 2]
 [0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 2 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 3 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 2 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 3 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 2 0 0 0 0]
 [2 0 0 0 0 0 0 0 0 6 0 0 3]
 [0 0 0 0 0 0 1 0 0 1 1 0 0]
 [0 0 0 0 1 1 0 0 1 1 0 0 0]
 [2 0 0 0 0 0 0 0 1 1 0 0 9]]


In [35]:
knn2_predict = knn2.predict(X_test)
knn2_recall = recall_score(y_test, knn2_predict, average='macro')
knn2_precision = precision_score(y_test, knn2_predict, average='macro')
knn2_f1 = f1_score(y_test, knn2_predict, average='macro')
knn2_conf = confusion_matrix(y_test, knn2_predict)
print("knn2 recal: " + str(knn2_recall))
print("knn2 precision: " + str(knn2_precision))
print("knn2 f1: " + str(knn2_f1))
print("knn2 confusion matrix: \n" + str(knn2_conf))

knn2 recal: 0.551129639591178
knn2 precision: 0.5324283559577677
knn2 f1: 0.5238577212261423
knn2 confusion matrix: 
[[ 0  0  0  0  0  0  0  0  0  0  0  0  2]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  2  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  3  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  2  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  3  0  0  1  0  0]
 [ 0  0  0  0  0  0  0  0  2  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  6  0  0  4]
 [ 0  0  0  0  0  0  1  0  0  1  1  0  0]
 [ 0  0  0  0  1  1  0  0  0  1  0  1  0]
 [ 1  0  0  0  0  0  0  0  1  0  0  1 10]]


In [36]:
svmlc_predict = svmlc.predict(X_test)
svmlc_recall = recall_score(y_test, svmlc_predict, average='macro')
svmlc_precision = precision_score(y_test, svmlc_predict, average='macro')
svmlc_f1 = f1_score(y_test, svmlc_predict, average='macro')
svmlc_conf = confusion_matrix(y_test, svmlc_predict)
print("svmlc recal: " + str(svmlc_recall))
print("svmlc precision: " + str(svmlc_precision))
print("svmlc f1: " + str(svmlc_f1))
print("svmlc confusion matrix: \n" + str(svmlc_conf))

svmlc recal: 0.44459984459984464
svmlc precision: 0.4152777777777777
svmlc f1: 0.37114362245941196
svmlc confusion matrix: 
[[ 0  0  0  0  0  0  0  0  0  0  0  2]
 [ 0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  1]
 [ 0  1  0  0  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  3  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  3  0  0  0  0  0  0]
 [ 0  1  0  0  0  3  1  0  0  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  1]
 [ 1  0  0  0  0  0  0  0  6  0  0  4]
 [ 0  0  0  0  0  1  0  0  0  2  0  0]
 [ 0  0  0  0  0  2  0  0  2  0  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0 12]]


In [22]:
# save best model
import pickle

filename = 'naringame_ML_genre_model.sav'
pickle.dump(mnnb, open(filename, 'wb'))

In [23]:
## load model
### some times later ...

#loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, Y_test)
#print(result)