In [1]:
import pandas as pd
import numpy as np
import string

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from keras import models, layers
from keras.utils import to_categorical

from sklearn.model_selection._split import train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
games = pd.read_json('https://raw.githubusercontent.com/sshmo/crawler/master/games.jl', lines=True)
games.head()

Unnamed: 0,name,genre,score,score_num,downloads,description
0,Lords Mobile: Kingdom Wars,Strategy,4.3,5946326,"100,000,000+",Are you ready for a REAL fight?\n\nThe true Em...
1,Fishdom,Puzzle,4.4,4565785,"100,000,000+",Never Fishdomed before? Take a deep breath and...
2,State of Survival: Survive the Zombie Apocalypse,Strategy,4.4,1522191,"10,000,000+","""It's been six months since the zombie apocaly..."
3,Genshin Impact,Adventure,4.5,1060121,"10,000,000+","Step into Teyvat, a vast world teeming with li..."
4,Gardenscapes,Casual,4.4,10246959,"100,000,000+",Welcome to Gardenscapes—the first hit from Pla...


In [3]:
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         199 non-null    object 
 1   genre        199 non-null    object 
 2   score        199 non-null    float64
 3   score_num    199 non-null    object 
 4   downloads    199 non-null    object 
 5   description  199 non-null    object 
dtypes: float64(1), object(5)
memory usage: 9.5+ KB


In [4]:
games = games[[ "description", "genre"]]
games.head()

Unnamed: 0,description,genre
0,Are you ready for a REAL fight?\n\nThe true Em...,Strategy
1,Never Fishdomed before? Take a deep breath and...,Puzzle
2,"""It's been six months since the zombie apocaly...",Strategy
3,"Step into Teyvat, a vast world teeming with li...",Adventure
4,Welcome to Gardenscapes—the first hit from Pla...,Casual


In [5]:
games.genre.nunique()

14

In [6]:
games.groupby('genre')['description'].count()

genre
Action          15
Adventure       13
Arcade           1
Board            3
Card             9
Casino          15
Casual          22
Educational      1
Puzzle          17
Racing           5
Role Playing    33
Simulation      15
Sports           9
Strategy        41
Name: description, dtype: int64

In [7]:
# text preprocessing

for index, row in games.iterrows():
    text = row['description']
    # 1.lowercase
    text_lower = text.lower()
    ## 2.Removing Punctuation and unicode chars
    text_lower_unicode = "".join([char for char in text_lower if char not in string.punctuation])
    text_lower_unicode = text_lower_unicode.encode('ascii', 'ignore').decode()
    ### 3.Tokenization
    text_lower_unicode_tokenized = word_tokenize(text_lower_unicode)
    #### 4.Stopword Filtering
    text_lower_unicode_tokenized_filtered = [w for w in text_lower_unicode_tokenized if not w in stopwords.words('english')]
    ##### 5.Stemming
    porter = PorterStemmer()
    text_lower_unicode_tokenized_filtered_stemming = [porter.stem(w) for w in text_lower_unicode_tokenized_filtered]
    ###### 6.add to DataFrame
    games.loc[index] = {'description': ' '.join(text_lower_unicode_tokenized_filtered_stemming), 'genre': row['genre']}
    
games.head()

Unnamed: 0,description,genre
0,readi real fight true emperor fallen need real...,Strategy
1,never fishdom take deep breath dive underwat w...,Puzzle
2,six month sinc zombi apocalyps began viru infe...,Strategy
3,step teyvat vast world teem life flow element ...,Adventure
4,welcom gardenscapesth first hit playrix scape ...,Casual


In [8]:
# vectorizer

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
X = vectorizer.fit(games['description']).transform(games['description'])

X.shape

(199, 5548)

In [9]:
# label encoder

le = preprocessing.LabelEncoder()
y = le.fit(games['genre']).transform(games['genre'])
np.unique(games['genre'])

y.shape

(199,)

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, to_categorical(y, 14))

In [11]:
x_train.shape

(149, 5548)

In [12]:
# create model 

model = models.Sequential()
model.add(layers.Dense(200, activation="relu", input_shape=(5548,)))
model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
model.add(layers.Dense(200, activation="relu"))
model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
model.add(layers.Dense(200, activation="relu"))
model.add(layers.Dense(14, activation="sigmoid"))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 200)               1109800   
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               40200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_3 (Dense)              (None, 14)                2814      
Total params: 1,193,014
Trainable params: 1,193,014
Non-trainable params: 0
______________________________________________

In [13]:
# compile model configuration

model.compile(optimizer="adam",
              loss='binary_crossentropy',
              metrics=["accuracy"])

In [14]:
# fit model 

results = model.fit(x_train.toarray(),y_train,
                    epochs=30,
                    batch_size=1,
                    validation_data=(x_test.toarray(), y_test))

print("Deep acc: ", np.mean(results.history['val_accuracy']))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Deep acc:  0.5720000048478444


In [15]:
# create model 

model = models.Sequential()
model.add(layers.Dense(500, activation="relu", input_shape=(5548,)))
model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
model.add(layers.Dense(150, activation="relu"))
model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
model.add(layers.Dense(100, activation="relu"))
model.add(layers.Dense(14, activation="sigmoid"))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 500)               2774500   
_________________________________________________________________
dropout_2 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 150)               75150     
_________________________________________________________________
dropout_3 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 100)               15100     
_________________________________________________________________
dense_7 (Dense)              (None, 14)                1414      
Total params: 2,866,164
Trainable params: 2,866,164
Non-trainable params: 0
____________________________________________

In [16]:
# compile model configuration

model.compile(optimizer="adam",
              loss='binary_crossentropy',
              metrics=["accuracy"])

In [19]:
# fit model 

results = model.fit(x_train.toarray(),y_train,
                    epochs=30,
                    batch_size=1,
                    validation_data=(x_test.toarray(), y_test))

print("Deep acc: ", np.mean(results.history['val_accuracy']))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Deep acc:  0.6446666737397512


In [20]:
# save my model in HardDisk

model.save('naringame_DL_genre_model.h5')