In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

import random
from collections import defaultdict
from tqdm import tqdm
import gzip

import os


# Single seed shared by every random number generator used in this notebook.
RANDOM_SEED = 0

# Seed the stdlib and NumPy global generators first ...
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(RANDOM_SEED)
# ... then torch; kept as the cell's final expression, so the returned
# Generator object is what the cell displays.
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x1a95b88b310>

In [2]:
def read_json(path):
    """Lazily yield one evaluated record per line of a gzip-compressed file.

    The first line of the file is read and discarded; each remaining line is
    passed to ``eval`` and the resulting object is yielded.

    Args:
        path: Location of the gzip file, handed straight to ``gzip.open``.

    Yields:
        The object each remaining line evaluates to.
    """
    # The context manager closes the handle when the generator is exhausted,
    # closed early, or garbage-collected; previously it was never closed.
    with gzip.open(path) as f:
        # NOTE(review): the first line is skipped unconditionally — confirm it
        # is a header and not a real record.
        f.readline()
        for line in f:
            # SECURITY(review): eval() executes arbitrary code found in the
            # file. Only use on trusted data; if the records are plain Python
            # literals, ast.literal_eval would be a safer replacement.
            entry = eval(line)
            yield entry

# Drain the record generator into a list, then build the frame in one step.
data = list(read_json('train.json.gz'))
df: pd.DataFrame = pd.DataFrame(data)

In [3]:
# Replace every missing value in the frame with 0 (mutates `df` in place).
# NOTE(review): axis=1 looks unnecessary for a scalar fill value — confirm
# before dropping it, since it may affect the resulting dtypes.
df.fillna(value=0, axis=1, inplace=True)

# Collapse `compensation` to a flag: values equal to 0 are kept as they are,
# anything else becomes 1.
df['compensation'] = df['compensation'].map(lambda value: 1 if value != 0 else value)

# Store both indicator columns as 32-bit integers.
flag_columns = ['early_access', 'compensation']
df[flag_columns] = df[flag_columns].astype(np.int32)

In [4]:
# One row per gameID: all of that game's `text` values joined with single spaces.
text_by_game = df.groupby('gameID')['text']
dftext = text_by_game.apply(' '.join).reset_index()
dftext.head()

Unnamed: 0,gameID,text
0,g00045539,"The bard... He's a ♥♥♥♥♥♥♥. Selfish, egoistic,..."
1,g00083675,Recomended. Painkiller eater simulator 10/10 t...
2,g00087199,yes. This game is fun because Slither.io is la...
3,g00102027,I don't even consider them games. What's the a...
4,g00125299,"Was kinda expecting the House of the Dead, jus..."


In [76]:
from nltk.stem import PorterStemmer,LancasterStemmer
import string
from sklearn import feature_extraction

# Binary (presence/absence) bag of unigrams and bigrams over the per-game text.
# English stop words are removed, terms must fall between the min_df and
# max_df document-frequency thresholds, and the vocabulary is capped at
# 2000 features.
counter = feature_extraction.text.CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=0.01,
    max_df=0.5,
    max_features=2000,
    binary=True,
)
wordcount = counter.fit_transform(dftext['text'])

In [96]:
# Spot check: is the term "shooter" part of the fitted vectorizer's vocabulary?
"shooter" in counter.vocabulary_

True

In [87]:
# Print every term of the fitted vocabulary on its own line; `i` ends up
# holding the number of terms printed.
i = 0
for term in counter.vocabulary_.keys():
    print(term)
    i = i + 1

interested
incredible
alot
creatures
funny
voice
acting
rpg
classic
buggy
read
visuals
finished
windows
ago
sad
job
effort
interface
listen
following
direction
360
support
generic
controller
unplayable
entirely
ends
tale
hidden
gem
friend
sitting
library
awhile
decided
adventure
view
storyline
close
men
brought
wonderful
spin
tells
face
quick
result
shot
playthrough
wonder
taken
seriously
heavily
invested
create
special
exactly
fantasy
genre
creating
entry
uses
magic
personal
gain
stop
evil
course
happens
begins
humor
breaking
wall
everybody
lines
tech
trees
list
abilities
complaint
occasional
weapon
upgrade
fresh
enjoying
silly
exploring
setting
fans
haven
heard
hardly
hold
release
nostalgia
esque
dark
atmosphere
relatively
pleasant
land
encounter
hilarious
individual
majority
dated
stories
blast
school
today
era
complex
expecting
port
console
believe
excellent
laugh
forever
tree
bigger
weapons
fighting
monsters
eventually
girl
live
case
shows
effects
shoot
beautiful
follow
plot
writi

In [97]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 20-component LDA model on the term matrix; `reps` holds the transformed
# representation returned by fit_transform, one row per row of `wordcount`.
LDA = LatentDirichletAllocation(
    n_components=20,
    random_state=RANDOM_SEED,
)
reps = LDA.fit_transform(wordcount)

In [98]:
# Number of rows in `df` that belong to the first game listed in `dftext`.
first_game_id = dftext.iloc[0]['gameID']
rows_per_game = df['gameID'].value_counts()
rows_per_game[first_game_id]

26

In [102]:
# Concatenated text of the first game in `dftext`.
dftext['text'].iloc[0]

'The bard... He\'s a ♥♥♥♥♥♥♥. Selfish, egoistic, zynic, mostly interested in cleavage, maybe also beer. The game pulls out an incredible amount of humour. And combat goes alot about summoning creatures. Pros:\n+ funny (!!)\n+ voice acting\n+ songs\n+ RPG classic\nCons:\n- buggy (!!!) Linux users read below\n- outdated visuals\nI finished it on Windows many many years ago and, hell, it was fun! It\'s sad whoever did the job of porting it to Linux didn\'t put much effort into it.\nInterface often doesn\'t listen, in-game map stops following player\'s direction after 360 turn, support for generic controller (anything but xbox360) is only partial (and can\'t remap), abusive particle system sometimes renders game unplayable (if unlucky, one torch on the map will freeze the game entirely). ...and thus ends the Bard\'s Tale.  This was a GREAT hidden gem, which I think a friend sent me.  In anycase it\'s been sitting in my game library for awhile so decided to play it.  It\'s a top down diablo

In [99]:
# LDA output for the first row of `wordcount` (i.e. the first game in `dftext`).
reps[0]

array([1.61812302e-04, 1.61812301e-04, 1.61812302e-04, 3.90664928e-01,
       1.61812302e-04, 1.61812302e-04, 1.61812302e-04, 1.61812302e-04,
       1.61812302e-04, 2.42927091e-02, 1.61812299e-04, 1.61812302e-04,
       1.54874787e-01, 1.61812301e-04, 1.18368151e-01, 1.61812302e-04,
       3.09372240e-01, 1.61812300e-04, 1.61812301e-04, 1.61812300e-04])

In [100]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted feature names for each topic of ``model``.

    For every row of ``model.components_`` this prints a ``Topic <index>:``
    header line followed by one line of space-separated feature names,
    ordered from largest weight to smallest.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight rows that each support ``argsort()``.
        feature_names: Indexable collection mapping a column index to its name.
        no_top_words: How many feature names to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending: reverse it, then keep the leading entries.
        ranked_columns = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[col] for col in ranked_columns]
        print(f"Topic {topic_idx}:")
        print(" ".join(top_terms))

# Show the 30 highest-weighted vocabulary terms of every fitted LDA topic.
no_top_words = 30
vocabulary_terms = counter.get_feature_names_out()
display_topics(LDA, vocabulary_terms, no_top_words)

Topic 0:
grind free play pay grinding items earn win spending quest shop quests unlock cash mobile item paying casual waste lose gold account click gain game free waste time rewards higher skills addictive download
Topic 1:
bugs devs update updates potential fix ing im community access early access multiplayer early simulator thats lag buggy waste alot buying reviews terrible unplayable year horrible fps worst tried state running
Topic 2:
expansion multiplayer previous released ai features campaign franchise mods release improved compared community fans available maps mod favorite added single player base opinion improvements dlc classic original game came ii online bugs
Topic 3:
voice acting voice acting weapons enemy weapon terrible plot fight worst storyline problems missions worse bugs poor mission mouse open awful clunky fps cutscenes shooter running gun came cheap special finish
Topic 4:
cover gun guns shooting shooter single player person shooter exciting bullet campaign mission

In [31]:
# NOTE(review): `mat` is not defined in any cell visible in this notebook, so
# this cell appears to rely on leftover kernel state — confirm where it is
# created, or remove the cell.
mat.shape

(174999, 14301)

In [11]:
# Number of rows in `df` per gameID, most frequent first.
df['gameID'].value_counts()

g10773791    1092
g05463839     943
g75228197     897
g40499587     794
g11862712     746
             ... 
g17749950       9
g13018310       9
g85968616       9
g61046982       9
g67585399       8
Name: gameID, Length: 2437, dtype: int64

In [15]:
# Per-game grouping of the `text` column. The original line ended in a dangling
# `.`, which is a SyntaxError; the unfinished attribute access is removed.
tmp = df.groupby('gameID')['text']
# Display the grouping object, matching the output recorded for this cell.
tmp


<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001D3EC06C730>