In [1]:
import pandas as pd

In [2]:
# Load the Steam reviews CSV into a DataFrame.
REVIEWS_CSV = 'steam_reviews_clean.csv'
df = pd.read_csv(REVIEWS_CSV)

# Preview the first five rows as a quick sanity check of the columns.
df.head(5)

Unnamed: 0.1,Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title
0,0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns
1,1,2019-02-10,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns
2,2,2019-02-07,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns
3,3,2018-06-14,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight
4,4,2017-06-20,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight


### Preprocessing using TF-IDF vectorization

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# TF-IDF vectorizer: drop English stop words, terms that occur in more than
# 95% of the reviews, and terms that occur in fewer than 2 reviews.
tf_vect = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

In [5]:
# Learn the vocabulary from the review text and build the TF-IDF
# document-term matrix in a single pass.
dtm = tf_vect.fit_transform(raw_documents=df['review'])

In [6]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
dtm

<433375x68953 sparse matrix of type '<class 'numpy.float64'>'
	with 6935435 stored elements in Compressed Sparse Row format>

### Creating an instance of Non-negative Matrix Factorization (NMF) with 10 components

In [7]:
from sklearn.decomposition import NMF

In [8]:
# NMF model with 10 components (topics); the fixed random_state keeps the
# factorization reproducible across runs.
nmf = NMF(
    random_state=42,
    n_components=10,
)

In [9]:
# Fit the NMF model on the TF-IDF document-term matrix.
nmf.fit(dtm)

In [10]:
# Vocabulary size: number of terms the fitted vectorizer kept.
tf_vect.get_feature_names_out().shape[0]

68953

#### Print top 15 words for each topic

In [11]:
# Number of highest-weighted terms to list for each topic.
N_TOP_WORDS = 15

# The vocabulary lookup does not change between iterations, so fetch it once
# instead of calling get_feature_names_out() again for every printed word.
feature_names = tf_vect.get_feature_names_out()

for index, topic in enumerate(nmf.components_):
    # argsort() is ascending, so the last N indices are the heaviest terms;
    # the strongest word therefore appears last in the printed list.
    top_indices = topic.argsort()[-N_TOP_WORDS:]
    print(f'TOP {N_TOP_WORDS} WORDS FOR TOPIC #{index}')
    print([feature_names[i] for i in top_indices])
    print('\n')

TOP 15 WORDS FOR TOPIC #0
['recommend', 'better', 'money', 'playing', 'bad', 'people', 'don', 'time', 'really', 'buy', 'amazing', 'just', 'like', 'play', 'game']


TOP 15 WORDS FOR TOPIC #1
['tho', 'realy', 'yeah', 'hey', 'verry', 'guess', 'stuff', 'yes', 'graphics', 'far', 'really', 'job', 'pretty', 'game', 'good']


TOP 15 WORDS FOR TOPIC #2
['funny', 'xd', 'fortnite', 'pubg', 'gud', 'ok', 'got', 'like', 'skyrim', 'cool', 'lt', 'gg', 'free', 'product', 'received']


TOP 15 WORDS FOR TOPIC #3
['games', 'solo', 'hours', 'addicting', 'addictive', 'especially', 'playing', 'lots', 'super', 'lot', 'pretty', 'really', 'play', 'friends', 'fun']


TOP 15 WORDS FOR TOPIC #4
['veri', 'realy', 'xd', 'job', 'gameplay', 'wow', 'pretty', 'funny', 'verry', 'graphic', 'lt', 'graphics', 'shot', 'game', 'nice']


TOP 15 WORDS FOR TOPIC #5
['kill', 'cars', 'die', 'hours', 'rock', 'killed', 'recommend', 'simulator', 'buy', 'got', 'naked', 'ign', 'play', '11', '10']


TOP 15 WORDS FOR TOPIC #6
['survival'

### Add new column to dataset, showing each review's topic number

In [12]:
# Project the document-term matrix onto the fitted NMF components; the result
# is later reduced row-wise (argmax over axis 1) to pick a topic per review.
topic_results = nmf.transform(dtm)

In [13]:
# Label every review with its dominant topic: the column index holding the
# largest weight in that review's row of topic_results. The argmax is
# computed once here; an earlier bare call whose result was discarded has
# been removed.
df['Topic'] = topic_results.argmax(axis=1)

# Preview the first ten rows with the new Topic column.
df.head(10)

Unnamed: 0.1,Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title,Topic
0,0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns,0
1,1,2019-02-10,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns,0
2,2,2019-02-07,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns,1
3,3,2018-06-14,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight,0
4,4,2017-06-20,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight,3
5,5,2016-12-12,4,55,2694,False,Recommended,ENGLISH After playing for more than two years ...,Dead by Daylight,5
6,6,2017-09-17,12,228,48,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight,0
7,7,2018-12-24,295,219,71,False,Recommended,I have never been told to kill myself more tha...,Dead by Daylight,0
8,8,2018-09-21,2,54,400,False,Recommended,Any longtime Dead by Daylight player knows tha...,Dead by Daylight,0
9,9,2018-12-05,380,271,414,False,Recommended,if you think cs go is toxic try this game,Dead by Daylight,0
