In [2]:
import pandas as pd

# Upload the dataset
df = pd.read_csv("D:/CBS/2024spring/NLP/final/data/archive3/steam_reviews.csv")
df.head()

Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title
0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns
1,2019-02-10,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns
2,2019-02-07,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns
3,2018-06-14,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight
4,2017-06-20,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight


In [3]:
print("The toal number of reviews in the dataset is:",len(df))

The toal number of reviews in the dataset is: 434891


In [4]:
# Check the distribution of reviews for different games in the dataset
df['title'].value_counts()

title
PLAYERUNKNOWN'S BATTLEGROUNDS                    145685
Grand Theft Auto V                                99956
Rust                                              71088
Rocket League®                                    67907
Dead by Daylight                                  22221
MONSTER HUNTER: WORLD                             18412
ASTRONEER                                          2661
The Elder Scrolls V: Skyrim Special Edition        1473
RESIDENT EVIL 2 / BIOHAZARD RE:2                   1385
Sid Meier’s Civilization® VI                        522
Euro Truck Simulator 2                              501
Slay the Spire                                      260
Terraria                                            260
Subnautica                                          247
Left 4 Dead 2                                       221
Insurgency: Sandstorm                               220
RimWorld                                            204
Garry's Mod                               

In [5]:
df.isna().sum()

date_posted                  0
funny                        0
helpful                      0
hour_played                  0
is_early_access_review       0
recommendation               0
review                    1516
title                        0
dtype: int64

In [9]:
df.dropna()

Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title
0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns
1,2019-02-10,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns
2,2019-02-07,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns
3,2018-06-14,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight
4,2017-06-20,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight
...,...,...,...,...,...,...,...,...
434886,2018-11-17,1,37,10,False,Recommended,YOUR FLESH WILL ROT AND DECAY.STEEL IS IMMORTA...,"Warhammer 40,000: Mechanicus"
434887,2018-11-17,3,41,38,False,Recommended,Domini and Dominae I believe what we are deali...,"Warhammer 40,000: Mechanicus"
434888,2018-11-20,0,0,36,False,Recommended,First off if you like X Com style of games you...,"Warhammer 40,000: Mechanicus"
434889,2018-11-18,1,44,12,False,Recommended,As a disclaimer I'm an AdMech player on the ta...,"Warhammer 40,000: Mechanicus"


In [10]:
# After reviewing the distribution of the dataset, we only choose top 3 games to conduct topic modeling for each of them
top_3_list = df['title'].value_counts().head(3).index.to_list()
print("The top 3 games in dataset are: ",top_3_list)

df_top_review = df[df['title'].isin(top_3_list)][['title','review']]
df_top_review.reset_index(drop=True, inplace=True)
df_top_review['index']=df_top_review.index
documents = df_top_review



The top 3 games in dataset are:  ["PLAYERUNKNOWN'S BATTLEGROUNDS", 'Grand Theft Auto V', 'Rust']


In [11]:
len(documents)

316729

In [12]:
documents.head()

Unnamed: 0,title,review,index
0,Grand Theft Auto V,I love the part in Online where you get banned...,0
1,Grand Theft Auto V,best Rockstar optimization so farI couldn't st...,1
2,Grand Theft Auto V,After having bought this game on PS3 PS4 and P...,2
3,Grand Theft Auto V,Pros SP ModdingCons Take Two Interactive.,3
4,Grand Theft Auto V,Everything in the game now either flies shoots...,4


# Data preprocessing

We will perform the following steps:

Apply simple_process from gensim for preprocessing, include:

(1) Tokenization: Split the text into sentences and the sentences into words.

(2) Lowercase the words and remove punctuation.

We also remove all stopwords and Words that have fewer than 3 characters.

The remaining words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.

In [13]:
!pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
import numpy as np
np.random.seed(2018)

import nltk
nltk.download('wordnet')



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bing9\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
# Define a function for preprocessing documents
lemm = WordNetLemmatizer()

def preprocess_text(text):
    result=[]
    for token in simple_preprocess(text): # tokenize and lowercase text
        if token not in STOPWORDS:
            result.append(lemm.lemmatize(token))
    return result

In [15]:
# Convert value in review column to string type for further processing
documents['review']=documents['review'].astype(str)

In [16]:
# Apply preprocess to entire documents as preparation for creating dictionary 
documents['review'] = documents['review'].map(preprocess_text)
processed_doc=documents['review'].copy()

In [17]:
documents.head(5)

Unnamed: 0,title,review,index
0,Grand Theft Auto V,"[love, online, banned, playing, day]",0
1,Grand Theft Auto V,"[best, rockstar, optimization, fari, couldn, s...",1
2,Grand Theft Auto V,"[having, bought, game, p, p, pc, outline, goin...",2
3,Grand Theft Auto V,"[pro, sp, moddingcons, interactive]",3
4,Grand Theft Auto V,"[game, fly, shoot, rocket, gtao, play, story, ...",4


In [41]:
documents[documents.title == "Rust"]

Unnamed: 0,title,review,index
245641,Rust,"[ve, owned, rust, maybe, year, played, enjoyed...",245641
245642,Rust,"[begin, review, yeah, know, rust, laggy, piece...",245642
245643,Rust,"[think, game, fun, yesis, game, yeeis, game, w...",245643
245644,Rust,"[thing, know, rust, zergs, aka, big, group, st...",245644
245645,Rust,"[beat, kid, screaming, rock, minute, gameplay,...",245645
...,...,...,...
316724,Rust,"[decent, game, people, game, great, time, wast...",316724
316725,Rust,"[limited, time, game, ve, alot, fun, friend, g...",316725
316726,Rust,"[love, game, um]",316726
316727,Rust,"[game, worth, dabbing, dab]",316727


# Running LDA using Bag of Words on top 3 games separately

For each of top 3 games:

(1) Create dictionary from processed review to map each unique word to an unique index

The apply Gensim filter_extremes to:

Filter out tokens that appear in less than 10 documents or more than 50% (0.5) of documents.

(1) Apply Gensim doc2bow

For each document we create a dictionary reporting how many words and how many times those words appear

(2) Train LDA model with processed data using doc2bow

In [19]:
# Running LDA using Bag of Words on top 3 games separately
# Start timer
import time
start_time = time.time()

for i,game in enumerate(top_3_list):

    i += 1
    
    # Extract preprocessed reviews from dataset for the chosen game
    reviews = documents[documents['title']==game]['review']

    # Create a dictionary of unique word
    dictionary = gensim.corpora.Dictionary(reviews)

    # Filter out too low or too high frequency words
    dictionary.filter_extremes(no_below=10,no_above=0.5)

    # Apply doc2bow for each review to create a dictionary showing how many words and the frequency.
    prefix ="bow_corpus_"
    index = i
    corpus_name = f"{prefix}{index}"
    bow_corpus = [dictionary.doc2bow(doc) for doc in reviews]
    globals()[corpus_name] = bow_corpus
    

    # Apply LDA on the result of doc2bow
    prefix_m="lda_model_"
    index = i
    model_name = f"{prefix_m}{index}"
    lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=10, workers=4)
    globals()[model_name]=lda_model
    
    # Save the LDA model
    lda_model.save(f"{model_name}.gensim")

    # Save the dictionary
    dictionary.save_as_text(f"{model_name}_dict.txt")
        
    # print the game and its corresponding model name
    print(f"\nGame {game} has model {model_name}\n")
          
          
# End timer
end_time = time.time()
duration = end_time - start_time
print(f"\nTime taken: {duration} seconds")     
        


Game PLAYERUNKNOWN'S BATTLEGROUNDS has model lda_model_1


Game Grand Theft Auto V has model lda_model_2


Game Rust has model lda_model_3


Time taken: 193.37435460090637 seconds


In [42]:
# Define the function to print model output in dataframe format
def print_topic(model,corpus):
    # Get the document-topic distribution of the model
    doc_topics = [model.get_document_topics(bow) for bow in corpus]

    # Count the number of documents related to each topic
    topic_doc_count = {}
    for doc in doc_topics:
        for topic_id, prob in doc:
            if topic_id not in topic_doc_count:
                topic_doc_count[topic_id] = 0
            topic_doc_count[topic_id] += 1

    # Get the word distributions and probabilities for each topic
    topics = model.show_topics(num_topics=-1, num_words=10, formatted=False)

    # Prepare the data for the dataframe
    data = []
    for topic_id, words in topics:
        words_representation = [word for word, prob in words]
        probabilities = [f"{prob:.4f}" for word, prob in words]
        doc_count = topic_doc_count.get(topic_id, 0)
        data.append([topic_id, doc_count, words_representation, probabilities])

    # Create the dataframe
    df = pd.DataFrame(data, columns=['Topic Name', 'Count of Documents', 'Representation', 'Probability'])

    # Sort the dataframe by the number of documents related to each topic in descending order
    df = df.sort_values(by='Count of Documents', ascending=False).reset_index(drop=True)

    return df

In [43]:
print(f"\nThe topic modeling result of PLAYERUNKNOWN'S BATTLEGROUNDS using LDA model with bag of words is:\n")
result1 = print_topic(lda_model_1,bow_corpus_1)
result1


The topic modeling result of PLAYERUNKNOWN'S BATTLEGROUNDS using LDA model with bag of words is:



Unnamed: 0,Topic Name,Count of Documents,Representation,Probability
0,15,90203,"[fun, friend, lot, play, great, bug, time, bug...","[0.1437, 0.0365, 0.0354, 0.0336, 0.0285, 0.028..."
1,0,89301,"[early, access, review, issue, ve, like, relea...","[0.0365, 0.0301, 0.0233, 0.0176, 0.0131, 0.011..."
2,9,88055,"[fix, lag, hacker, bug, server, issue, update,...","[0.0909, 0.0905, 0.0735, 0.0466, 0.0385, 0.028..."
3,2,87723,"[play, fun, friend, playing, time, people, squ...","[0.0529, 0.0358, 0.0295, 0.0217, 0.0211, 0.019..."
4,6,87171,"[play, got, like, hour, im, pubg, playing, kno...","[0.0399, 0.0290, 0.0248, 0.0191, 0.0189, 0.018..."
5,11,86850,"[gun, shot, like, yes, weapon, player, time, m...","[0.0141, 0.0130, 0.0128, 0.0120, 0.0117, 0.011..."
6,19,86480,"[money, crate, devs, developer, buy, key, pay,...","[0.0315, 0.0217, 0.0142, 0.0134, 0.0120, 0.010..."
7,7,84864,"[pubg, like, battle, better, royale, fortnite,...","[0.0501, 0.0216, 0.0194, 0.0157, 0.0140, 0.013..."
8,18,84417,"[good, great, pretty, simulator, need, job, gr...","[0.5241, 0.1036, 0.0438, 0.0209, 0.0139, 0.010..."
9,3,84198,"[buy, dont, bad, money, worth, garbage, waste,...","[0.1142, 0.0823, 0.0643, 0.0605, 0.0337, 0.026..."


In [22]:
from sklearn.metrics import jaccard_score

def jaccard_similarity(set1, set2):
    return len(set1 & set2) / len(set1 | set2)

def calculate_topic_diversity(topics):
    num_topics = len(topics)
    similarities = []

    for i in range(num_topics):
        for j in range(i + 1, num_topics):
            sim = jaccard_similarity(set(topics[i]), set(topics[j]))
            similarities.append(sim)

    avg_similarity = np.mean(similarities)
    diversity_score = 1 - avg_similarity
    return diversity_score

In [44]:
# Show the topic diversity (using Jaccard Similarity score)
topics1 = result1['Representation'].tolist()
diversity_score1 = calculate_topic_diversity(topics1)
print(f"Topic Diversity Score 1: {diversity_score1}")

Topic Diversity Score 1: 0.9712277985986638


In [46]:
print(f"\nThe topic modeling result of Grand Theft Auto V using LDA model with bag of words is:\n")
result2 = print_topic(lda_model_2,bow_corpus_2)
result2


The topic modeling result of Grand Theft Auto V using LDA model with bag of words is:



Unnamed: 0,Topic Name,Count of Documents,Representation,Probability
0,17,60164,"[pc, time, hour, played, year, bought, gta, p,...","[0.0488, 0.0339, 0.0308, 0.0302, 0.0277, 0.027..."
1,6,59861,"[modding, mod, community, rockstar, gta, openi...","[0.0762, 0.0356, 0.0316, 0.0243, 0.0237, 0.020..."
2,7,59619,"[fun, online, play, friend, free, story, produ...","[0.1160, 0.0603, 0.0573, 0.0540, 0.0540, 0.049..."
3,10,59252,"[money, buy, card, shark, grind, new, online, ...","[0.0655, 0.0301, 0.0274, 0.0248, 0.0179, 0.016..."
4,18,59238,"[player, single, multiplayer, online, worth, s...","[0.1368, 0.1198, 0.0736, 0.0562, 0.0340, 0.027..."
5,4,58152,"[hacker, online, modders, rockstar, people, mo...","[0.0663, 0.0477, 0.0333, 0.0307, 0.0222, 0.021..."
6,2,57854,"[gta, character, car, like, story, mission, wo...","[0.0209, 0.0157, 0.0140, 0.0140, 0.0119, 0.010..."
7,8,57475,"[online, friend, time, mission, fun, play, hei...","[0.0358, 0.0210, 0.0195, 0.0191, 0.0184, 0.018..."
8,14,57160,"[review, buy, ing, rockstar, negative, know, d...","[0.0784, 0.0564, 0.0502, 0.0390, 0.0252, 0.024..."
9,9,56270,"[im, suck, pay, yeah, like, guess, dont, stupi...","[0.0284, 0.0283, 0.0214, 0.0190, 0.0184, 0.018..."


In [47]:
# Show the topic diversity (using Jaccard Similarity score)
topics2 = result2['Representation'].tolist()
diversity_score2 = calculate_topic_diversity(topics2)
print(f"Topic Diversity Score 2: {diversity_score2}")

Topic Diversity Score 2: 0.9598165951514493


In [48]:
print(f"\nThe topic modeling result of RUST using LDA model with bag of words is:\n")
result3 = print_topic(lda_model_3,bow_corpus_3)
result3


The topic modeling result of RUST using LDA model with bag of words is:



Unnamed: 0,Topic Name,Count of Documents,Representation,Probability
0,14,41172,"[fun, friend, lot, play, time, people, playing...","[0.1376, 0.0473, 0.0430, 0.0339, 0.0320, 0.026..."
1,1,39806,"[great, play, friend, nice, fun, community, go...","[0.1497, 0.0962, 0.0860, 0.0562, 0.0495, 0.033..."
2,16,39363,"[best, survival, played, amazing, awesome, ve,...","[0.1644, 0.1057, 0.0663, 0.0558, 0.0456, 0.023..."
3,10,39254,"[buy, ing, time, money, play, worth, dont, loa...","[0.0491, 0.0373, 0.0356, 0.0341, 0.0302, 0.023..."
4,3,38953,"[rust, hour, time, play, playing, ve, life, pl...","[0.0325, 0.0294, 0.0231, 0.0207, 0.0200, 0.019..."
5,11,38606,"[player, pvp, survival, rust, server, building...","[0.0351, 0.0260, 0.0231, 0.0217, 0.0191, 0.016..."
6,9,38506,"[alpha, early, access, like, wait, potential, ...","[0.0537, 0.0335, 0.0237, 0.0161, 0.0129, 0.011..."
7,15,38124,"[rust, new, update, old, legacy, year, version...","[0.0910, 0.0517, 0.0283, 0.0281, 0.0269, 0.021..."
8,7,38082,"[player, time, people, like, server, thing, ru...","[0.0169, 0.0136, 0.0095, 0.0082, 0.0076, 0.007..."
9,5,38032,"[server, hacker, play, people, player, communi...","[0.1163, 0.0340, 0.0221, 0.0217, 0.0213, 0.016..."


In [50]:
# Show the topic diversity (using Jaccard Similarity score)
topics3 = result3['Representation'].tolist()
diversity_score3 = calculate_topic_diversity(topics3)
print(f"Topic Diversity Score 3: {diversity_score3}")

Topic Diversity Score 3: 0.9573554629608075
