In [1]:
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('dataset_sentiment.csv')

In [4]:
df.head(10)

Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title,sentiment
0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns,Negative
1,2019-02-10,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns,Neutral
2,2019-02-07,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns,Positive
3,2018-06-14,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight,Positive
4,2017-06-20,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight,Negative
5,2016-12-12,4,55,2694,False,Recommended,ENGLISH After playing for more than two years ...,Dead by Daylight,Negative
6,2017-09-17,12,228,48,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight,Positive
7,2018-12-24,295,219,71,False,Recommended,I have never been told to kill myself more tha...,Dead by Daylight,Positive
8,2018-09-21,2,54,400,False,Recommended,Any longtime Dead by Daylight player knows tha...,Dead by Daylight,Positive
9,2018-12-05,380,271,414,False,Recommended,if you think cs go is toxic try this game,Dead by Daylight,Negative


In [5]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nieh_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nieh_\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
def clean(document):
    # Check if the document is not a string (e.g., NaN or float)
    if not isinstance(document, str):
        return ""

    stop_free = " ".join([word for word in document.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [7]:
documents = df['review'].tolist()

In [8]:
doc_clean = [clean(doc).split() for doc in documents] 

In [9]:
dictionary = corpora.Dictionary(doc_clean)

In [10]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [11]:
from gensim.models import LdaMulticore
from gensim.models import Phrases
from gensim.models.phrases import Phraser

In [12]:
bigram = Phrases(doc_clean, min_count=5, threshold=100)  # higher threshold fewer phrases
bigram_mod = Phraser(bigram)

In [13]:
doc_clean_bigrams = [bigram_mod[doc] for doc in doc_clean]


In [14]:
dictionary_bigrams = corpora.Dictionary(doc_clean_bigrams)
dictionary_bigrams.filter_extremes(no_below=15, no_above=0.5)
corpus_bigrams = [dictionary_bigrams.doc2bow(text) for text in doc_clean_bigrams]

In [15]:
ldamodel_bigrams = gensim.models.LdaMulticore(corpus_bigrams, 
                                              num_topics=3, 
                                              id2word=dictionary_bigrams, 
                                              passes=10,  # Reduced from a higher number
                                              iterations=50,  # Adjust based on your needs
                                              chunksize=2000)  # Tune this based on your dataset size


In [16]:
for topic in ldamodel_bigrams.print_topics(num_topics=5, num_words=10):
    print(topic)

(0, '0.010*"like" + 0.009*"time" + 0.009*"get" + 0.008*"fun" + 0.008*"play" + 0.007*"still" + 0.006*"even" + 0.006*"player" + 0.006*"one" + 0.006*"really"')
(1, '0.017*"get" + 0.012*"server" + 0.012*"like" + 0.011*"best" + 0.010*"play" + 0.009*"people" + 0.008*"time" + 0.008*"ever" + 0.008*"one" + 0.007*"rust"')
(2, '0.026*"good" + 0.023*"fun" + 0.018*"play" + 0.014*"great" + 0.012*"online" + 0.012*"buy" + 0.011*"free" + 0.011*"friend" + 0.011*"money" + 0.009*"really"')


In [17]:
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ProcessPoolExecutor


In [18]:
def get_dominant_topic(ldamodel, corpus_doc):
    row = sorted(ldamodel[corpus_doc], key=lambda x: (x[1]), reverse=True)
    # Get the Dominant topic, Perc Contribution and Keywords for each document
    topic_num, prop_topic = row[0]
    wp = ldamodel.show_topic(topic_num)
    topic_keywords = ", ".join([word for word, prop in wp])
    return int(topic_num), round(prop_topic, 4), topic_keywords

def format_topics_sentences(ldamodel, corpus):
    with ProcessPoolExecutor() as executor:
        topics_info = list(executor.map(lambda doc_bow: get_dominant_topic(ldamodel, doc_bow), corpus))

    return pd.DataFrame(topics_info, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

In [19]:
def format_topics_sentences(ldamodel, corpus):
    topics_info = []

    for doc_bow in corpus:
        row = sorted(ldamodel[doc_bow], key=lambda x: (x[1]), reverse=True)
        topic_num, prop_topic = row[0]
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        topics_info.append([int(topic_num), round(prop_topic, 4), topic_keywords])

    return pd.DataFrame(topics_info, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

In [22]:
# Assuming 'doc_clean_bigrams' is your preprocessed list of documents with bi-grams
corpus_bigrams = [dictionary_bigrams.doc2bow(doc) for doc in doc_clean_bigrams]

# Now use this updated corpus with your LDA model
lda_output = ldamodel_bigrams[corpus_bigrams]
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamodel_bigrams, corpus=corpus_bigrams)


In [23]:
df.reset_index(drop=True, inplace=True)
df_topic_sents_keywords.reset_index(drop=True, inplace=True)

In [24]:
df_combined = pd.concat([df, df_topic_sents_keywords], axis=1)
df_combined.head(10)

Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title,sentiment,Dominant_Topic,Perc_Contribution,Topic_Keywords
0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns,Negative,1,0.9439,"get, server, like, best, play, people, time, e..."
1,2019-02-10,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns,Neutral,0,0.6532,"like, time, get, fun, play, still, even, playe..."
2,2019-02-07,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns,Positive,0,0.5229,"like, time, get, fun, play, still, even, playe..."
3,2018-06-14,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight,Positive,1,0.5117,"get, server, like, best, play, people, time, e..."
4,2017-06-20,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight,Negative,0,0.7008,"like, time, get, fun, play, still, even, playe..."
5,2016-12-12,4,55,2694,False,Recommended,ENGLISH After playing for more than two years ...,Dead by Daylight,Negative,1,0.551,"get, server, like, best, play, people, time, e..."
6,2017-09-17,12,228,48,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight,Positive,1,0.5117,"get, server, like, best, play, people, time, e..."
7,2018-12-24,295,219,71,False,Recommended,I have never been told to kill myself more tha...,Dead by Daylight,Positive,1,0.8511,"get, server, like, best, play, people, time, e..."
8,2018-09-21,2,54,400,False,Recommended,Any longtime Dead by Daylight player knows tha...,Dead by Daylight,Positive,0,0.9435,"like, time, get, fun, play, still, even, playe..."
9,2018-12-05,380,271,414,False,Recommended,if you think cs go is toxic try this game,Dead by Daylight,Negative,1,0.8783,"get, server, like, best, play, people, time, e..."


In [26]:
df_combined.to_csv('topic.csv', index=False)