In [47]:
import pandas as pd
import numpy as np

# Data pre-processing
import nltk
# List of stop words used for data pre-processing
from stop_words_list import stop_words_list
from nltk.corpus import stopwords

# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Topics modelling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [48]:
# Creator whose extracted comments are analysed in this notebook.
# Names noted as available (original "listOfCreator" note):
#   "MKBHD", "Jonathan Morrison", "Unbox Therapy", "Chris Stuckman", "Jeremy Jahns",
#   "Channel Awesome", "James Charles", "NikkieTutorials", "sophdoeslife"
creator = "MKBHD"

# The spreadsheet file name is derived from the creator name.
comments_path = f"./comments_spreadsheets/extracted_comments_{creator}.xlsx"
output_df = pd.read_excel(comments_path)

In [None]:
# Clean the raw comments: drop empty rows, de-duplicate, flatten line breaks.
#
# Rows with a missing comment (NaN) and non-string cells (e.g. a purely numeric
# comment parsed as a number) would make the later sentiment scoring and `.str`
# based pre-processing fail, so missing comments are dropped and the remaining
# values are coerced to text before de-duplicating.
comments_df = output_df.dropna(subset=["Comments"]).copy()
comments_df["Comments"] = comments_df["Comments"].astype(str)

# Every repeat of a comment after its first occurrence
duplicates = comments_df[comments_df.duplicated("Comments")]
print("Count of duplicate comments in dataframe", duplicates.shape[0])
print("Count of unique comments in dataframe", comments_df.shape[0] - duplicates.shape[0])

# Remove duplicated comments from dataset (first occurrence is kept)
unique_df = comments_df.drop_duplicates(subset=["Comments"], keep="first")

# reset_index() gives a fresh 0..n-1 index; the previous row labels are kept
# in an "index" column.
df = unique_df.reset_index()

# Replace line breaks with a space in every string cell
df = df.replace(r'\n', ' ', regex=True)

display(df)

# Sentiment Analysis

In [None]:
# Score every comment with VADER and label it from the "compound" score:
#   compound >= 0.05          -> 'Positive'
#   -0.05 < compound < 0.05   -> 'Neutral'
#   compound <= -0.05         -> 'Negative'
sentimentAnalyser = SentimentIntensityAnalyzer()


def _sentiment_label(compound):
    """Return the sentiment label for a compound score using the +/-0.05 cut-offs."""
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'


# One compound score per comment, in row order
sentimentScoreList = [
    sentimentAnalyser.polarity_scores(comment)['compound']
    for comment in df["Comments"].values.tolist()
]
sentimentLabelList = [_sentiment_label(score) for score in sentimentScoreList]

df["Sentiment"] = sentimentLabelList
df["Sentiment Score"] = sentimentScoreList

display(df)

# Data Pre-Processing for LDA Model
1. Lowercase the text
2. Remove punctuation
3. Remove numbers
4. Remove extra whitespace
5. Tokenise
6. Remove stop words

In [None]:
# Lemmatisation is currently disabled; the sketch below is kept for reference.
# def lemmatize_text(text):
#     lemmatizer = WordNetLemmatizer()
#     return [lemmatizer.lemmatize(w) for w in text]

# Text clean-up pipeline. Each step is stored in its own column so the
# intermediate results can be inspected.
# NOTE(review): nltk.word_tokenize and stopwords.words presumably need the
# corresponding NLTK data packages to be installed - confirm on a fresh setup.

# Lowercase all the letters
df['lower'] = df['Comments'].str.lower()

# Remove punctuation: apostrophes are deleted (so "don't" -> "dont"), every
# other non-word, non-space character becomes a space.
# Raw strings are used for the patterns so "\w", "\s" and "\d" are not treated
# as (invalid) Python string escapes.
df['punctuation_removed'] = (
    df['lower']
    .str.replace("'", '', regex=False)
    .str.replace(r'[^\w\s]', ' ', regex=True)
)

# Remove numbers: a run of digits that follows whitespace OR starts the
# comment (the start-of-string case was previously missed).
df['numbers_removed'] = df['punctuation_removed'].str.replace(r'(?:^|\s)\d+', ' ', regex=True)

# Remove extra whitespace (collapse runs of spaces, trim both ends)
df['extra_spaces_removed'] = df['numbers_removed'].str.replace(r' +', ' ', regex=True).str.strip()

# Tokenise
df['tokenised'] = df['extra_spaces_removed'].apply(nltk.word_tokenize)

# Stop words removal
# initiate stopwords from nltk
stop_words = stopwords.words('english')
# add additional missing terms
stop_words.extend(stop_words_list)
# a set gives constant-time membership tests while filtering every token
# (assumes the stop-word entries are strings)
stop_words_lookup = set(stop_words)
# remove stopwords
df['removed_stopwords'] = df['tokenised'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words_lookup]
)

display(df.head(10))

# Vectorisation for LDA model

In [None]:
# Count vectorizer restricted to bigrams (ngram_range=(2, 2)) of word tokens
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))

# Re-assemble each comment's remaining tokens into one ", "-separated string,
# which is the input format passed to the vectorizer.
vectors = [", ".join(tokens) for tokens in df["removed_stopwords"]]

# Learn the vocabulary and build the document-term matrix in one step
vectorised = vectorizer.fit_transform(vectors)

print(vectorised)

# Topic Modelling Using LDA

In [None]:
# Fit the LDA topic model and attach each comment's topic probabilities and
# dominant topic to `df`.

# Initialise LDA Model
lda_model = LatentDirichletAllocation(
    n_components=3,      # number of topics
    random_state=1,      # fixed seed for reproducibility (used to be 10)
    evaluate_every=-1,   # compute perplexity every n iters, default: don't
    n_jobs=-1,           # use all available CPUs
)

# One row per comment, one column per topic: probability of belonging to it
lda_output = lda_model.fit_transform(vectorised)

# Column names: "Topic1" .. "TopicN"
topic_names = ["Topic" + str(i) for i in range(1, lda_model.n_components + 1)]

# Probabilities rounded to 2 decimals for display
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names)

# Dominant topic (1-based) for each document. The argmax is taken on the
# unrounded probabilities: after rounding, two topics can tie and argmax would
# then silently pick the first one.
dominant_topic = np.argmax(lda_output, axis=1) + 1
df_document_topic['Dominant_topic'] = dominant_topic

# Join to the original dataframe on the row index. Topic columns left over
# from a previous run of this cell are dropped first so that re-running it
# does not produce "_x"/"_y" suffixed duplicates.
stale_columns = [col for col in df_document_topic.columns if col in df.columns]
df = pd.merge(
    df.drop(columns=stale_columns),
    df_document_topic,
    left_index=True,
    right_index=True,
    how='outer',
)
display(df.head(10))

In [None]:
# Build, for every vocabulary term, the topic it weighs most in (`tmp`).

# index names: "Doc0" .. "Doc{n-1}"
docnames = ['Doc' + str(i) for i in range(len(df))]

# Document-topic table labelled by document name (probabilities rounded for display)
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=docnames)

# Dominant topic for each document, as a 0-based position here (the
# 'Dominant_topic' column stored in `df` is 1-based). Computed from the
# unrounded probabilities so rounding ties cannot change the winner.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Topic-Keyword Matrix: one row per topic, one column per vocabulary term
df_topic_keywords = pd.DataFrame(lda_model.components_)

# Assign Column and Index.
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
df_topic_keywords.columns = feature_names
df_topic_keywords.index = topic_names

# For each term: the topic with the highest weight, and that weight
df_topic_no = pd.DataFrame(df_topic_keywords.idxmax())
df_scores = pd.DataFrame(df_topic_keywords.max())

tmp = pd.merge(df_topic_no, df_scores, left_index=True, right_index=True)
tmp.columns = ['topic', 'relevance_score']

# "Topic3" -> 3 (raw string so "\d" is not read as a Python string escape)
tmp["topic"] = tmp["topic"].str.extract(r'(\d+)', expand=False).astype(int)

display(tmp)

In [None]:
# For every topic present in `tmp`, keep the single term with the highest
# relevance score and use it as that topic's name.
topic_rows = []

for topic_id in tmp['topic'].unique():
    # Terms assigned to this topic, best score first; reset_index() exposes
    # the term (the index of `tmp`) as an "index" column.
    ranked_terms = (
        tmp.loc[tmp['topic'] == topic_id]
        .reset_index()
        .sort_values('relevance_score', ascending=False)
    )
    best_term = ranked_terms.head(1)

    topic_rows.append([
        best_term['topic'].unique()[0],
        list(best_term['index'].unique()),
    ])

all_topics = pd.DataFrame(topic_rows, columns=['Dominant_topic', 'topic_name'])
display(all_topics)

In [None]:
# Count comments per (dominant topic, sentiment) pair and attach the topic names.
# After .count() every remaining column holds the number of non-null values in
# the group; the 'Sentiment Score' column is used as the count below.
results = (
    df.groupby(['Dominant_topic', 'Sentiment'])
    .count()
    .reset_index()
    .merge(all_topics, on='Dominant_topic')
)

# topic_name holds a list of terms; flatten it to a single string
results['topic_name'] = results['topic_name'].map(', '.join)

# Wide table for plotting: one row per topic, one column per sentiment label
graph_results = results[['topic_name', 'Sentiment', 'Sentiment Score']].pivot(
    index='topic_name',
    columns='Sentiment',
    values='Sentiment Score',
)
display(graph_results)

In [None]:
# Grouped bar chart: one group per topic, one bar per sentiment label.
# Note: DataFrame.plot.bar returns a matplotlib Axes; the name `fig` is kept so
# the save line below keeps working unchanged.
fig = graph_results.plot.bar(rot=90, figsize=(10, 10))

# Label the chart so it can be read on its own
fig.set_xlabel("Topic (top bigram)")
fig.set_ylabel("Number of comments")
fig.set_title(f"Comment sentiment per LDA topic - {creator}");

# Uncomment to save the figure as a png to current directory
# fig.figure.savefig(f'{creator}_absa_lda.png' , bbox_inches='tight')