In [1]:
!pip install -r requirement.txt



In [2]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models
from gensim.utils import simple_preprocess
from nltk import WordNetLemmatizer
import gensim
from gensim.models import CoherenceModel
from gensim import corpora
import pyLDAvis.gensim_models
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oyuesan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/oyuesan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/oyuesan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Read the data from the file 'DH_CollectingData2022_review.tsv.txt'
df = pd.read_csv('DH_CollectingData2022_review.tsv', sep='\t', header=None, names=['sentence', 'sentiment'],
                 encoding='utf-8')
# Create a set of stop words in English
stop_words = set(stopwords.words('english'))
# Create a WordNetLemmatizer object to perform lemmatization
lemmatizer = WordNetLemmatizer()


# Preprocess function
def preprocess(text):
    # Convert to lowercase
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    # Tokenize text using simple_preprocess
    tokens = simple_preprocess(text, deacc=True)
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize words
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join tokens back into string
    return tokens

# Define a function named get_num that takes a row of data as input
def get_num(row):
    # Extract the sentence and sentiment values from the row
    sentence = row['sentence']
    number = row['sentiment']
    # If the sentiment value is not NaN, return it
    if pd.notna(row['sentiment']):
        return number
    # The bug fix for file.
    if sentence and sentence[-1].isdigit():
        # If the last character of the sentence is a digit, extract the number.
        if len(sentence) > 1 and sentence[-2] == '-':
            return int(sentence[-1])
        # Otherwise, the number is positive
        else:
            return -int(sentence[-1])
    # If the sentence doesn't end with a digit, return NaN
    else:
        return pd.NA

# Apply get_num to 'sentiment' column
df['sentiment'] = df.apply(get_num, axis=1)
# Apply preprocessing to 'sentence' column
df['clean_text'] = df['sentence'].apply(preprocess)

In [4]:
df.head()

Unnamed: 0,sentence,sentiment,clean_text
0,"For Nik, he only wants to silence the cacophon...",0.0,"[nik, want, silence, cacophony, sound, color, ..."
1,I can play this two ways\t0,0.0,"[play, two, way]"
2,"Mild, because it isn't conclusive, and doesn't...",-1.0,"[mild, isnt, conclusive, doesnt, give, u, info..."
3,You can also get some more information about t...,0.0,"[also, get, information, book, writing, exclus..."
4,"Soon, Hero, who has never had friends, is thru...",0.0,"[soon, hero, never, friend, thrust, school, qu..."


In [5]:
# Get all sentences from the 'clean_text' column of the DataFrame and create a list
all_sentences = df['clean_text'].tolist()
# Create a TF-IDF matrix using the TfidfVectorizer from scikit-learn
vectorizer = TfidfVectorizer(stop_words='english', max_features=20)
tfidf_matrix = vectorizer.fit_transform(df['sentence'])
# Extract the top 20 keywords from the TF-IDF matrix
# First, sort the feature weights in descending order
indices = np.argsort(vectorizer.idf_)[::-1]
# Get the feature names from the vectorizer
features = vectorizer.get_feature_names_out()
# Set the number of top features to extract
top_n = 20
# Create a list of the top n features
top_features = [features[i] for i in indices[:top_n]]
# Print the top features
print("Top 20 keywords: ", top_features)

Top 20 keywords:  ['way', 'little', 'mystery', 'did', 'interesting', 'world', 'reading', 'think', 'character', 'great', 'novel', 'love', 'really', 'like', 'good', 'characters', 'just', 'story', 'read', 'book']


In [6]:
# Create a new DataFrame containing only the rows where the 'sentiment' column equals 1 (positive sentiment)
positive_df = df[df['sentiment'] == 1]
# Create a new DataFrame containing only the rows where the 'sentiment' column equals -1 (negative sentiment)
negative_df = df[df['sentiment'] == -1]

In [7]:
# Get the 'clean_text' column from the positive and negative DataFrames
data_words_pos = positive_df['clean_text']
data_words_neg = negative_df['clean_text']
# Define the range of hyperparameters to test
num_topics_range = [3, 5, 7]
num_passes_range = [5, 10, 15]
chunksize_range = [3, 5, 7, 9]
# Define a function to calculate the coherence score for a given model
def compute_coherence(model, corpus, texts):
    coherence_model = CoherenceModel(model=model, corpus=corpus, texts=texts, coherence='c_v')
    return coherence_model.get_coherence()
# Initialize variables to store the best hyperparameters and resulting model
def find_best_hyperparameter(texts):
    best_coherence = 0
    best_num_topics = None
    best_num_passes = None
    best_chunksize = None
    best_model = None
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # Perform grid search
    for i, num_topics in enumerate(num_topics_range):
        for j, num_passes in enumerate(num_passes_range):
            for k, chunksize in enumerate(chunksize_range):
                # Fit the LDA model using the current hyperparameters
                lda_model = gensim.models.ldamodel.LdaModel(
                    corpus=corpus, id2word=dictionary, num_topics=num_topics,
                    random_state=42, update_every=1, chunksize=chunksize,
                    passes=num_passes, alpha='auto', per_word_topics=True
                )
                # Compute coherence score for the resulting model
                coherence = compute_coherence(lda_model, corpus, texts)
                # Update the best hyperparameters and resulting model if coherence score is higher
                if coherence > best_coherence:
                    best_coherence = coherence
                    best_num_topics = num_topics
                    best_num_passes = num_passes
                    best_chunksize = chunksize
                    best_model = lda_model
    return best_coherence, best_num_topics, best_num_passes, best_chunksize, best_model, corpus, dictionary



In [8]:
# Print the best hyperparameters and resulting model
best_coherence, best_num_topics, best_num_passes, best_chunksize, best_model, corpus, dictionary = find_best_hyperparameter(data_words_pos)
# Print the best hyperparameters and coherence score
print('Best coherence:', best_coherence)
print('Best num topics:', best_num_topics)
print('Best num passes:', best_num_passes)
print('Best chunksize:', best_chunksize)
# Print the top topics and their associated keywords for the resulting model
print(best_model.print_topics())

Best coherence: 0.6117331561233724
Best num topics: 5
Best num passes: 15
Best chunksize: 3
[(0, '0.066*"read" + 0.021*"voice" + 0.016*"see" + 0.015*"mystery" + 0.015*"fun" + 0.015*"liked" + 0.011*"great" + 0.011*"kept" + 0.010*"also" + 0.010*"story"'), (1, '0.022*"bit" + 0.021*"im" + 0.021*"new" + 0.020*"hill" + 0.019*"really" + 0.018*"take" + 0.017*"thriller" + 0.014*"little" + 0.014*"coming" + 0.012*"perfect"'), (2, '0.048*"character" + 0.025*"much" + 0.022*"took" + 0.021*"lot" + 0.018*"reader" + 0.015*"better" + 0.014*"think" + 0.012*"way" + 0.010*"get" + 0.010*"part"'), (3, '0.068*"book" + 0.056*"well" + 0.023*"interesting" + 0.020*"time" + 0.013*"crawford" + 0.011*"something" + 0.010*"worth" + 0.010*"evil" + 0.008*"he" + 0.008*"tale"'), (4, '0.027*"good" + 0.024*"novel" + 0.023*"star" + 0.021*"writer" + 0.012*"beautiful" + 0.012*"enjoy" + 0.010*"first" + 0.009*"life" + 0.008*"every" + 0.008*"make"')]


In [9]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(best_model, corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(


In [10]:
# Call the find_best_hyperparameter function with the 'data_words_neg' variable as input to get the best hyperparameters and resulting model
best_coherence, best_num_topics, best_num_passes, best_chunksize, best_model, corpus, dictionary = find_best_hyperparameter(data_words_neg)
# Print the best hyperparameters and coherence score
print('Best coherence:', best_coherence)
print('Best num topics:', best_num_topics)
print('Best num passes:', best_num_passes)
print('Best chunksize:', best_chunksize)
# Print the top topics and their associated keywords for the resulting model
print(best_model.print_topics())

Best coherence: 0.5279284985362422
Best num topics: 3
Best num passes: 5
Best chunksize: 5
[(0, '0.032*"little" + 0.025*"unfortunately" + 0.025*"cast" + 0.025*"justice" + 0.025*"killer" + 0.018*"felt" + 0.018*"like" + 0.014*"got" + 0.014*"time" + 0.013*"much"'), (1, '0.019*"feel" + 0.019*"get" + 0.016*"conclusion" + 0.016*"got" + 0.014*"character" + 0.014*"didnt" + 0.012*"even" + 0.012*"ethan" + 0.011*"interesting" + 0.010*"reading"'), (2, '0.040*"book" + 0.028*"first" + 0.025*"writing" + 0.023*"part" + 0.022*"sequel" + 0.020*"pevels" + 0.020*"revelation" + 0.020*"ignored" + 0.020*"flaw" + 0.020*"certain"')]


In [11]:
vis = pyLDAvis.gensim_models.prepare(best_model, corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(
