In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
import nltk.tokenize as tk
from nltk.tokenize import word_tokenize
import string
from nltk.stem import PorterStemmer
# Instantiate the stemmer: `stem` is an instance method, so binding the bare
# class here would make any later `stemmer.stem(token)` call raise TypeError.
stemmer = PorterStemmer()
# Read the CSV file
df = pd.read_csv('C:\\Users\\Zude Ang\\VSC\\IND_TopicModellingOV\\Geopolitics.csv')

# NLTK's English stop-word list, extended with extra words to drop during cleaning.
stop_words = stopwords.words('english')
new_stop = [
    'know', 'think', 'like', 'also', 'really', 'well', 'would', 'right', 'us',
    'one', 'actually', 'people', 'city', 'year', 'question', 'yeah', 'going',
    'lot', 'choose', 'feel', 'perhaps', 'gon', 'say',
]
stop_words.extend(new_stop)
# Create the output column up front; it is filled further down by applying
# cleaned_corpus to the 'Text' column.
df['cleaned_corpus'] = ''

def cleaned_corpus(corpus):
    """Tokenize *corpus* and return the kept tokens as one space-joined string.

    Each token is lowercased and then dropped if it is in the module-level
    ``stop_words`` list, is purely numeric, or contains any character that is
    not alphanumeric.

    Args:
        corpus: Raw text to clean. Non-string values (for example the NaN
            that pandas produces for an empty CSV cell) are treated as
            empty text instead of being passed to the tokenizer.

    Returns:
        The surviving lowercase tokens joined by single spaces, or ``''``
        when nothing survives.
    """
    if not isinstance(corpus, str):
        return ''

    kept = []
    for token in word_tokenize(corpus):
        word = token.lower()
        if word in stop_words or word.isnumeric():
            continue
        # isalnum() is False for every punctuation-only token, so a separate
        # string.punctuation membership check would be redundant.
        if word.isalnum():
            kept.append(word)
    # Stemming is currently disabled:
    # kept = [stemmer.stem(word) for word in kept]

    return ' '.join(kept)

df['cleaned_corpus'] = df['Text'].apply(cleaned_corpus)

# Group consecutive rows into segments. A row whose 'Segment' value is
# 'Question' starts a new segment and supplies its speaker; every other row
# is appended to the segment currently being built. Rows that come before
# the first 'Question' are collected under an empty speaker.
segments = []
current_segment = {'Speaker': '', 'cleaned_corpus': ''}

# Iterate the three needed columns in lockstep instead of materialising a
# Series per row with iterrows().
for segment, speaker, text in zip(df['Segment'], df['Speaker'], df['cleaned_corpus']):
    if segment == 'Question':
        # Close the previous segment, skipping it when it has no text
        if current_segment['cleaned_corpus']:
            segments.append(current_segment)

        # Start a new segment
        current_segment = {'Speaker': speaker, 'cleaned_corpus': text}
    elif text:
        # Only insert a separator between two non-empty pieces, so a segment
        # never gains a leading space or becomes whitespace-only (which would
        # slip past the emptiness checks).
        existing = current_segment['cleaned_corpus']
        current_segment['cleaned_corpus'] = f'{existing} {text}' if existing else text

# Add the last segment to the list
if current_segment['cleaned_corpus']:
    segments.append(current_segment)

# Vectorize the segment texts
# NOTE(review): scikit-learn's LDA documentation and examples feed raw term
# counts (CountVectorizer); TF-IDF weights are used here -- confirm intended.
corpus = [segment['cleaned_corpus'] for segment in segments]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Apply LDA model
num_topics = 5  # Define the number of topics
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X)

# Get the topic distributions for each segment (one row per segment)
segment_topics = lda.transform(X)

# The vocabulary and each topic's top words do not depend on the segment, so
# compute them once here rather than on every loop iteration.
feature_names = vectorizer.get_feature_names_out()
# argsort() is ascending, so each list holds the five highest-weighted words
# with the strongest word last.
top_words_by_topic = [
    [feature_names[idx] for idx in topic_words.argsort()[-5:]]
    for topic_words in lda.components_
]

for i, segment in enumerate(segments):
    topic_dist = segment_topics[i]
    dominant_topic_idx = topic_dist.argmax()
    dominant_topic_prob = topic_dist[dominant_topic_idx]
    top_words = top_words_by_topic[dominant_topic_idx]

    print(f"Segment {i+1}:")
    print(f"Speaker: {segment['Speaker']}")
    print(f"Text: {segment['cleaned_corpus']}")
    print(f"Dominant Topic: {dominant_topic_idx}")
    print(f"Dominant Topic Probability: {dominant_topic_prob:.4f}")
    print("Top Words in Dominant Topic:")
    print(top_words)
    print()


Segment 1:
Speaker: 
Text:  wtf series future future aims dive deep hot topics youth may scratching heads basically asked tough questions guys thank joining another episode wtf future future podcast series national youth councils asia ready exposure program partnership singapore institute international affairs today na unpacking something understand much thankfully experts um give topic global geopolitics define basically politics countries placed ah saw wikipedia research thing went wikipedia guessed yes pretty much broad sense breaking currently happening world terms global geopolitics covering bit china rivalry grew still happening yep episode na speak experts geopolitics obviously na discuss economic impacts region expect moving forward okay joining studio today got assistant professor ntu public policy global affairs dillon low channel news asia executive producer miss pearl force okay first obviously
Dominant Topic: 4
Dominant Topic Probability: 0.9201
Top Words in Dominant Topic