# Topic Modelling

In [2]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Review dataset (WScraped.csv), read straight from the GitHub repository.
file_path = r"https://raw.githubusercontent.com/ajayt0m/Text_Classification_Alleppey/main/WScraped.csv"

# Decode the file as latin1 instead of pandas' default UTF-8.
df = pd.read_csv(filepath_or_buffer=file_path, encoding="latin1")

In [4]:
# Display the loaded frame to check its columns (Id, Title, Review, Label).
df

Unnamed: 0,Id,Title,Review,Label
0,1,Most overrated destination in kerala,I had a high expectations seeing this place in...,2
1,2,Venice of the East,Popularly known as'Venice of the East' Alleppe...,3
2,3,Alleppey is most polluted city,"Hai friends, today I am telling you my bad exp...",1
3,4,Alleppey,Alleppey is one the best place to visit in Ker...,4
4,5,ABOUT EXPENSIVE ALLEPPEY TRIP,Alleppey one of the famous district in kerala....,1
...,...,...,...,...
95,96,,Beach was good and clean. We went early in the...,5
96,97,,"Badly kept beach, authorities have no concern ...",1
97,98,,"Nice beach, best to visit early morning. Most ...",3
98,99,,Alappuzha Beach is known for its picturesque b...,4


In [28]:
# Show the full text of the review stored at index label 0.
df.loc[0, 'Review']

"I had a high expectations seeing this place in pictures, when I actually visited the place and roam in house boat through water canal I realised it's not worth my money or time. Canal were dirty like drainage in the city.\n\nDefinitely avoidable."

In [29]:
# Number of rows (reviews) in the loaded frame.
len(df)

100

## Pre-processing

In [30]:
# Bag-of-words vectorizer for the review text:
#   - max_df=0.95 drops terms found in more than 95% of the reviews,
#   - min_df=2 drops terms found in fewer than 2 reviews,
#   - scikit-learn's built-in English stop-word list is removed as well.
cv = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary and build the document-term count matrix in one pass.
dtm = cv.fit_transform(df['Review'])

In [31]:
# Inspect the sparse document-term matrix (rows = reviews, columns = vocabulary terms).
dtm

<100x1163 sparse matrix of type '<class 'numpy.int64'>'
	with 6157 stored elements in Compressed Sparse Row format>

## LDA

In [32]:
# Number of latent topics the model should extract.
num_topics = 5

# A fixed random_state keeps the fitted topics reproducible across runs.
lda = LatentDirichletAllocation(
    n_components=num_topics,
    random_state=42,
)

In [33]:
# Fit the LDA model on the document-term matrix built from the reviews.
lda.fit(dtm)

In [34]:
# List the highest-weighted vocabulary terms of every fitted topic.
num_top_words = 5  # how many terms to show per topic
feature_names = cv.get_feature_names_out()

for topic_idx, topic in enumerate(lda.components_):
    # argsort is ascending: reverse it, then keep the leading num_top_words.
    top_words_idx = topic.argsort()[::-1][:num_top_words]
    top_words = list(feature_names[top_words_idx])
    print(f"Topic #{topic_idx + 1}: {', '.join(top_words)}")

Topic #1: kerala, alleppey, alapuzha, boat, water
Topic #2: boat, kerala, trip, beach, people
Topic #3: place, alleppey, beach, kerala, visit
Topic #4: boat, water, alleppey, kerala, place
Topic #5: boat, house, houseboat, night, day


In [35]:
# Per-review topic weights from the fitted model (one row per review).
doc_topic_dist = lda.transform(dtm)

# Tag each review with its highest-weighted topic, numbered from 1 like the
# "Topic #N" printout.
df['Topic'] = doc_topic_dist.argmax(axis=1) + 1

In [36]:
# Display the frame again, now including the assigned Topic column.
df

Unnamed: 0,Id,Title,Review,Label,Topic
0,1,Most overrated destination in kerala,I had a high expectations seeing this place in...,2,5
1,2,Venice of the East,Popularly known as'Venice of the East' Alleppe...,3,1
2,3,Alleppey is most polluted city,"Hai friends, today I am telling you my bad exp...",1,1
3,4,Alleppey,Alleppey is one the best place to visit in Ker...,4,3
4,5,ABOUT EXPENSIVE ALLEPPEY TRIP,Alleppey one of the famous district in kerala....,1,2
...,...,...,...,...,...
95,96,,Beach was good and clean. We went early in the...,5,1
96,97,,"Badly kept beach, authorities have no concern ...",1,3
97,98,,"Nice beach, best to visit early morning. Most ...",3,3
98,99,,Alappuzha Beach is known for its picturesque b...,4,4
