<a href="https://colab.research.google.com/github/YamRub/NLP-with-Sequence-model/blob/main/Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [5]:
df = pd.read_excel("/content/sample_data/netflix_reviews_NLP Sample Data.xlsx")

In [6]:
df1=df.copy(deep=True)

In [7]:
# Preprocess the data
def preprocess_text(text):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = text.lower()
    text = " ".join([stemmer.stem(word) for word in text.split() if word not in stop_words])
    return text

In [12]:
df1['content'] = df1['content'].astype(str)

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
# Apply preprocessing
df1['cleaned_content'] = df1['content'].apply(preprocess_text)

In [14]:
# Display the first few rows after cleaning
print(df1.head())

                               reviewId         userName  \
0  c01a0242-e5e7-4e8a-a948-3f0b3ba89f20    Chris Paredes   
1  3c1c9fab-65e9-4f89-a1b6-9c45f27addb1  ijustwannabefab   
2  27fdf06b-24e8-4a72-9092-c88518e2ceb2       Darah Lazo   
3  2e8efd65-9b51-4ce4-ace0-411215764a93    jeremy porter   
4  5918b74d-38d0-4e9a-ac5e-da2738351352   Jacob Overcash   

                                             content  score  thumbsUpCount  \
0  Where it's me on the plane. It's good. You can...      5              0   
1  I have an account for 5 users that I share wit...      1              1   
2            Can never go a day without Netflix ðŸ˜Ž      5              0   
3  And another streaming service that I'm not gon...      1              1   
4   Cannot cast if you have ad level plan. Worthless      1              0   

     reviewCreatedVersion                  at              appVersion  \
0  8.113.3 build 31 50678 2024-05-21 22:41:10  8.113.3 build 31 50678   
1  8.114.0 build 19 5068

In [15]:
df1.columns

Index(['reviewId', 'userName', 'content', 'score', 'thumbsUpCount',
       'reviewCreatedVersion', 'at', 'appVersion', 'cleaned_content'],
      dtype='object')

implement LDA for topic modeling

In [16]:
df1 = df1.drop(df1.columns[[0, 1,2,3, 4, 5, 6, 7]], axis=1)

In [18]:
df1.head()

Unnamed: 0,cleaned_content
0,plane good watch movit realli great love im go...
1,account 5 user share dad alway use playstat da...
2,never go day without netflix ðÿ˜ž
3,anoth stream servic im gonna pay anymor want p...
4,cannot cast ad level plan worthless


In [19]:
# Function to display topics
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))

In [21]:
# Function to identify major topics
def identify_topics(df, num_topics=5, num_top_words=10):
    # Vectorize the cleaned content using CountVectorizer
    vectorizer = CountVectorizer(max_features=5000, stop_words='english')
    X = vectorizer.fit_transform(df['cleaned_content'])

    # Apply LDA
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(X)

    # Display topics
    print("\nIdentified Topics:")
    display_topics(lda, vectorizer.get_feature_names_out(), num_top_words)

    # Assign topics to reviews
    df['topic'] = np.argmax(lda.transform(X), axis=1)

    return df, lda, vectorizer

In [22]:
# Identify topics
df_with_topics, lda_model, vectorizer = identify_topics(df1, num_topics=6, num_top_words=10)

# Display the first few rows with topics
print(df_with_topics.head())


Identified Topics:
Topic 0:
app download good watch netflix use great realli dont love
Topic 1:
app work netflix updat phone play screen fix video tri
Topic 2:
ðÿ netflix app account payment tri devic use sign say
Topic 3:
movi netflix watch love app like good seri great best
Topic 4:
netflix pay use servic cancel content stream price subscript im
Topic 5:
watch episod season pleas netflix like app fix im list
                                     cleaned_content  topic
0  plane good watch movit realli great love im go...      0
1  account 5 user share dad alway use playstat da...      4
2                  never go day without netflix ðÿ˜ž      2
3  anoth stream servic im gonna pay anymor want p...      4
4                cannot cast ad level plan worthless      4


Validation and User Interaction

In [23]:
# Function to validate topics
def validate_topics(df):
    print("\nValidation of Topics:")
    topic_counts = df['topic'].value_counts()
    print(topic_counts)

In [24]:
# Function to allow user to accept or reject topics
def accept_reject_topics(df, lda_model, vectorizer, accepted_topics):
    # Filter out rejected topics
    df_filtered = df[df['topic'].isin(accepted_topics)]

    # Rerun LDA on the filtered data
    X_filtered = vectorizer.transform(df_filtered['cleaned_content'])
    lda_model_filtered = LatentDirichletAllocation(n_components=len(accepted_topics), random_state=42)
    lda_model_filtered.fit(X_filtered)

    # Display topics
    print("\nFiltered Topics:")
    display_topics(lda_model_filtered, vectorizer.get_feature_names_out(), 10)

    # Assign topics to reviews
    df_filtered['topic'] = np.argmax(lda_model_filtered.transform(X_filtered), axis=1)

    return df_filtered, lda_model_filtered

In [25]:
# Validate initial topics
validate_topics(df_with_topics)


Validation of Topics:
topic
3    28878
1    25162
4    17823
0    14838
2    12695
5    10333
Name: count, dtype: int64


In [26]:
# Assume user accepts topics 0 and 1 for simplicity
accepted_topics = [0, 1]

In [27]:
# Re-run LDA with accepted topics
df_filtered_topics, lda_filtered_model = accept_reject_topics(df_with_topics, lda_model, vectorizer, accepted_topics)


Filtered Topics:
Topic 0:
app watch download good netflix movi use great love like
Topic 1:
app work netflix updat phone fix play screen open video


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['topic'] = np.argmax(lda_model_filtered.transform(X_filtered), axis=1)


In [28]:
# Display the first few rows with filtered topics
print(df_filtered_topics.head())

                                      cleaned_content  topic
0   plane good watch movit realli great love im go...      0
6                                               youä      0
7   bad cooper servic abl find solut video gone black      1
8   latest updat complet ruin app especi android m...      1
11                          video freez audio continu      1


Hierarchical Presentation

In [29]:
# Function to present topics hierarchically
def hierarchical_topics(lda_model, vectorizer, df, num_top_words=10):
    topic_dict = {}
    for topic_idx, topic in enumerate(lda_model.components_):
        topic_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-num_top_words - 1:-1]]
        topic_dict[f"Topic {topic_idx}"] = topic_words

    # Display hierarchy
    print("\nHierarchical Presentation of Topics:")
    for topic, words in topic_dict.items():
        print(f"{topic}:")
        for word in words:
            print(f"  - {word}")

    return topic_dict

In [30]:
# Present topics hierarchically
hierarchical_topic_dict = hierarchical_topics(lda_filtered_model, vectorizer, df_filtered_topics)


Hierarchical Presentation of Topics:
Topic 0:
  - app
  - watch
  - download
  - good
  - netflix
  - movi
  - use
  - great
  - love
  - like
Topic 1:
  - app
  - work
  - netflix
  - updat
  - phone
  - fix
  - play
  - screen
  - open
  - video


In [31]:
# Print hierarchical topics
print(hierarchical_topic_dict)

{'Topic 0': ['app', 'watch', 'download', 'good', 'netflix', 'movi', 'use', 'great', 'love', 'like'], 'Topic 1': ['app', 'work', 'netflix', 'updat', 'phone', 'fix', 'play', 'screen', 'open', 'video']}
