In [None]:
%pip install contractions
%pip install emoji
%pip install mglearn
%matplotlib inline
%pip install pyLDAvis



In [None]:
import pandas as pd
# !pip install --upgrade pandas==1.5.3
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import re
import string
import contractions
import emoji
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
import mglearn as mglearn
import matplotlib as plt
from __future__ import print_function
import pyLDAvis
import pyLDAvis.lda_model
pyLDAvis.enable_notebook()

### Loading Data

In [None]:
# Read the labelled posts, keep only the text/label columns and drop rows
# where either of them is missing.
df = (
    pd.read_csv("labelled_data.csv")
    .loc[:, ['text', 'label']]
    .dropna()
)
df.head()

Unnamed: 0,text,label
0,naw dude if anything that gives you a solid co...,0
1,OpenAI’s employees must be seriously regrettin...,-1
2,Opinions: What AI product lived up to the hype...,0
3,"You don’t need OpenAI to do this, in fact I’d ...",-1
4,So for some reason I can't respond to any of t...,-1


### Cleaning Text

In [None]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that cleans raw post text.

    Every document is lower-cased, stripped of links and Reddit-style
    ``u/name`` mentions, has its contractions expanded, optionally has its
    emoji replaced by their textual names, and finally loses all remaining
    non-ASCII characters, ASCII punctuation and digits.

    Parameters
    ----------
    do_emoji_conversion : bool, default=True
        When True, emoji are replaced by their ``emoji.demojize`` names
        *before* non-ASCII characters are stripped, so the names survive as
        ordinary words. When False, emoji are simply dropped by the
        non-ASCII step.
    use_spacy_tokenizer : bool, default=True
        Stored for interface compatibility; nothing in this class reads it.
    """

    # Patterns are compiled once at class-creation time and shared by all
    # instances; they are not constructor parameters.
    _LINK_RE = re.compile(r'http[s]?://\S+|www\.\S+')
    # The look-behind keeps the pattern from firing inside ordinary words
    # ("you/me" must not lose "u/me"); the optional slash also covers "/u/name".
    _MENTION_RE = re.compile(r'(?<!\w)/?u/\S+')
    _PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + ']+')
    _NUMBER_RE = re.compile(r'[0-9]+')

    def __init__(self, do_emoji_conversion=True, use_spacy_tokenizer=True):
        self.do_emoji_conversion = do_emoji_conversion
        self.use_spacy_tokenizer = use_spacy_tokenizer

    def remove_links(self, text):
        """Delete ``http(s)://...`` and ``www....`` URLs from ``text``."""
        return self._LINK_RE.sub('', text)

    def remove_user_mentions(self, text):
        """Delete ``u/name`` and ``/u/name`` mentions that start a token."""
        return self._MENTION_RE.sub('', text)

    def expand_contractions(self, text):
        """Expand contractions via ``contractions.fix``."""
        return contractions.fix(text)

    def remove_non_ascii(self, text):
        """Drop every character that cannot be encoded as ASCII."""
        return text.encode("ascii", "ignore").decode()

    def remove_punctuations(self, text):
        """Replace every run of ASCII punctuation (hyphens included) with one space.

        Replacing with a space rather than nothing keeps words that were only
        separated by punctuation ("well-known", "a,b") from being glued together.
        """
        return self._PUNCT_RE.sub(' ', text)

    def remove_numbers(self, text):
        """Delete every run of ASCII digits."""
        return self._NUMBER_RE.sub('', text)

    def emoji_to_text(self, text):
        """Replace emoji with their textual names via ``emoji.demojize``."""
        return emoji.demojize(text)

    def normalize(self, text):
        """Lower-case ``text``."""
        return text.lower()

    def preprocess(self, text):
        """Run the full cleaning pipeline on a single document and return the result."""
        text = self.normalize(text)
        text = self.remove_links(text)
        text = self.remove_user_mentions(text)
        text = self.expand_contractions(text)
        # Emoji are non-ASCII, so they must be converted to text before the
        # non-ASCII strip; otherwise the conversion would never see them.
        if self.do_emoji_conversion:
            text = self.emoji_to_text(text)
        text = self.remove_non_ascii(text)
        # Punctuation removal also turns the ":name_of_emoji:" delimiters and
        # underscores produced above into spaces.
        text = self.remove_punctuations(text)
        text = self.remove_numbers(text)
        text = self.normalize(text)
        return text

    def fit(self, X, y=None):
        """No-op: the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Clean every document in ``X``.

        ``X`` may be a pandas Series or any other 1-D collection of strings
        (list, tuple, NumPy array); non-Series input is wrapped in a Series
        first. Returns a pandas Series of cleaned strings.
        """
        if not isinstance(X, pd.Series):
            # dtype=object avoids dtype inference surprises on empty input.
            X = pd.Series(X, dtype=object)
        return X.apply(self.preprocess)

# Clean every post with the default preprocessing settings and keep the
# result next to the raw text so the two can be compared.
preprocessor = TextPreprocessor()
df['Cleaned_Text'] = preprocessor.fit_transform(df.loc[:, 'text'])

# Preview cleaned vs. raw text side by side.
df.loc[:, ['Cleaned_Text', 'text']].head()

Unnamed: 0,Cleaned_Text,text
0,naw dude if anything that gives you a solid co...,naw dude if anything that gives you a solid co...
1,openais employees must be seriously regretting...,OpenAI’s employees must be seriously regrettin...
2,opinions what ai product lived up to the hype...,Opinions: What AI product lived up to the hype...
3,you do not need openai to do this in fact i w...,"You don’t need OpenAI to do this, in fact I’d ..."
4,so for some reason i cannot respond to any of ...,So for some reason I can't respond to any of t...


In [None]:
# Class balance of the labelled data (header, blank line, then the counts).
print("Samples per class:\n", df['label'].value_counts(), sep="\n")

Samples per class:

label
 0    3923
-1    1170
 1     450
Name: count, dtype: int64


### Splitting Positive and Negative Data

In [None]:
# Label 1 marks positive posts, label -1 negative ones; neutral (0) posts are left out.
positive_df = df.loc[df['label'].eq(1)]
negative_df = df.loc[df['label'].eq(-1)]

### LDA for Positive Reviews

In [None]:
# Bag-of-words counts for the positive posts: English stop words removed,
# terms that occur in more than 20% of the documents ignored, vocabulary
# capped at 10,000 terms.
pos_vect = CountVectorizer(
    max_features=10000,
    max_df=0.2,
    stop_words="english",
)
pos_X = pos_vect.fit_transform(positive_df.loc[:, 'Cleaned_Text'])

In [None]:
# Fit a 10-topic LDA model on the positive posts (batch updates, fixed seed).
pos_lda = LatentDirichletAllocation(
    n_components=10,
    learning_method="batch",
    max_iter=25,
    random_state=0,
)
# One row per document, one column per topic.
pos_document_topics = pos_lda.fit_transform(pos_X)

In [None]:
# For every topic, order the vocabulary indices from highest to lowest weight.
pos_sorting = np.flip(np.argsort(pos_lda.components_, axis=1), axis=1)
pos_feature_names = np.asarray(pos_vect.get_feature_names_out())

# Print the top 20 words of every topic, all topics side by side.
mglearn.tools.print_topics(
    topics=range(pos_lda.n_components),
    feature_names=pos_feature_names,
    sorting=pos_sorting,
    topics_per_chunk=10,
    n_words=20,
)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      --------      --------      --------      --------      --------      
better        gpt           open          gpt           like          free          thanks        thank         make          gpt           
like          impressive    really        model         really        good          just          good          agi           did           
new           experience    model         great         use           think         open          gpt           like          good          
model         new           source        did           api           search        gpt           like          using         way           
just          bing          best          actually      using         models        model         open          use           use           
people       

The above keywords show that some relevant topics are performance improvements, capabilities, accessibility, general appreciation, API usage, community collaboration, learning, AGI advancements, and comparison with other models.

In [None]:
# Interactive pyLDAvis view of the positive-post LDA model.
pyLDAvis.lda_model.prepare(
    pos_lda,
    pos_X,
    pos_vect,
    lambda_step=0.5,
)

The scatter plot (left) shows a 2D representation of the topics. The distance between points indicates how similar or dissimilar the topics are. For example, topic 5 lies far from the other topics, which suggests it is clearly distinct from them, while topics 1 and 6 overlap with topic 2, which suggests they share some themes.

The bar chart (right) shows the top keywords that best represent that topic.

### LSA for Positive Reviews

In [None]:
# LSA: rank-10 truncated SVD of the positive-post term counts.
pos_lsa_model = TruncatedSVD(
    n_components=10,
    random_state=42,
)
pos_lsa_document_topics = pos_lsa_model.fit_transform(pos_X)

In [None]:
# For every LSA component, order the vocabulary indices from highest to lowest loading.
pos_lsa_sorting = np.flip(np.argsort(pos_lsa_model.components_, axis=1), axis=1)
pos_lsa_feature_names = np.asarray(pos_vect.get_feature_names_out())

# Print the top 20 words of every component, all components side by side.
mglearn.tools.print_topics(
    topics=range(pos_lsa_model.n_components),
    feature_names=pos_lsa_feature_names,
    sorting=pos_lsa_sorting,
    topics_per_chunk=10,
    n_words=20,
)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      --------      --------      --------      --------      --------      
model         like          gpt           model         api           really        models        better        just          google        
gpt           gpt           better        new           use           good          access        just          new           models        
like          just          time          really        free          code          free          models        using         week          
just          people        access        using         whisper       access        really        good          access        agi           
open          things        fine          just          just          better        fine          mini          games         use           
really       

The above keywords show that some relevant topics are usability, API, general sentiment, model performance, accessibility, cost-efficiency, and comparison with other models.

### NNMF for Positive Reviews

In [None]:
# NMF with 10 components on the positive-post term counts
# (deterministic NNDSVD initialisation, fixed seed).
pos_nmf_model = NMF(
    n_components=10,
    init='nndsvd',
    max_iter=80,
    random_state=42,
)
pos_nfm_document_topics = pos_nmf_model.fit_transform(pos_X)

In [None]:
# For every NMF component, order the vocabulary indices from highest to lowest weight.
pos_nfm_sorting = np.flip(np.argsort(pos_nmf_model.components_, axis=1), axis=1)
pos_nfm_feature_names = np.asarray(pos_vect.get_feature_names_out())

# Print the top 20 words of every component, all components side by side.
mglearn.tools.print_topics(
    topics=range(pos_nmf_model.n_components),
    feature_names=pos_nfm_feature_names,
    sorting=pos_nfm_sorting,
    topics_per_chunk=10,
    n_words=20,
)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      --------      --------      --------      --------      --------      
like          model         gpt           open          api           really        ask           better        games         week          
just          new           access        source        free          good          learn         output        unique        agi           
people        whisper       fine          google        use           code          plan          tokens        game          years         
models        going         time          meta          whisper       just          python        prompt        experience    making        
things        using         results       free          available     access        set           siri          new           want          
great        

The above keywords show that some relevant topics are usability, model capabilities, accessibility, API usage, reasoning, search functionality, learning resources, advancements, gaming experience, and comparisons with other models.

Overall, all three methods converge on key areas like model performance, accessibility, and comparisons with other models, but NNMF brings out a stronger focus on gaming and learning, while LDA and LSA emphasize the general appreciation and community aspects more strongly.

### LDA for Negative Reviews

In [None]:
# Bag-of-words counts for the negative posts, with the same vectorizer
# settings as for the positive posts.
neg_vect = CountVectorizer(
    max_features=10000,
    max_df=0.2,
    stop_words="english",
)
neg_X = neg_vect.fit_transform(negative_df.loc[:, 'Cleaned_Text'])

In [None]:
# Fit a 10-topic LDA model on the negative posts (batch updates, fixed seed).
neg_lda = LatentDirichletAllocation(
    n_components=10,
    learning_method="batch",
    max_iter=25,
    random_state=0,
)
# One row per document, one column per topic.
neg_document_topics = neg_lda.fit_transform(neg_X)

In [None]:
# For every topic, order the vocabulary indices from highest to lowest weight.
neg_sorting = np.flip(np.argsort(neg_lda.components_, axis=1), axis=1)
neg_feature_names = np.asarray(neg_vect.get_feature_names_out())

# Print the top 20 words of every topic, all topics side by side.
mglearn.tools.print_topics(
    topics=range(neg_lda.n_components),
    feature_names=neg_feature_names,
    sorting=neg_sorting,
    topics_per_chunk=10,
    n_words=20,
)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      --------      --------      --------      --------      --------      
like          chatgpt       chatgpt       chatgpt       like          profit        like          chatgpt       access        open          
just          just          model         gpt           open          google        gpt           api           chatgpt       source        
chatgpt       open          really        did           does          non           data          just          like          chatgpt       
open          use           just          google        gpt           model         chatgpt       gpt           models        just          
better        model         good          api           source        just          just          google        google        does          
gpt          

The above keywords show distinct topics related to user dissatisfaction, including issues with models, APIs, and data handling, as well as comparisons with other models.

In [None]:
# Interactive pyLDAvis view of the negative-post LDA model.
pyLDAvis.lda_model.prepare(
    neg_lda,
    neg_X,
    neg_vect,
    lambda_step=0.5,
)

### LSA for Negative Reviews

In [None]:
# LSA: rank-10 truncated SVD of the negative-post term counts.
neg_lsa_model = TruncatedSVD(
    n_components=10,
    random_state=42,
)
neg_lsa_document_topics = neg_lsa_model.fit_transform(neg_X)

In [None]:
# For every LSA component, order the vocabulary indices from highest to lowest loading.
neg_lsa_sorting = np.flip(np.argsort(neg_lsa_model.components_, axis=1), axis=1)
neg_lsa_feature_names = np.asarray(neg_vect.get_feature_names_out())

# Print the top 20 words of every component, all components side by side.
mglearn.tools.print_topics(
    topics=range(neg_lsa_model.n_components),
    feature_names=neg_lsa_feature_names,
    sorting=neg_lsa_sorting,
    topics_per_chunk=10,
    n_words=20,
)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      --------      --------      --------      --------      --------      
chatgpt       chatgpt       open          gpt           just          like          api           data          profit        code          
like          api           chatgpt       api           code          chatgpt       use           profit        non           model         
just          using         source        version       open          looks         data          gpt           free          profit        
open          plus          deepseek      web           chatgpt       llms          models        microsoft     really        api           
gpt           data          censorship    variants      source        feels         using         non           people        data          
model        

The above keywords reveal several key topics related to user dissatisfaction with ChatGPT, API functionality, and issues surrounding accessibility and business practices.

### NNMF for Negative Reviews

In [None]:
# NMF with 10 components on the negative-post term counts
# (deterministic NNDSVD initialisation, fixed seed).
neg_nmf_model = NMF(
    n_components=10,
    init='nndsvd',
    max_iter=80,
    random_state=42,
)
neg_nfm_document_topics = neg_nmf_model.fit_transform(neg_X)

In [None]:
# For every NMF component, order the vocabulary indices from highest to lowest weight.
neg_nfm_sorting = np.flip(np.argsort(neg_nmf_model.components_, axis=1), axis=1)
neg_nfm_feature_names = np.asarray(neg_vect.get_feature_names_out())

# Print the top 20 words of every component, all components side by side.
mglearn.tools.print_topics(
    topics=range(neg_nmf_model.n_components),
    feature_names=neg_nfm_feature_names,
    sorting=neg_nfm_sorting,
    topics_per_chunk=10,
    n_words=20,
)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      --------      --------      --------      --------      --------      
model         chatgpt       open          gpt           just          like          api           data          profit        code          
free          plus          source        worse         people        did           does          use           non           issue         
better        using         models        users         really        agi           access        training      models        work          
google        subscription  closed        variants      going         feels         use           italy         business      comments      
deepseek      tools         censorship    chat          post          llms          version       collection    tax           error         
models       

The above keywords reveal several key topics related to user dissatisfaction with ChatGPT, including performance issues, accessibility concerns, and business practices.

Common topics of dissatisfaction include model performance issues, API problems, censorship, and accessibility issues related to subscriptions and business practices. Users also bring up competition from other models, such as those from Google and Microsoft.