In [1]:
import pandas as pd

#ploting results
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

#preprocessing and cleaning 
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

#vectorizors
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#classification imports
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

#evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

**PREPROCESSING_CLEANING**

In [2]:
df=pd.read_csv("D:/Semester 6/NLP/Assignment #2/Q2 Sentiment Analysis Dataset.csv",encoding = "ISO-8859-1")
df=df.drop(['Unnamed: 4','Unnamed: 5','id'],axis=1) #removing redundant features

In [3]:
#encoding
df.loc[df['sentiment'] == 'not_relevant', 'sentiment'] = 6
df['sentiment'].value_counts()

sentiment
3    2162
1    1219
5     423
6      82
Name: count, dtype: int64

In [4]:
print(df.isnull().sum())
print(df.head(5))

sentiment    0
date         0
text         0
dtype: int64
  sentiment                            date  \
0         1  Mon Dec 01 20:46:01 +0000 2014   
1         1  Mon Dec 01 21:09:50 +0000 2014   
2         1  Mon Dec 01 21:35:14 +0000 2014   
3         1  Mon Dec 01 23:55:55 +0000 2014   
4         1  Tue Dec 02 00:06:05 +0000 2014   

                                                text  
0  WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...  
1  @apple Contact sync between Yosemite and iOS8 ...  
3  @Apple, For the love of GAWD, CENTER the '1'on...  
4  i get the storage almost full notification lit...  


In [5]:
#The distribution of classes is not equal 
fig = px.pie(df, values='sentiment', names='sentiment')
fig.show()

In [6]:
df['text'] = df['text'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text)) #remove urls
df['text'] = df['text'].apply(lambda x: x.lower())  # Convert to lowercase
df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x)) # removing everything except alpha  values
df['text'] = df['text'].apply(lambda text: re.sub(r"\s+", " ", text).strip()) #removing extra white spaces

In [7]:
df['text'].head(5)

0    wtf my battery was one second ago and now is w...
1    apple contact sync between yosemite and ios is...
3    apple for the love of gawd center the on the d...
4    i get the storage almost full notification lit...
Name: text, dtype: object

In [8]:
nltk.download('stopwords')
stopwordsssssss=stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stopwordsssssss))  # Remove stop words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arhar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**STEMMING+LEMMATIZATION**

In [9]:
#stemming 
porter_stemmer = PorterStemmer()
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

df['text'] = df['text'].apply(stem_sentences)

In [10]:
#lemmatizaion i know this is probably redundant but just wanted to try something
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(stemmed_tokens)

df['text'] = df['text'].apply(lemmatize_sentences)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arhar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**FEATURE_EXTRACTION**

In [11]:
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(df['text'])

print(vectorizer.get_feature_names_out())
print(bow.toarray())


['aa' 'aaaron' 'aapl' ... 'zu' 'zuckerberg' 'zum']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]]


In [12]:
#TfidfVectorizer is better than TfidfTransformer xD
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(df['text'])

print(vectorizer.get_feature_names_out())
print(tfidf.toarray())

['aa' 'aaaron' 'aapl' ... 'zu' 'zuckerberg' 'zum']
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.21024268 ... 0.         0.         0.        ]
 [0.         0.         0.13202848 ... 0.         0.         0.        ]
 [0.         0.         0.1234683  ... 0.         0.         0.        ]]


In [13]:
vectorizer = CountVectorizer(ngram_range=(1, 3))
ngram = vectorizer.fit_transform(df['text'])

print(vectorizer.get_feature_names_out())
print(ngram.toarray())

['aa' 'aa credit' 'aa credit rate' ... 'zum' 'zum appl' 'zum appl appl']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


**MODEL_TRAININGS**

In [39]:
def classifiers(X_train, X_test, y_train, y_test):

    classifiers = {
        'Naïve Bayes': MultinomialNB(),
        'Logistic Regression': LogisticRegression(max_iter=10000),
        'Random Forest': RandomForestClassifier(),
        'SVM': SVC(max_iter=10000),
        'Perceptron': Perceptron(max_iter=10000)
    }



    evaluation_macro = {'Classifier': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': []}
    evaluation_micro = {'Classifier': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': []}


    for clf_name, clf in classifiers.items():

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        #macro averaging 
        evaluation_macro['Classifier'].append(clf_name)
        evaluation_macro['Accuracy'].append(accuracy)
        # some values in the confusion matrix might be 0 thast why using zero_dision=1
        evaluation_macro['Precision'].append(precision_score(y_test, y_pred, average='macro', zero_division=1))
        evaluation_macro['Recall'].append(recall_score(y_test, y_pred, average='macro'))
        evaluation_macro['F1 Score'].append(f1_score(y_test, y_pred, average='macro'))

        #micro averaging
        evaluation_micro['Classifier'].append(clf_name)
        evaluation_micro['Accuracy'].append(accuracy)
        # some values in the confusion matrix might be 0 thast why using zero_dision=1
        evaluation_micro['Precision'].append(precision_score(y_test, y_pred, average='micro', zero_division=1))
        evaluation_micro['Recall'].append(recall_score(y_test, y_pred, average='micro'))
        evaluation_micro['F1 Score'].append(f1_score(y_test, y_pred, average='micro'))

    df_macro = pd.DataFrame(evaluation_macro)
    df_micro = pd.DataFrame(evaluation_micro)


    return df_micro, df_macro


In [41]:
X = ngram  
y = df['sentiment'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42) #using stratify so that we have equal label distribution

ngram_micro,ngram_macro=classifiers(X_train, X_test, y_train, y_test)


In [63]:
print(ngram_micro,'\n\n',ngram_macro)

            Classifier  Accuracy  Precision    Recall  F1 Score
0          Naïve Bayes  0.673522   0.673522  0.673522  0.673522
1  Logistic Regression  0.723650   0.723650  0.723650  0.723650
2        Random Forest  0.723650   0.723650  0.723650  0.723650
3                  SVM  0.721080   0.721080  0.721080  0.721080
4           Perceptron  0.719794   0.719794  0.719794  0.719794 

             Classifier  Accuracy  Precision    Recall  F1 Score
0          Naïve Bayes  0.673522   0.474731  0.483058  0.472702
1  Logistic Regression  0.723650   0.523775  0.441206  0.457861
2        Random Forest  0.723650   0.531927  0.442164  0.462501
3                  SVM  0.721080   0.828160  0.415902  0.430303
4           Perceptron  0.719794   0.552902  0.471835  0.494455


In [40]:
X = bow
y = df['sentiment'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42) #using stratify so that we have equal label distribution

bow_micro,bow_macro=classifiers(X_train, X_test, y_train, y_test)


In [64]:
print(bow_micro,'\n\n',bow_macro)

            Classifier  Accuracy  Precision    Recall  F1 Score
0          Naïve Bayes  0.703085   0.703085  0.703085  0.703085
1  Logistic Regression  0.739075   0.739075  0.739075  0.739075
2        Random Forest  0.721080   0.721080  0.721080  0.721080
3                  SVM  0.732648   0.732648  0.732648  0.732648
4           Perceptron  0.682519   0.682519  0.682519  0.682519 

             Classifier  Accuracy  Precision    Recall  F1 Score
0          Naïve Bayes  0.703085   0.529295  0.469760  0.478971
1  Logistic Regression  0.739075   0.593017  0.482285  0.509542
2        Random Forest  0.721080   0.508447  0.447398  0.463330
3                  SVM  0.732648   0.829945  0.431768  0.449867
4           Perceptron  0.682519   0.460579  0.460679  0.458603


In [42]:
X = tfidf
y = df['sentiment'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42) #using stratify so that we have equal label distribution

tfidf_micro,tfidf_macro=classifiers(X_train, X_test, y_train, y_test)

In [65]:
print(tfidf_micro,'\n\n',tfidf_macro)

            Classifier  Accuracy  Precision    Recall  F1 Score
0          Naïve Bayes  0.708226   0.708226  0.708226  0.708226
1  Logistic Regression  0.733933   0.733933  0.733933  0.733933
2        Random Forest  0.732648   0.732648  0.732648  0.732648
3                  SVM  0.745501   0.745501  0.745501  0.745501
4           Perceptron  0.665810   0.665810  0.665810  0.665810 

             Classifier  Accuracy  Precision    Recall  F1 Score
0          Naïve Bayes  0.708226   0.795839  0.394988  0.384891
1  Logistic Regression  0.733933   0.825211  0.431770  0.447657
2        Random Forest  0.732648   0.537130  0.450103  0.469745
3                  SVM  0.745501   0.861313  0.443611  0.465413
4           Perceptron  0.665810   0.472940  0.449149  0.453710


**COMPARING_MODELS**

In [51]:
def graph_comparison(df_micro,df_macro):

    fig = make_subplots(rows=1, cols=4, subplot_titles=("Accuracy", "Precision", "Recall", "F1 Score"))

    metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

    for i, metric in enumerate(metrics, start=1):
        fig.add_trace(go.Bar(x=df_micro['Classifier'], y=df_micro[metric], name='Micro', marker_color='darkgoldenrod'), row=1, col=i)
        fig.add_trace(go.Bar(x=df_macro['Classifier'], y=df_macro[metric], name='Macro', marker_color='steelblue'), row=1, col=i)

    fig.update_layout(height=400, width=1200, title_text="Performance Metrics Comparison")
    fig.update_xaxes(title_text="Classifiers", row=1, col=1)
    fig.update_yaxes(title_text="Score", row=1, col=1)

    fig.show()


In [55]:
graph_comparison(ngram_micro,ngram_macro)

In [53]:
graph_comparison(bow_micro,bow_macro)

In [54]:
graph_comparison(tfidf_micro,tfidf_macro)