### Sentiment Analysis

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report
import spacy

In [3]:
columns = ['id','games','Label','Text']

df_train=pd.read_csv('twitter_training.csv', names = columns)
df_test=pd.read_csv('twitter_validation.csv', names = columns)

In [4]:
df_train.head()

Unnamed: 0,id,games,Label,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [5]:
df_test.head()

Unnamed: 0,id,games,Label,Text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [6]:
le = LabelEncoder()
df_train['Label'] = le.fit_transform(df_train['Label'])

In [7]:
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    if pd.isnull(text):
        return ""
    # Tokenize the text and remove stop words and punctuation
    doc = nlp(text)
    filtered_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(filtered_tokens)

In [8]:
# drop null values
df_train.dropna(inplace=True)

### Preprocessing

In [9]:
df_train['Preprocessed Text']=df_train['Text'].apply(preprocess) 

In [10]:
df_train.head()

Unnamed: 0,id,games,Label,Text,Preprocessed Text
0,2401,Borderlands,3,im getting on borderlands and i will murder yo...,m get borderland murder
1,2401,Borderlands,3,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,3,im getting on borderlands and i will kill you ...,m get borderland kill
3,2401,Borderlands,3,im coming on borderlands and i will murder you...,m come borderland murder
4,2401,Borderlands,3,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder


### Test and Train sets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df_train['Preprocessed Text'], df_train['Label'], test_size=0.2, random_state=42,
stratify=df_train['Label'])

In [12]:
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (59196,)
Shape of X_test:  (14800,)


### Naive Bayes Model

In [13]:
clf = Pipeline([
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('naive_bayes', MultinomialNB())        
])


In [14]:
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer_tri_grams', TfidfVectorizer()),
                ('naive_bayes', MultinomialNB())])

In [15]:
y_pred = clf.predict(X_test)

In [16]:
print(accuracy_score(y_test, y_pred))

0.7312837837837838


In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.46      0.62      2575
           1       0.65      0.90      0.76      4472
           2       0.84      0.63      0.72      3622
           3       0.71      0.81      0.76      4131

    accuracy                           0.73     14800
   macro avg       0.79      0.70      0.71     14800
weighted avg       0.77      0.73      0.72     14800



In [18]:
# Using GridSearchCV to get best hyperparameters

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

# Define the parameter grid
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],  # unigrams, bigrams, trigrams
    'tfidf__use_idf': [True, False],                # use or don't use IDF
    'nb__alpha': [0.1, 0.5, 1.0, 5.0, 10.0]         # smoothing parameter for MultinomialNB
}

# Initialize GridSearchCV with the pipeline and the parameter grid
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print(f"Best Parameters: {best_params}")

# Predict on the test data using the best pipeline model
y_pred = best_model.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Best Parameters: {'nb__alpha': 0.1, 'tfidf__ngram_range': (1, 3), 'tfidf__use_idf': True}
Accuracy: 0.9311486486486487


### Test Model

In [24]:
test_text = df_test['Text'][6]
print(f"{test_text} ===> {df_test['Label'][6]}")

Thank you @EAMaddenNFL!! 

New TE Austin Hooper in the ORANGE & BROWN!! 

#Browns | @AustinHooper18 

 pic.twitter.com/GRg4xzFKOn ===> Positive


In [25]:
# Apply preprocess

test_text_processed = [preprocess(test_text)]
test_text_processed

['thank @EAMaddenNFL \n\n New TE Austin Hooper ORANGE BROWN \n\n brown | @austinhooper18 \n\n  pic.twitter.com/grg4xzfkon']

In [26]:
test_text = clf.predict(test_text_processed)

In [28]:
classes = ['Irrelevant', 'Natural', 'Negative', 'Positive']

print(f"True Label: {df_test['Label'][6]}")
print(f'Predict Label: {classes[test_text[0]]}')

True Label: Positive
Predict Label: Positive


### Another test

In [33]:
test_text = df_test['Text'][10]
print(f"{test_text} ===> {df_test['Label'][10]}")

# Apply preprocess
test_text_processed = [preprocess(test_text)]
test_text_processed

test_text = clf.predict(test_text_processed)

print(f"True Label: {df_test['Label'][10]}")
print(f'Predict Label: {classes[test_text[0]]}')

The professional dota 2 scene is fucking exploding and I completely welcome it.

Get the garbage out. ===> Positive
True Label: Positive
Predict Label: Positive


Appears that model seems to be able to predict the sentiments with high accuracy. To confirm, create a list and run all and compare the model's predictions against the actual.