In [32]:
import pandas as pd
import numpy as np
import re
import string

import matplotlib.pyplot as plt
import seaborn as sns

# NLP Libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: the stop-word list, the Punkt
# tokenizer models, and WordNet for the lemmatizer.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# SKLearn Libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

# Evaluation Metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import pickle


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ranjith/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ranjith/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ranjith/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Load Raw Datasets

In [3]:
# Both CSV files are decoded as Latin-1.
CSV_ENCODING = 'latin1'

train_data = pd.read_csv('train.csv', encoding=CSV_ENCODING)
test_data = pd.read_csv('test.csv', encoding=CSV_ENCODING)

# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [4]:
# (rows, columns) of the training frame.
train_data.shape

(27481, 10)

In [5]:
# (rows, columns) of the test frame.
test_data.shape

(4815, 9)

In [6]:
# Number of training rows per sentiment label.
train_data['sentiment'].value_counts()

sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64

# NLP Preprocessing

In [7]:
# Inspect the ASCII punctuation characters that clean_text strips out.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# Scratch check of the escaped punctuation that clean_text places inside its
# regex character class. The surrounding braces make this a one-element set.
{re.escape(string.punctuation)}

{'!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~'}

In [11]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r'\d+', '', text)
    return text

In [12]:
# Add a 'cleaned_text' column to each frame by running clean_text over 'text'.
for frame in (train_data, test_data):
    frame['cleaned_text'] = frame['text'].apply(clean_text)

In [13]:
# Spot-check the cleaned training text.
train_data['cleaned_text']

0                        id have responded if i were going
1               sooo sad i will miss you here in san diego
2                                   my boss is bullying me
3                            what interview leave me alone
4         sons of  why couldnt they put them on the rel...
                               ...                        
27476     wish we could come see u on denver  husband l...
27477     ive wondered about rake to  the client has ma...
27478     yay good for both of you enjoy the break  you...
27479                                but it was worth it  
27480       all this flirting going on  the atg smiles ...
Name: cleaned_text, Length: 27481, dtype: object

In [14]:
# Build the English stop-word collection that preprocess_text filters against.
# It is stored as a set because preprocess_text tests membership per token.

stop_words = set(stopwords.words('english'))


In [19]:
# Scratch example: a small token list for trying out str.join.
word_list = ['he', 'go','market']
word_list

['he', 'go', 'market']

In [20]:
# Joining tokens with a single space is how preprocess_text rebuilds its output.
' '.join(word_list)

'he go market'

In [15]:
# One lemmatizer instance, created once and reused by preprocess_text.
lemmatizer = WordNetLemmatizer()

In [16]:
def preprocess_text(text):
    """Tokenize ``text``, discard stop words, and lemmatize what remains.

    Relies on the notebook-level ``stop_words`` collection and ``lemmatizer``
    instance. Returns the surviving lemmas joined by single spaces.
    """
    kept_lemmas = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_lemmas)

In [17]:
# Add a 'processed_text' column to each frame by running preprocess_text
# over the already-cleaned text.
for frame in (train_data, test_data):
    frame['processed_text'] = frame['cleaned_text'].apply(preprocess_text)

In [18]:
# Spot-check the text after stop-word removal and lemmatization.
train_data['processed_text']

0                                       id responded going
1                                  sooo sad miss san diego
2                                             bos bullying
3                                    interview leave alone
4                   son couldnt put release already bought
                               ...                        
27476    wish could come see u denver husband lost job ...
27477    ive wondered rake client made clear net dont f...
27478    yay good enjoy break probably need hectic week...
27479                                                worth
27480                     flirting going atg smile yay hug
Name: processed_text, Length: 27481, dtype: object

# Train Test Split

In [21]:
# Features are the processed text; the target is the sentiment label.
X, y = train_data['processed_text'], train_data['sentiment']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

# TF-IDF Vectorization

In [22]:
# TF-IDF vectorizer created with its default settings.
tfidf = TfidfVectorizer()

# Fit on the training split only, then reuse the fitted vectorizer on the
# validation split so both share the same feature columns.
X_train_vec = tfidf.fit_transform(X_train)
X_val_vec = tfidf.transform(X_val)

# Vectorize the separate test file with the same fitted vectorizer.
X_test_vec = tfidf.transform(test_data['processed_text'])

In [23]:
# Inspect the dimensions and sparsity of the training matrix.
X_train_vec

<21984x22409 sparse matrix of type '<class 'numpy.float64'>'
	with 153969 stored elements in Compressed Sparse Row format>

# Model Explorations

## Random Forest Classifier

In [26]:
# Fit a 100-tree random forest on the TF-IDF features (seeded for repeatability).
rf = RandomForestClassifier(n_estimators=100, random_state=32)
rf.fit(X_train_vec, y_train)

# Score the held-out validation split.
y_pred_rf = rf.predict(X_val_vec)
rf_accuracy = accuracy_score(y_val, y_pred_rf)
rf_report = classification_report(y_val, y_pred_rf)

print('********** Model: RandomForestClassifier*************')
print('Accuracy_Score: \n', rf_accuracy)
print('Classification Repot: \n', rf_report)

********** Model: RandomForestClassifier*************
Accuracy_Score: 
 0.6958340913225396
Classification Repot: 
               precision    recall  f1-score   support

    negative       0.75      0.59      0.66      1636
     neutral       0.64      0.73      0.68      2185
    positive       0.73      0.75      0.74      1676

    accuracy                           0.70      5497
   macro avg       0.71      0.69      0.70      5497
weighted avg       0.70      0.70      0.69      5497



## Naive Bayes Classifier

In [27]:
# Fit a multinomial naive Bayes model on the TF-IDF features.
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)

# Score the held-out validation split.
y_pred_nb = nb.predict(X_val_vec)
nb_accuracy = accuracy_score(y_val, y_pred_nb)
nb_report = classification_report(y_val, y_pred_nb)

print('********** Model: Naive Bayes Classifier*************')
print('Accuracy_Score: \n', nb_accuracy)
print('Classification Repot: \n', nb_report)

********** Model: Naive Bayes Classifier*************
Accuracy_Score: 
 0.6132435874113152
Classification Repot: 
               precision    recall  f1-score   support

    negative       0.80      0.40      0.54      1636
     neutral       0.52      0.82      0.64      2185
    positive       0.74      0.55      0.63      1676

    accuracy                           0.61      5497
   macro avg       0.69      0.59      0.60      5497
weighted avg       0.67      0.61      0.61      5497



## SVM Classifier

In [31]:
# Linear SVM on the TF-IDF features. random_state is pinned, as it is for the
# random forest and the train/validation split, so the fit is repeatable:
# LinearSVC's dual solver shuffles the data using this seed.
svm = LinearSVC(random_state=32)
svm.fit(X_train_vec, y_train)

# Score the held-out validation split.
y_pred_svm = svm.predict(X_val_vec)

print('********** Model: SVM Classifier*************')

print('Accuracy_Score: \n', accuracy_score(y_val, y_pred_svm))

print('Classification Repot: \n', classification_report(y_val, y_pred_svm))

********** Model: SVM Classifier*************
Accuracy_Score: 
 0.6690922321266145
Classification Repot: 
               precision    recall  f1-score   support

    negative       0.69      0.63      0.66      1636
     neutral       0.62      0.67      0.65      2185
    positive       0.72      0.71      0.71      1676

    accuracy                           0.67      5497
   macro avg       0.68      0.67      0.67      5497
weighted avg       0.67      0.67      0.67      5497



In [29]:
# True validation labels, for eyeballing against the predictions.
y_val

19234     neutral
10375    negative
8495     positive
3483     negative
6877      neutral
           ...   
3119      neutral
22194     neutral
4326     positive
15582     neutral
18965    negative
Name: sentiment, Length: 5497, dtype: object

In [30]:
# Labels predicted by the SVM for the validation split.
y_pred_svm

array(['neutral', 'negative', 'positive', ..., 'positive', 'negative',
       'negative'], dtype=object)

# Save the Best Model

In [33]:
# Persist the random forest, which had the highest validation accuracy of the
# three models tried in this notebook.
with open('sentiment_model_v1.pkl', 'wb') as f:
    pickle.dump(rf, f)

# The classifier only accepts vectors produced by the fitted TF-IDF vectorizer,
# so save that as well; without it the pickled model cannot be fed new text in
# a fresh session.
with open('tfidf_vectorizer_v1.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Load the Trained Model

In [34]:
# Reload the classifier from disk. pickle.load can execute arbitrary code
# while unpickling, so only open files from a source you trust.
with open('sentiment_model_v1.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Predict Using the Latest Loaded Model

In [36]:
sample_text = ['This movie was absolutely fantastic!']

# Run the same clean -> preprocess -> TF-IDF steps that were applied to the
# training text before asking the reloaded model for a label.
sample_cleaned = []
for raw_sample in sample_text:
    sample_cleaned.append(preprocess_text(clean_text(raw_sample)))
sample_vec = tfidf.transform(sample_cleaned)

predicted_label = loaded_model.predict(sample_vec)[0]
print('Predicted Sentiment:', predicted_label)


Predicted Sentiment: positive
