In [68]:
import pandas as pd 
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import joblib

Downloading the dataset from Kaggle

In [None]:


import kagglehub

# Fetch the latest version of the dataset; the call returns the local
# directory the files were extracted into.
DATASET_HANDLE = "mdismielhossenabir/sentiment-analysis"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/mdismielhossenabir/sentiment-analysis?dataset_version_number=1...


100%|██████████| 14.3k/14.3k [00:00<00:00, 4.60MB/s]

Extracting files...
Path to dataset files: /Users/anoshandrews/.cache/kagglehub/datasets/mdismielhossenabir/sentiment-analysis/versions/1





In [5]:
import os

# List the downloaded files through the directory kagglehub returned instead of
# a machine-specific absolute path, so the cell works on any machine.
os.listdir(path)

['sentiment_analysis.csv']

In [6]:
# Build the CSV location from the directory kagglehub returned rather than a
# machine-specific absolute path. `path` is reused for the CSV file because the
# next cell reads it, so drop the file name first if this cell is run again.
dataset_dir = os.path.dirname(path) if path.endswith('.csv') else path
path = os.path.join(dataset_dir, 'sentiment_analysis.csv')

In [7]:
df = pd.read_csv(path)

In [8]:
df.head()

Unnamed: 0,Year,Month,Day,Time of Tweet,text,sentiment,Platform
0,2018,8,18,morning,What a great day!!! Looks like dream.,positive,Twitter
1,2018,8,18,noon,"I feel sorry, I miss you here in the sea beach",positive,Facebook
2,2017,8,18,night,Don't angry me,negative,Facebook
3,2022,6,8,morning,We attend in the class just for listening teac...,negative,Facebook
4,2022,6,8,noon,"Those who want to go, let them go",negative,Instagram


In [9]:
df.shape

(499, 7)

In [67]:
df['sentiment'].value_counts()

sentiment
neutral     199
positive    166
negative    134
Name: count, dtype: int64

In [10]:
new_df = pd.DataFrame()

In [11]:
# Copy only the columns needed for modelling, renaming `text` to `tweet`.
for target_col, source_col in (('tweet', 'text'), ('sentiment', 'sentiment')):
    new_df[target_col] = df[source_col]

In [12]:
new_df.head()

Unnamed: 0,tweet,sentiment
0,What a great day!!! Looks like dream.,positive
1,"I feel sorry, I miss you here in the sea beach",positive
2,Don't angry me,negative
3,We attend in the class just for listening teac...,negative
4,"Those who want to go, let them go",negative


In [20]:
def text_cleaning(text):
    """Normalise a raw tweet: lower-case it and strip HTML tags and punctuation.

    Args:
        text: The raw text. ``None`` and NaN (how pandas marks a missing cell)
            are treated as empty text; any other non-string value is converted
            with ``str`` first.

    Returns:
        str: The lower-cased text with ``<...>`` tags and every character that
        is neither a word character nor whitespace removed. Removed characters
        are not replaced by a space, so ``"don't"`` becomes ``"dont"``.
    """
    # NaN is the only float that is not equal to itself; catching it here keeps
    # a missing cell from raising AttributeError on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)    # remove HTML tags (non-greedy match)
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    return text

# s1 = '<head> Anosh,; is: good <\head>'
# print(text_cleaning(s1))
    

In [21]:
# Run every tweet through text_cleaning and write the result back to the column.
cleaned_tweets = new_df['tweet'].apply(text_cleaning)
new_df['tweet'] = cleaned_tweets

In [22]:
new_df['tweet']

0                      what a great day looks like dream
1          i feel sorry i miss you here in the sea beach
2                                          dont angry me
3      we attend in the class just for listening teac...
4                       those who want to go let them go
                             ...                        
494    according to  a quarter of families under six ...
495        the plan to not spend money is not going well
496      uploading all my bamboozle pictures of facebook
497     congratulations  you guys finish a month earl...
498     actually i wish i was back in tahoe  i miss i...
Name: tweet, Length: 499, dtype: object

In [27]:
# The text is clean at this point; split each tweet into a list of word tokens
# so the following steps can work token by token.
tokenized_tweets = new_df['tweet'].apply(word_tokenize)
new_df['tweet'] = tokenized_tweets

In [28]:
# Remove stop words: very common words that do not convey any special meaning.
# The word list comes from the `stopwords` corpus of NLTK (Natural Language
# Toolkit), which is imported at the top of the notebook.

# The list is stored under its own name so the imported `stopwords` corpus
# reader is not overwritten (overwriting it makes this cell fail when re-run),
# and as a set so each membership test below is a constant-time lookup.
try:
    english_stopwords = set(stopwords.words('english'))
except LookupError:
    # The corpus has not been downloaded on this machine yet.
    nltk.download('stopwords')
    english_stopwords = set(stopwords.words('english'))

# NOTE: punctuation was stripped earlier, so contractions such as "dont" no
# longer match the apostrophised entries of the stop-word list and are kept.
new_df['tweet'] = new_df['tweet'].apply(
    lambda tokens: [word for word in tokens if word not in english_stopwords]
)

In [29]:
new_df.head()

Unnamed: 0,tweet,sentiment
0,"[great, day, looks, like, dream]",positive
1,"[feel, sorry, miss, sea, beach]",positive
2,"[dont, angry]",negative
3,"[attend, class, listening, teachers, reading, ...",negative
4,"[want, go, let, go]",negative


In [30]:
# Turn each token list back into one space-separated string before applying
# the TF-IDF vectorizer.
new_df['tweet'] = new_df['tweet'].apply(' '.join)

In [31]:
new_df.head()

Unnamed: 0,tweet,sentiment
0,great day looks like dream,positive
1,feel sorry miss sea beach,positive
2,dont angry,negative
3,attend class listening teachers reading slide ...,negative
4,want go let go,negative


In [35]:
# Features are the cleaned tweet strings; labels are the sentiment classes.
X, y = new_df['tweet'], new_df['sentiment']

# TF-IDF vectorizer (default settings) that will turn the tweets into numeric
# features.
ti = TfidfVectorizer()

In [38]:
# Split before vectorising so the TF-IDF vocabulary and weights are learned from
# the training tweets only and nothing leaks in from the test set.
# `random_state` fixes the shuffle so the split (and every score computed from
# it) is reproducible; `stratify=y` keeps the sentiment class proportions the
# same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [51]:
# Fit the vectorizer on the training tweets only; the test tweets are merely
# transformed with the already-fitted vocabulary so that both matrices share
# the same columns and the test data never influences the fit.
X_train_vectorized, X_test_vectorized = (
    ti.fit_transform(X_train),
    ti.transform(X_test),
)

In [54]:
# Sanity check: both matrices must have the same number of feature columns.
for feature_matrix in (X_train_vectorized, X_test_vectorized):
    print(feature_matrix.shape)

(399, 1145)
(100, 1145)


In [55]:
# Multinomial Naive Bayes classifier, trained on the TF-IDF training features.
mnb = MultinomialNB()
mnb.fit(X=X_train_vectorized, y=y_train)

In [58]:
# Predict the held-out tweets and score them.
y_pred = mnb.predict(X_test_vectorized)
# accuracy_score takes (y_true, y_pred). Accuracy happens to be symmetric, but
# the documented order keeps this consistent with metrics where it matters.
mnb_accuracy = accuracy_score(y_test, y_pred)
mnb_accuracy

0.66

In [62]:
mnb.predict( X_test_vectorized[15])

array(['positive'], dtype='<U8')

In [63]:
y_test.iloc[15]

'positive'

In [64]:
from sklearn.naive_bayes import BernoulliNB
nb = BernoulliNB()

In [65]:
nb.fit(X_train_vectorized, y_train)

In [66]:
# Predict the held-out tweets with the BernoulliNB model and score them.
y_pred_new = nb.predict(X_test_vectorized)
# accuracy_score takes (y_true, y_pred). Accuracy happens to be symmetric, but
# the documented order keeps this consistent with metrics where it matters.
accuracy_new = accuracy_score(y_test, y_pred_new)
accuracy_new

0.58

In [69]:
# Persist the fitted TF-IDF vectorizer and the MultinomialNB model with joblib.
# NOTE: the cleaning, tokenising and stop-word steps applied to the tweets
# earlier in this notebook are not stored in either file, so new text has to go
# through the same steps before it is passed to `ti.transform`.
joblib.dump(ti, 'tfidf_vectorizer.pkl')
joblib.dump(mnb, 'mnb_model.pkl')

['mnb_model.pkl']