Design and implement a machine learning model to perform sentiment analysis on a given
dataset containing textual data and corresponding sentiment labels. The analysis should use
TF-IDF (Term Frequency-Inverse Document Frequency) vectors to transform the text into
numerical features and classify the sentiments effectively

In [1]:
import nltk
import pandas as pd
import numpy as np
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
#download the necessary models for each task
nltk.download('punkt')  # Download the tokenizer models
nltk.download('punkt_tab')
nltk.download('wordnet')  # Download WordNet, required for semantic analysis for lemmatization
nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')  # Download POS tagger
nltk.download('omw-1.4')  # Download the WordNet OMW corpus

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Asad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Asad\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# read the dataset
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [5]:
train.shape, test.shape

((27481, 10), (3534, 9))

In [6]:
train.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [7]:
test.head()

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797,27400.0,105
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044,2381740.0,18
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265,470.0,164
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272,1246700.0,26


In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3534 entries, 0 to 3533
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            3534 non-null   object 
 1   text              3534 non-null   object 
 2   sentiment         3534 non-null   object 
 3   Time of Tweet     3534 non-null   object 
 4   Age of User       3534 non-null   object 
 5   Country           3534 non-null   object 
 6   Population -2020  3534 non-null   int64  
 7   Land Area (Km²)   3534 non-null   float64
 8   Density (P/Km²)   3534 non-null   int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 248.6+ KB


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            27481 non-null  object 
 1   text              27480 non-null  object 
 2   selected_text     27480 non-null  object 
 3   sentiment         27481 non-null  object 
 4   Time of Tweet     27481 non-null  object 
 5   Age of User       27481 non-null  object 
 6   Country           27481 non-null  object 
 7   Population -2020  27481 non-null  int64  
 8   Land Area (Km²)   27481 non-null  float64
 9   Density (P/Km²)   27481 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 2.1+ MB


In [10]:
train.drop(columns=['selected_text'], inplace=True)

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            27481 non-null  object 
 1   text              27480 non-null  object 
 2   sentiment         27481 non-null  object 
 3   Time of Tweet     27481 non-null  object 
 4   Age of User       27481 non-null  object 
 5   Country           27481 non-null  object 
 6   Population -2020  27481 non-null  int64  
 7   Land Area (Km²)   27481 non-null  float64
 8   Density (P/Km²)   27481 non-null  int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 1.9+ MB


In [12]:
train.shape, test.shape

((27481, 9), (3534, 9))

In [13]:
test['sentiment'].value_counts()

sentiment
neutral     1430
positive    1103
negative    1001
Name: count, dtype: int64

In [14]:
nlp = spacy.load("en_core_web_sm", disable='ner')

### Preprocessing

In [15]:
# Remove non-string values
train['text'] = train['text'].apply(lambda x: x if isinstance(x, str) else None)

# Drop rows with None (NaN)
train.dropna(subset=['text'], inplace=True)

In [16]:
# Remove non-string values
test['text'] = test['text'].apply(lambda x: x if isinstance(x, str) else None)

# Drop rows with None (NaN)
test.dropna(subset=['text'], inplace=True)

In [17]:
print(test['sentiment'].isnull().sum())

0


In [18]:
def preprocess_text(texts):
    # lemmatize the tokens and store them in a list
    processed_texts = []
    for doc in nlp.pipe(texts, n_process=-1):
        lemmatized_tokens = [token.lemma_.lower() for token in doc if token.is_alpha and token.lemma_ not in nlp.Defaults.stop_words]
        
        # Join the lemmatized tokens into a string
        processed_text = " ".join(lemmatized_tokens)
        
        processed_texts.append(processed_text)
        
    return processed_texts

In [19]:
#train['text'] = train['text'].astype(str)
#test['text'] =test['text'].astype(str)

In [20]:
# apply preprcoess_text function to user_review column
train['text'] = preprocess_text(train['text'])
test['text'] = preprocess_text(test['text'])

In [21]:
#view the first 5 rows
train['text'].head()

0                    respond i
1    sooo sad i miss san diego
2                 boss bully i
3            interview leave i
4              son release buy
Name: text, dtype: object

In [22]:
test['text'].head()

0                                          session day
1    shanghai exciting precisely skyscraper galore ...
2    recession hit veronique branquinho quit compan...
3                                           happy bday
4                                               i like
Name: text, dtype: object

### Vectorization & One Hot Encoding

In [23]:
count_vectorizer_ohe = CountVectorizer(min_df=0.001, binary=True)

In [24]:
#fit_transform user_review
count_vectorizer_ohe_train = count_vectorizer_ohe.fit_transform(train['text'])

### Building a Naive Bayes Model

In [25]:
# Naive Bayes Classifier
naive_bayes_classifier = BernoulliNB()

In [26]:
#create the naive bayes model for the train data
naive_bayes_classifier.fit(count_vectorizer_ohe_train, train['sentiment'])
naive_bayes_classifier.score(count_vectorizer_ohe_train, train['sentiment'])

0.6976346433770014

In [27]:
##create the naive bayes model for the validation data
count_vectorizer_ohe_test = count_vectorizer_ohe.transform(test['text'])
naive_bayes_classifier.score(count_vectorizer_ohe_test, test['sentiment'])

0.6844934917940011

### Count Vectorizer

In [28]:
# initialize count_vectorizer and name it count_vectorizer
count_vectorizer = CountVectorizer(min_df=0.001)

In [29]:
#fit_transform user_review
count_vectorizer_train = count_vectorizer.fit_transform(train['text'])

### Building a Naive Bayes Model using count vectorization

# Naive Bayes Classifier
naive_bayes_classifier = MultinomialNB()

In [30]:
#create the naive bayes model for the train data
naive_bayes_classifier.fit(count_vectorizer_train, train['sentiment'])
naive_bayes_classifier.score(count_vectorizer_train, train['sentiment'])

0.6976346433770014

In [32]:
##create the naive bayes model for the validation data
count_vectorizer_test = count_vectorizer.transform(test['sentiment'])
naive_bayes_classifier.score(count_vectorizer_test, test['sentiment'])

0.4046406338426712

### TF-IDF

In [33]:
# initialize tfifd vectorizer
tfidf_vectorizer = TfidfVectorizer(min_df=0.001)

In [34]:
#create the naive bayes model for the train data using tfidf
tfidf_vectorizer_train = tfidf_vectorizer.fit_transform(train['text'])
naive_bayes_classifier.fit(tfidf_vectorizer_train, train['sentiment'])
naive_bayes_classifier.score(tfidf_vectorizer_train, train['sentiment'])

0.6976346433770014

In [35]:
#create the naive bayes model for the validation data using tfidf
tfidf_vectorizer_test = tfidf_vectorizer.transform(train['text'])
naive_bayes_classifier.score(tfidf_vectorizer_test, train['sentiment'])

0.6976346433770014

### Using n-grams with Tfldf

In [36]:
tfidf_ngram_vectorizer = TfidfVectorizer(min_df=0.001, ngram_range=(1, 3))

### Building Naive Bayes Model

In [37]:
#create the naive bayes model for the train data using tfidf and ngram
tfidf_ngram_vectorizer_train = tfidf_ngram_vectorizer.fit_transform(train['text'])
naive_bayes_classifier.fit(tfidf_ngram_vectorizer_train, train['sentiment'])
naive_bayes_classifier.score(tfidf_ngram_vectorizer_train, train['sentiment'])

0.6915574963609898

In [38]:
tfidf_ngram_vectorizer.get_feature_names_out()[150:160]

array(['couple', 'course', 'cousin', 'cover', 'coz', 'crash', 'crazy',
       'cream', 'cross', 'cry'], dtype=object)

In [39]:
#create the naive bayes model for the validation data using tfidf and ngram
tfidf_ngram_vectorizer_test = tfidf_ngram_vectorizer.transform(test['text'])
naive_bayes_classifier.score(tfidf_ngram_vectorizer_test, test['sentiment'])

0.6760045274476514

In [40]:
count_ngram_vectorizer = CountVectorizer(min_df=0.001, ngram_range=(1, 3))

In [41]:
#create the naive bayes model for the train data using count vectorizer and ngram
count_ngram_vectorizer_train = count_ngram_vectorizer.fit_transform(train['text'])
naive_bayes_classifier.fit(count_ngram_vectorizer_train, train['sentiment'])
naive_bayes_classifier.score(count_ngram_vectorizer_train, train['sentiment'])

0.6915574963609898

In [42]:
#create the naive bayes model for the validation data using count vectorizer and ngram
count_ngram_vectorizer_test = count_ngram_vectorizer.transform(test['text'])
naive_bayes_classifier.score(count_ngram_vectorizer_test, test['sentiment'])

0.6760045274476514