Importing libraries

In [44]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to C:\Users\Aaghaaz
[nltk_data]     Khan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Loading and Displaying the dataset

In [45]:
# Load the tweet dataset from the CSV file, decoding it as Latin-1.
df = pd.read_csv("sentiment-analysis-data.csv", encoding="latin_1")
df.head()  # preview the first rows to check the columns loaded as expected

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [46]:
df.columns

Index(['textID', 'text', 'selected_text', 'sentiment', 'Time of Tweet',
       'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)',
       'Density (P/Km²)'],
      dtype='object')

Dropping the columns not required

In [47]:
# Collect every column except the tweet text and its sentiment label;
# these are the columns to discard before modelling.
remove_cols = [name for name in df.columns if name not in ("text", "sentiment")]

In [48]:
remove_cols

['textID',
 'selected_text',
 'Time of Tweet',
 'Age of User',
 'Country',
 'Population -2020',
 'Land Area (Km²)',
 'Density (P/Km²)']

In [49]:
# Rebind `df` to the trimmed frame instead of mutating it in place.
# errors="ignore" skips columns that are already gone, so re-running this
# cell is a no-op rather than a KeyError.
df = df.drop(columns=remove_cols, errors="ignore")
df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [50]:
df['sentiment'].unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [51]:
df['sentiment'].value_counts()

sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64

Encoding values of 'sentiment'

In [52]:
# Integer codes for the three sentiment classes.
sentiment_codes = {"negative": 0, "neutral": 1, "positive": 2}

# .map() turns any value missing from the dict into NaN, so applying it to an
# already-encoded (numeric) column would wipe out every label. Only encode
# while the column still holds the original string labels, which keeps this
# cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map(sentiment_codes)
df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",1
1,Sooo SAD I will miss you here in San Diego!!!,0
2,my boss is bullying me...,0
3,what interview! leave me alone,0
4,"Sons of ****, why couldn`t they put them on t...",0


In [53]:
df['sentiment'].unique()

array([1, 0, 2])

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       27480 non-null  object
 1   sentiment  27481 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 429.5+ KB


Dropping the row with a null value from the dataset

In [55]:
df.isna().sum()

text         1
sentiment    0
dtype: int64

In [56]:
df = df.dropna()

In [57]:
df.isna().sum()

text         0
sentiment    0
dtype: int64

In [58]:
df.shape

(27480, 2)

Writing a function to clean the text by removing stopwords, punctuation, and other special characters, as they are not needed to predict the sentiment of a text.
The function also tokenizes the text so the stopwords can be filtered out, then joins the remaining tokens back into a single string that we can later encode and pass to the model for training.

I will be using the 'stopwords' module provided by nltk to remove stopwords such as 'a', 'and', 'the', etc. from the text.

I will be using Python's regex library (`re`) to remove the special characters.

Link: https://regex101.com/

For tokenizing, I will simply be using
```
text.split()
```

In [59]:
# Read the corpus through `nltk.corpus` rather than the imported name
# `stopwords`: this assignment rebinds `stopwords` to a plain set, so calling
# `stopwords.words(...)` would raise AttributeError if the cell were run again.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [60]:
def clean_text(text):
    """Normalise a raw string before it is vectorised.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted, the result is split on whitespace, and tokens found
    in the module-level ``stopwords`` set are dropped. The surviving tokens
    are returned joined by single spaces, e.g. "sooo sad miss san diego".
    """
    lowered = text.lower()
    # Strip digits, punctuation and any other non-letter characters.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    # Whitespace tokenisation, then stopword filtering.
    kept_tokens = filter(lambda token: token not in stopwords, letters_only.split())
    return " ".join(kept_tokens)

In [61]:
df['clean_text'] = df['text'].apply(clean_text)

In [62]:
df.head()

Unnamed: 0,text,sentiment,clean_text
0,"I`d have responded, if I were going",1,id responded going
1,Sooo SAD I will miss you here in San Diego!!!,0,sooo sad miss san diego
2,my boss is bullying me...,0,boss bullying
3,what interview! leave me alone,0,interview leave alone
4,"Sons of ****, why couldn`t they put them on t...",0,sons couldnt put releases already bought


In [63]:
# Compare one tweet (index label 500) before and after cleaning.
print("Original Text:", df.loc[500, 'text'])
print("\nClean Text:", df.loc[500, 'clean_text'])

Original Text:  but my bday is JUNE 19.. this is wack... and ihavent seen any promotions for my bday party  someone better finagle this asap!

Clean Text: bday june wack ihavent seen promotions bday party someone better finagle asap


Train-Test split

In [64]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['sentiment'], test_size=0.2, random_state=42)

Encoding / Vectorizing the text before passing it to the model for training

In [65]:
encoder = TfidfVectorizer()

# Fit the encoder on the training texts only, then reuse the fitted encoder
# to transform the test texts.
# NOTE(review): X_train / X_test are overwritten with the encoded output here,
# so re-running this cell without first re-running the train-test split cell
# would pass already-encoded data back into the vectorizer.
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [66]:
model = LogisticRegression() # using Logistic Regression model

In [67]:
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [68]:
y_pred = model.predict(X_test)

Checking model performance on the held-out test set

In [69]:
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Accuracy Score: 0.6848617176128093


In [70]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.57      0.63      1572
           1       0.62      0.74      0.68      2236
           2       0.77      0.71      0.74      1688

    accuracy                           0.68      5496
   macro avg       0.70      0.68      0.68      5496
weighted avg       0.69      0.68      0.68      5496



Trying out custom texts to see how the model works on them

In [71]:
def predict_sentiment(texts):
    """Predict a sentiment label for each input text.

    Parameters
    ----------
    texts : str or iterable of str
        Raw, uncleaned text(s). A single string is treated as one document
        rather than being iterated character by character.

    Returns
    -------
    list of str
        "Negative", "Neutral" or "Positive" for each input, in input order.
        An empty input yields an empty list.
    """
    # A bare string is itself iterable; wrap it so it is scored as one text.
    if isinstance(texts, str):
        texts = [texts]

    # Apply the same cleaning step that was applied to the training data.
    cleaned_texts = [clean_text(text) for text in texts]
    # Nothing to score: return early instead of asking the model to predict
    # on zero samples.
    if not cleaned_texts:
        return []

    vectors = encoder.transform(cleaned_texts)
    results = model.predict(vectors)
    # Convert the numeric labels back to text
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}

    return [label_map[result] for result in results]

In [72]:
text1 = "Hey the food was so good!"
text2 = "I'm gonna kill you"
text3 = "Hey hii how are you doing"

In [73]:
texts = [text1, text2, text3]
predictions = predict_sentiment(texts)
# Show each input next to the label predicted for it.
for text, prediction in zip(texts, predictions):
    print(f"{text}: {prediction}\n")

Hey the food was so good!: Positive

I'm gonna kill you: Negative

Hey hii how are you doing: Neutral

