Import libraries


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import keras
import tensorflow


load the data

In [3]:
data = pd.read_csv('/content/tweet_emotions.csv')
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


Data preprocesasing

In [4]:
data.describe()

Unnamed: 0,tweet_id
count,10989.0
mean,1960200000.0
std,2146952.0
min,1956967000.0
25%,1957617000.0
50%,1960871000.0
75%,1962104000.0
max,1963082000.0


In [5]:
#check missing values
data.isnull().sum()

Unnamed: 0,0
tweet_id,0
sentiment,0
content,0


In [6]:
data.columns

Index(['tweet_id', 'sentiment', 'content'], dtype='object')

In [7]:
data['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [8]:
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
worry,3413
sadness,2458
neutral,2053
surprise,606
hate,581
happiness,507
love,410
relief,249
fun,229
empty,218


In [9]:
#rename column content as tweet
data.rename(columns={'content':'tweet'},inplace=True)

In [10]:
data.head()

Unnamed: 0,tweet_id,sentiment,tweet
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [11]:
# label encoding for sentiment column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['sentiment_label'] = le.fit_transform(data['sentiment'])

cleaning text data

In [12]:
import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

stop_words = set(stopwords.words('english'))

def clean_tweet(tweet):
    # Handle potential float values (e.g., NaN)
    if isinstance(tweet, float):
        return '' # Or any other appropriate handling for float values
    #convert to lowercase
    tweet = tweet.lower()
    #remove urls
    tweet = re.sub(r'http\S+', '', tweet)
    #remove mentions
    tweet = re.sub(r'@\w+', '', tweet)
    #remove hashtags
    tweet = re.sub(r'#\w+', '', tweet)
    #remove numbers
    tweet = re.sub(r'\d+', '', tweet)
    #remove special characters
    tweet = re.sub(r'[^\w\s]', '', tweet)
    #remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    #remove stopwords
    tweet = ' '.join([word for word in tweet.split() if word not in stop_words])
    return tweet

data['cleaned_tweet'] = data['tweet'].apply(clean_tweet)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
data.head()

Unnamed: 0,tweet_id,sentiment,tweet,sentiment_label,cleaned_tweet
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...,2,know listenin bad habit earlier started freaki...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...,10,layin n bed headache ughhhhwaitin call
2,1956967696,sadness,Funeral ceremony...gloomy friday...,10,funeral ceremonygloomy friday
3,1956967789,enthusiasm,wants to hang out with friends SOON!,3,wants hang friends soon
4,1956968416,neutral,@dannycastillo We want to trade with someone w...,8,want trade someone houston tickets one


test train split


In [14]:
from sklearn.model_selection import train_test_split
x = data['cleaned_tweet']
y = data['sentiment_label']
#split dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)


text vectorization

In [15]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
#using tf and idf
tfidf_vectorizer = TfidfVectorizer()
x_train = tfidf_vectorizer.fit_transform(x_train)
x_test = tfidf_vectorizer.transform(x_test)

In [16]:
#using logistic regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train,y_train)


In [17]:
y_pred = model.predict(x_test)


In [18]:
# checking accuracy and confusion matrix
from sklearn.metrics import accuracy_score,confusion_matrix
print('accuracy: ',accuracy_score(y_test,y_pred))
print('confusion matrix:\n ',confusion_matrix(y_test,y_pred))

accuracy:  0.32150439793751895
confusion matrix:
  [[  0   0   0   0   0   0   0   0   3   0   1   0  16]
 [  0   0   0   0   0   0   0   0   4   0  11   0  13]
 [  0   0   0   0   0   1   0   0  18   0  14   0  33]
 [  0   0   0   0   0   0   0   0  13   0  10   0  23]
 [  0   0   0   0   0   0   0   0  13   0  18   0  38]
 [  0   0   0   0   0   2   1   1  17   0  38   0  76]
 [  0   0   0   0   0   1  12   0  14   0  42   0  96]
 [  0   0   0   0   0   1   1   5  23   0  39   0  67]
 [  0   0   0   0   0   3   2   1 144   0  99   1 352]
 [  0   0   0   0   0   0   0   0  12   0  24   0  45]
 [  0   0   0   0   0   1  13   3  63   0 238   0 420]
 [  0   0   0   0   0   1   2   0  34   0  39   4 116]
 [  0   0   0   0   0   4   8   0 115   0 233   0 655]]
