**Each entry in this dataset consists of a text segment representing a Twitter message and a corresponding label indicating the predominant emotion conveyed. The emotions are classified into six categories: sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5). Whether you're interested in sentiment analysis, emotion classification, or text mining, this dataset provides a rich foundation for exploring the nuanced emotional landscape within the realm of social media.**

# Potential Use Cases:
- **Sentiment Analysis**: Uncover the prevailing sentiments in English messages across various emotions.
- **Emotion Classification**: Develop models to accurately classify **sentences** into the six specified emotion categories.
- **Textual Analysis**: Explore linguistic patterns and expressions associated with different emotional states.

In [1]:
import re
import warnings

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")

In [2]:
# Read the raw emotion dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="text.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


# Text Cleaning 

In [3]:
# Remove the stray index column(s) that pandas names "Unnamed: ..." when a CSV
# was written together with its index. Selecting them by name, and rebinding
# `df` instead of using inplace=True, makes this cell idempotent: running it a
# second time is a no-op rather than dropping the `text` column.
unnamed_columns = [col for col in df.columns if str(col).startswith("Unnamed")]
df = df.drop(columns=unnamed_columns)
df.head()

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [4]:
# Initialize the Porter Stemmer used to reduce every token to its stem.
ps = PorterStemmer()

# English stop words, held in a set for O(1) membership tests.
# On a fresh environment the NLTK corpus may not be installed yet, in which
# case the first access raises LookupError; fetch it once and retry so the
# notebook survives "Restart Kernel -> Run All".
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

def process_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lowercased and split on whitespace, tokens found in the
    notebook-level ``stop_words`` set are discarded, and the remaining tokens
    are stemmed with the notebook-level stemmer ``ps``.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The kept stems joined by single spaces (empty if nothing is kept).
    """
    letters_only = re.sub('[^a-zA-Z]', " ", text).lower()

    kept_stems = []
    for token in letters_only.split():
        # Stop words are filtered before stemming, on the unstemmed token.
        if token in stop_words:
            continue
        kept_stems.append(ps.stem(token))

    return " ".join(kept_stems)

In [5]:
# Clean the messages in fixed-size batches and collect the results, in row
# order, into `corpus` (one processed string per row of `df`).
batch_size = 10000
corpus = []

for start in range(0, len(df), batch_size):
    # Slice only the `text` Series and transform it directly. The previous
    # version wrote a new column onto an `iloc` slice of `df`, which is a
    # chained assignment (SettingWithCopyWarning) and did needless work.
    batch_text = df['text'].iloc[start:start + batch_size]
    corpus.extend(batch_text.apply(process_text).tolist())

In [6]:
# Vectorization: turn each processed message into a bag-of-words count vector.
# NOTE(review): the vocabulary is learned from the whole corpus here, i.e.
# before the train/test split performed further down, so it also contains
# tokens that occur only in the rows later held out for testing.

Count_vector = CountVectorizer()
X = Count_vector.fit_transform(corpus)

In [7]:
# Target vector: the emotion label column, row-aligned with `corpus` / `X`
# because both were built from `df` in its original row order.
y = df['label']

In [8]:
# Train/test split: hold out 20% of the rows for evaluation; the fixed seed
# keeps the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

# Modelling

**Naive Bayes Classifier with default parameters**

In [9]:
# Baseline model: multinomial Naive Bayes with its default parameters.
Emotion_classifier_model = MultinomialNB()
Emotion_classifier_model.fit(X_train, y_train)

# Predict on both splits so the two scores can be compared for overfitting.
ypred_test = Emotion_classifier_model.predict(X_test)
ypred_train = Emotion_classifier_model.predict(X_train)

# Evaluation
print("Train Accuracy", accuracy_score(y_train, ypred_train))
# This line scores the held-out split, so it is labelled "Test"
# (it was previously printed as a second "Train Accuracy").
print("Test Accuracy", accuracy_score(y_test, ypred_test))

Train Accuracy 0.8609014326114794
Train Accuracy 0.8376358532664764


In [10]:
# Per-class precision / recall / F1 of the baseline model on the held-out split.
report = classification_report(y_true=y_test, y_pred=ypred_test)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.92      0.89     24199
           1       0.82      0.92      0.87     28216
           2       0.78      0.53      0.63      6900
           3       0.89      0.82      0.85     11434
           4       0.82      0.79      0.81      9639
           5       0.80      0.34      0.48      2974

    accuracy                           0.84     83362
   macro avg       0.83      0.72      0.75     83362
weighted avg       0.84      0.84      0.83     83362



In [11]:
# Hyperparameter tuning: search for the smoothing strength `alpha` of
# MultinomialNB that gives the best cross-validated score.

# Candidate values, compared by weighted F1 over 5 folds of the training
# split, using every available CPU core.
param_grid = {'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

grid_search.best_params_

{'alpha': 0.5}

# Modelling with best classifier

In [12]:
# Refit Naive Bayes with the parameters selected by the grid search. Reading
# them from `grid_search.best_params_` (instead of hard-coding alpha=.5)
# keeps this cell in sync if the search is re-run on different data or with
# a different grid.
Emotion_classifier_best = MultinomialNB(**grid_search.best_params_)
Emotion_classifier_best.fit(X_train, y_train)

# Predict on both splits so the two scores can be compared for overfitting.
ypred_test = Emotion_classifier_best.predict(X_test)
ypred_train = Emotion_classifier_best.predict(X_train)

# Evaluation
print("Train Accuracy", accuracy_score(y_train, ypred_train))
# This line scores the held-out split, so it is labelled "Test"
# (it was previously printed as a second "Train Accuracy").
print("Test Accuracy", accuracy_score(y_test, ypred_test))

Train Accuracy 0.8710349770728182
Train Accuracy 0.8405988340011036


In [13]:
# Try the tuned model on a sentence that is not part of the dataset.

# Ask the user for the sentence to classify. This blocks until text is
# entered, so the cell needs a person at the keyboard when the notebook runs.
new_sentence = input("Enter your new sentence here: ")

# Preprocess the new sentence
def preprocess_single_sentence(sentence):
    """Clean a single sentence exactly as the training corpus was cleaned.

    This delegates to ``process_text`` rather than repeating its steps, so the
    preprocessing applied at prediction time can never drift away from the one
    used to build the vectorizer's vocabulary.

    Parameters
    ----------
    sentence : str
        The raw sentence to classify.

    Returns
    -------
    str
        The cleaned, stop-word-filtered, stemmed sentence.
    """
    return process_text(sentence)

processed_sentence = preprocess_single_sentence(new_sentence)

# Encode the sentence with the already-fitted CountVectorizer; `transform`
# (not `fit_transform`) keeps the vocabulary the model was trained on.
sentence_vector = Count_vector.transform([processed_sentence])

# Predicted label, plus the per-class probabilities displayed in the next cell.
prediction = Emotion_classifier_best.predict(sentence_vector)
prediction_proba = Emotion_classifier_best.predict_proba(sentence_vector)

# Map the numeric label to its emotion name.
emotion_map = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}

predicted_emotion = emotion_map[prediction[0]]
# The value printed is the predicted emotion label, not the probabilities, so
# the message now says so (it previously read "Prediction probabilities ...").
print("Predicted emotion in your sentence:", predicted_emotion)

Enter your new sentence here:  Hello how are you 


Prediction probabilities of your emotion in sentance : sadness


In [14]:
# Per-class probabilities for the sentence classified above: one row for the
# single sentence, one column per emotion class.
prediction_proba

array([[0.39936273, 0.31773429, 0.03927524, 0.13960955, 0.06730878,
        0.03670941]])

In [15]:
# Gradient-boosted trees (XGBoost) on the same bag-of-words features, as a
# comparison with the Naive Bayes models. `XGBClassifier` and
# `cross_val_score` come from the notebook's import cell, and `accuracy_score`
# was already imported there, so no imports are repeated here.
XGB_Model = XGBClassifier(random_state=0)
XGB_Model.fit(X_train, y_train)

# Predictions on both splits. The `_AB` suffix is kept because the
# classification-report cell reads `ypred_test_AB`.
ypred_train_AB = XGB_Model.predict(X_train)
ypred_test_AB = XGB_Model.predict(X_test)

# Evaluation, with arguments in (y_true, y_pred) order like the other cells.
print("Train Accuracy:", accuracy_score(y_train, ypred_train_AB))
print("Test Accuracy:", accuracy_score(y_test, ypred_test_AB))
# This is a 5-fold weighted-F1 score over the full data (X, y), so it is not
# directly comparable with the two accuracies above.
print("Cross Validation Score:", cross_val_score(XGB_Model, X, y, cv=5, scoring='f1_weighted').mean())

Train Accuracy: 0.8841405080867424
Test Accuracy: 0.8757227513735275
Cross Validation Score: 0.8761642629560733


In [16]:
# Per-class precision / recall / F1 of the XGBoost model on the held-out split.
XGB_report = classification_report(y_true=y_test, y_pred=ypred_test_AB)
print("Classification Report:\n", XGB_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.90      0.92     24199
           1       0.89      0.90      0.90     28216
           2       0.77      0.76      0.76      6900
           3       0.89      0.87      0.88     11434
           4       0.85      0.83      0.84      9639
           5       0.64      0.92      0.76      2974

    accuracy                           0.88     83362
   macro avg       0.83      0.86      0.84     83362
weighted avg       0.88      0.88      0.88     83362



In [None]:
# New sentence to classify with XGB
new_sentence = input("Enter your new sentence here: ")

# Same cleaning as the training corpus.
processed_sentence = preprocess_single_sentence(new_sentence)

# Encode the sentence with the already-fitted CountVectorizer so that its
# columns line up with the features the model was trained on.
sentence_vector = Count_vector.transform([processed_sentence])

# Make the prediction with the trained XGBoost model.
prediction = XGB_Model.predict(sentence_vector)

# Translate the numeric label with the `emotion_map` defined in the Naive
# Bayes prediction cell (the same cell that defines
# `preprocess_single_sentence`) instead of repeating the dictionary here.
predicted_emotion = emotion_map[prediction[0]]
# The value printed is the predicted emotion label, not the probabilities, so
# the message now says so (it previously read "Prediction probabilities ...").
print("Predicted emotion in your sentence:", predicted_emotion)