In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [3]:
df=pd.read_csv('one_output_dataset_new.csv')
df

Unnamed: 0,speech,emotion
0,From the moment that the French defenses at Se...,trust
1,"Five score years ago, a great American, in who...",anticipation
2,The media will not show the magnitude of this ...,anger
3,"Madam Vice President, my fellow Americans: to ...",disgust
4,"My German countrymen, men and women, (long pau...",anger
5,We observe today not a victory of party but a ...,joy
6,"Your Majesties, Your Highnesses, Distinguished...",trust
7,I am honored to be with you today at your comm...,anticipation
8,"Honorable UN Secretary General Mr Ban Ki-moon,...",sadness
9,It is with a profound sense of humility that I...,sadness


In [5]:
#preprocess transcript

# download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# initialize a PorterStemmer
stemmer = PorterStemmer()

def preprocess_text(text):
    # lowercase the text
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # tokenize the text
    words = word_tokenize(text)
    # remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stopwords.words('english')]
    # join the words back into a string
    text = ' '.join(words)

    return text

df['speech'] = df['speech'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /home/arjun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/arjun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df['speech'], df['emotion'], test_size=0.2, random_state=42)

In [8]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [9]:
text_clf.fit(X_train, y_train)

In [10]:
predicted = text_clf.predict(X_test)

In [11]:
predicted

array(['anticipation', 'anticipation', 'anticipation'], dtype='<U12')

In [12]:
from sklearn import metrics


In [13]:
accuracy = metrics.accuracy_score(y_test, predicted)
precision = metrics.precision_score(y_test, predicted, average='weighted', zero_division=0)
recall = metrics.recall_score(y_test, predicted, average='weighted', zero_division=0)
f1 = metrics.f1_score(y_test, predicted, average='weighted')

In [14]:
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.0
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
