In [25]:
import pandas as pd
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from nltk.tokenize import word_tokenize
 
DATA_FOLDER_PTH=os.path.join(os.getcwd(), os.pardir, 'data')
 
TRAIN_AUDIO_FOLDER_PTH=os.path.join(DATA_FOLDER_PTH, 'raw/MELD/train', 'train_splits')
TRAIN_TEXT_FILE_PTH=os.path.join(DATA_FOLDER_PTH, 'raw/MELD/train', 'train_sent_emo.csv')
 
DEV_AUDIO_FOLDER_PTH=os.path.join(DATA_FOLDER_PTH, 'raw/MELD/dev', 'dev_splits_complete')
DEV_TEXT_FILE_PTH=os.path.join(DATA_FOLDER_PTH, 'raw/MELD/dev', 'dev_sent_emo.csv')
 
TEST_AUDIO_FOLDER_PTH=os.path.join(DATA_FOLDER_PTH, 'raw/MELD/test', 'output_repeated_splits_test')
TEST_TEXT_FILE_PTH=os.path.join(DATA_FOLDER_PTH, 'raw/MELD/test', 'test_sent_emo.csv')

In [26]:
data = pd.read_csv(TRAIN_TEXT_FILE_PTH, encoding='utf-8')

In [27]:
data.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,also I was the point person on my company’s tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731"
1,2,You must’ve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442"
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389"
3,4,So let’s talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572"
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917"


In [28]:
data.shape

(9989, 11)

In [29]:
corpus = data['Utterance']

vectorizer = TfidfVectorizer(ngram_range = (1, 5), max_features = 1000, lowercase = True, tokenizer = word_tokenize) # todo punctuation
X = vectorizer.fit_transform(corpus)

vectorizer.get_feature_names()

['!',
 '! !',
 '! ! !',
 '! ! ! !',
 '! and',
 '! how',
 '! i',
 '! i ’',
 '! i ’ m',
 '! it',
 '! no',
 '! no !',
 '! oh',
 '! okay',
 '! that',
 '! this',
 '! we',
 '! what',
 '! you',
 "'",
 "''",
 "'d",
 "'ll",
 "'m",
 "'m gon",
 "'m gon na",
 "'m not",
 "'m sorry",
 "'re",
 "'s",
 "'s a",
 "'s just",
 "'s not",
 "'s the",
 "'ve",
 ',',
 ', ``',
 ', a',
 ', all',
 ', and',
 ', and i',
 ', are',
 ', are you',
 ', because',
 ', but',
 ', but i',
 ', can',
 ', come',
 ', come on',
 ', do',
 ', do you',
 ', don',
 ', don ’',
 ', don ’ t',
 ', he',
 ', he ’',
 ', here',
 ', hey',
 ', how',
 ', huh',
 ', huh ?',
 ', i',
 ", i 'm",
 ', i can',
 ', i don',
 ', i don ’',
 ', i don ’ t',
 ', i have',
 ', i just',
 ', i know',
 ', i mean',
 ', i think',
 ', i was',
 ', i ’',
 ', i ’ ll',
 ', i ’ m',
 ', i ’ m sorry',
 ', i-i',
 ', if',
 ', if you',
 ', is',
 ', it',
 ", it 's",
 ', it was',
 ', it ’',
 ', it ’ s',
 ', just',
 ', let',
 ', like',
 ', listen',
 ', look',
 ', maybe',
 ', my',
 '

In [30]:
X.shape

(9989, 1000)

In [31]:
Y_emotion = data[['Emotion']]
Y_emotion

Unnamed: 0,Emotion
0,neutral
1,neutral
2,neutral
3,neutral
4,surprise
...,...
9984,neutral
9985,neutral
9986,surprise
9987,neutral


In [32]:
enc = OrdinalEncoder()
Y_emotion = enc.fit_transform(Y_emotion).ravel()
Y_emotion

array([4., 4., 4., ..., 6., 4., 3.])

In [33]:
Y_sentiment = data[['Sentiment']]
Y_sentiment

Unnamed: 0,Sentiment
0,neutral
1,neutral
2,neutral
3,neutral
4,positive
...,...
9984,neutral
9985,neutral
9986,positive
9987,neutral


In [34]:
enc = OrdinalEncoder()
Y_sentiment = enc.fit_transform(Y_sentiment).ravel()
Y_sentiment

array([1., 1., 1., ..., 2., 1., 2.])

## Train logistic regression for sentiment

In [35]:
logregr_sentiment = LogisticRegression(class_weight='balanced', max_iter=1000)
logregr_sentiment.fit(X, Y_sentiment)

predictions = logregr_sentiment.predict(X)

print(metrics.accuracy_score(predictions, Y_sentiment))

0.693062368605466


## Train logistic regression for emotion

In [36]:
logregr_emotion = LogisticRegression(class_weight='balanced', max_iter=1000)
logregr_emotion.fit(X, Y_emotion)

predictions = logregr_emotion.predict(X)

print(metrics.accuracy_score(predictions, Y_emotion))

0.5419961958153969


# Validation

In [37]:
validation = pd.read_csv(DEV_TEXT_FILE_PTH, encoding='utf-8')
validation.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,"Oh my God, he’s lost it. He’s totally lost it.",Phoebe,sadness,negative,0,0,4,7,"00:20:57,256","00:21:00,049"
1,2,What?,Monica,surprise,negative,0,1,4,7,"00:21:01,927","00:21:03,261"
2,3,"Or! Or, we could go to the bank, close our acc...",Ross,neutral,neutral,1,0,4,4,"00:12:24,660","00:12:30,915"
3,4,You’re a genius!,Chandler,joy,positive,1,1,4,4,"00:12:32,334","00:12:33,960"
4,5,"Aww, man, now we won’t be bank buddies!",Joey,sadness,negative,1,2,4,4,"00:12:34,211","00:12:37,505"


In [38]:
corpus = validation['Utterance']

X = vectorizer.transform(corpus)

In [39]:
Y_emotion = validation[['Emotion']]

enc = OrdinalEncoder()
Y_emotion = enc.fit_transform(Y_emotion).ravel()
labels_emotion = enc.categories_

In [40]:
Y_sentiment = validation[['Sentiment']]

enc = OrdinalEncoder()
Y_sentiment = enc.fit_transform(Y_sentiment).ravel()
labels_sentiment = enc.categories_

## Sentiment test

In [41]:
predictions = logregr_sentiment.predict(X)


print(metrics.classification_report(Y_sentiment, predictions, target_names = labels_sentiment[0]))

              precision    recall  f1-score   support

    negative       0.57      0.48      0.52       406
     neutral       0.67      0.73      0.70       470
    positive       0.48      0.52      0.50       233

    accuracy                           0.60      1109
   macro avg       0.57      0.58      0.57      1109
weighted avg       0.59      0.60      0.59      1109



In [42]:
pd.DataFrame(metrics.confusion_matrix(Y_sentiment, predictions), index = labels_sentiment, columns = labels_sentiment)

Unnamed: 0,negative,neutral,positive
negative,196,119,91
neutral,89,342,39
positive,58,53,122


## Emotion test

In [43]:
predictions = logregr_emotion.predict(X)

print(predictions)
print(Y_emotion)

print(metrics.classification_report(Y_emotion, predictions, target_names = labels_emotion[0]))

[5. 6. 0. ... 0. 3. 3.]
[5. 6. 4. ... 5. 5. 5.]
              precision    recall  f1-score   support

       anger       0.33      0.27      0.30       153
     disgust       0.02      0.09      0.03        22
        fear       0.07      0.17      0.10        40
         joy       0.49      0.46      0.48       163
     neutral       0.72      0.51      0.60       470
     sadness       0.25      0.28      0.26       111
    surprise       0.52      0.57      0.54       150

    accuracy                           0.44      1109
   macro avg       0.34      0.34      0.33      1109
weighted avg       0.52      0.44      0.47      1109



In [44]:
pd.DataFrame(metrics.confusion_matrix(Y_emotion, predictions), index = labels_emotion, columns = labels_emotion)

Unnamed: 0,anger,disgust,fear,joy,neutral,sadness,surprise
anger,42,15,13,20,32,16,15
disgust,6,2,3,1,4,3,3
fear,10,6,7,5,4,5,3
joy,17,18,8,75,27,7,11
neutral,35,29,39,27,242,57,41
sadness,7,19,22,8,17,31,7
surprise,12,12,7,16,11,7,85
