# Importing and preprocessing


In [59]:
import json
import pandas as pd

# Load JSON data
with open('friends_train.json') as f:
    data = json.load(f)


# Create dataframes for each list in data
dfs = [pd.DataFrame(lst) for lst in data]

# Concatenate dataframes
df = pd.concat(dfs, ignore_index=True)
import warnings
warnings.filterwarnings('ignore')


In [60]:
with open('friends_test.json') as f:
    data = json.load(f)


# Create dataframes for each list in data
dfs = [pd.DataFrame(lst) for lst in data]

# Concatenate dataframes
test = pd.concat(dfs, ignore_index=True)


In [61]:
df.head(5)

Unnamed: 0,speaker,utterance,emotion,annotation
0,Chandler,also I was the point person on my companys tr...,neutral,4100000
1,The Interviewer,You mustve had your hands full.,neutral,5000000
2,Chandler,That I did. That I did.,neutral,5000000
3,The Interviewer,So lets talk a little bit about your duties.,neutral,5000000
4,Chandler,My duties? All right.,surprise,2000030


In [62]:
df.emotion.unique()

array(['neutral', 'surprise', 'fear', 'non-neutral', 'joy', 'sadness',
       'anger', 'disgust'], dtype=object)

In [63]:
df['emotion'].replace(["joy","surprise"],"positive",inplace=True)
df['emotion'].replace(["fear","sadness","anger","disgust"],"negative",inplace=True)
df = df.drop(df[df['emotion'] == 'non-neutral'].index)


In [64]:
test['emotion'].replace(["joy","surprise"],"positive",inplace=True)
test['emotion'].replace(["fear","sadness","anger","disgust"],"negative",inplace=True)
test = test.drop(test[test['emotion'] == 'non-neutral'].index)

In [65]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove usernames and hashtags
    text = re.sub(r'@\S+|#\S+', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join the tokens back into text
    text = ' '.join(tokens)
    
    return text

# NLTK default Sentiment analyzer

In [66]:
import nltk
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
def get_sentiment(text):
    scores = sia.polarity_scores(text)
    if scores['compound'] > 0:
        return 'positive'
    elif scores['compound'] < 0:
        return 'negative'
    else:
        return 'neutral'
test["utterance"] = test["utterance"].apply(preprocess_text)
test['sentiment'] = test['utterance'].apply(get_sentiment)
accuracy = (test['sentiment'] == test['emotion']).mean()
print(f'Sentiment analysis accuracy: {accuracy:.2%}')



Sentiment analysis accuracy: 45.97%


In [67]:
df.emotion.unique()

array(['neutral', 'positive', 'negative'], dtype=object)

# ML Algos (Multinomial NB , Logistic Regression and XGboost )

In [68]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


# Preprocess data
df['utterance'] = df['utterance'].apply(preprocess_text)
df['emotion'] = df['emotion'].astype('category')

# Split dataset into training and testing sets
X = df['utterance']
y = df['emotion']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define models
models = [('MultinomialNB', MultinomialNB()), 
          ('LogisticRegression', LogisticRegression()), 
          ('XGBClassifier', XGBClassifier())]
# Define n-gram ranges to try
ngram_ranges = [(1, 1), (1, 2), (1, 3)]


# Train and evaluate models
# Preprocess test data
test['utterance'] = test['utterance'].apply(preprocess_text)
test['emotion'] = test['emotion'].astype('category')

# Evaluate models on test data
for name, model in models:
    for ngram_range in ngram_ranges:
        print(f'Training {name} with {ngram_range} n-grams...')
        # Define pipeline
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(ngram_range=ngram_range)),
            ('clf', model)
        ])
        # Fit pipeline on training data
        pipeline.fit(X_train, y_train)
        # Predict on test data
        y_pred = pipeline.predict(test['utterance'])
        # Calculate accuracy and confusion matrix
        acc = accuracy_score(test['emotion'], y_pred)
        cm = confusion_matrix(test['emotion'], y_pred)
        # Print results
        print(f'{name} Accuracy: {acc}')
        print(f'{name} Confusion Matrix:\n{cm}\n')




Training MultinomialNB with (1, 1) n-grams...
MultinomialNB Accuracy: 0.6230319388214125
MultinomialNB Confusion Matrix:
[[   6  318   22]
 [   1 1252   34]
 [   1  462  127]]

Training MultinomialNB with (1, 2) n-grams...
MultinomialNB Accuracy: 0.6158344579397211
MultinomialNB Confusion Matrix:
[[   3  332   11]
 [   1 1272   14]
 [   0  496   94]]

Training MultinomialNB with (1, 3) n-grams...
MultinomialNB Accuracy: 0.6122357174988754
MultinomialNB Confusion Matrix:
[[   3  332   11]
 [   1 1275   11]
 [   0  507   83]]

Training LogisticRegression with (1, 1) n-grams...
LogisticRegression Accuracy: 0.631578947368421
LogisticRegression Confusion Matrix:
[[  42  265   39]
 [  33 1133  121]
 [   9  352  229]]

Training LogisticRegression with (1, 2) n-grams...
LogisticRegression Accuracy: 0.631578947368421
LogisticRegression Confusion Matrix:
[[  40  277   29]
 [  32 1146  109]
 [   6  366  218]]

Training LogisticRegression with (1, 3) n-grams...
LogisticRegression Accuracy: 0.63202

# Neural Network

In [70]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode the target variable
le = LabelEncoder()
df['emotion'] = le.fit_transform(df['emotion'])
test['emotion'] = le.fit_transform(test['emotion'])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['utterance'], df['emotion'], test_size=0.2, random_state=42)

# Tokenize the text data
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(test["utterance"])
y_test = test["emotion"]

# Pad the sequences
max_len = 100
X_train_pad = tf.keras.preprocessing.sequence.pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = tf.keras.preprocessing.sequence.pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64, input_length=max_len),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(4, activation='softmax')
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_pad, y_train, epochs=20, batch_size=32, validation_split=0.2)

# Evaluate the model on test set
test_loss, test_acc = model.evaluate(X_test_pad, y_test, verbose=2)
print('Test Accuracy:', test_acc)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
70/70 - 0s - loss: 2.7878 - accuracy: 0.5753 - 446ms/epoch - 6ms/step
Test Accuracy: 0.5753486156463623
