# <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Import Needed Libraries**</p>

pip install spacy

python -m spacy download en_core_web_sm

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import spacy

# <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Exploratory Data Analysis EDA**</p>

In [None]:
# Load the emotion dataset (path is relative to the working directory) into `df`
df = pd.read_csv(filepath_or_buffer="Emotion_classify_Data.csv")

# Report (rows, columns) so the size of the data is visible up front
print(df.shape)

# Preview the first five rows (5 is head()'s default)
df.head()

In [None]:
# Class balance: number of rows per emotion label
df.loc[:, 'Emotion'].value_counts()

In [None]:
# Peek at the first row, formatted as "comment -> emotion"
print(f"{df.loc[0, 'Comment']} -> {df.loc[0, 'Emotion']}")

# <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Preprocessing**</p>

In [None]:
# Build the spaCy pipeline object `nlp` from the small English model
nlp = spacy.load(name="en_core_web_sm")

In [None]:
# Pick one comment (row label 3) as the running example for the cells below
txt = df.loc[3, 'Comment']
txt

In [None]:
# Tokenization: calling the pipeline on the raw string produces `doc`,
# which the following cells iterate over token by token.
doc = nlp(txt)

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Sentence Tokenization**</p>
**We skip this step because the data in the dataframe is already split into sentences.**

In [None]:
# Intentionally disabled (kept for reference only): would print each sentence of `doc`.
# for sentence in doc.sents:
#     print(sentence)

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Word Tokenization**</p>

In [None]:
# Show the text of every token in the example, one per line
for token in doc:
    print(token.text)

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Stemming and Lemmatization**</p>

In [None]:
# Lemmatization demo: print each token next to its lemma (`token.lemma_`).
# NOTE(review): only lemmas are shown here; this cell performs no stemming.
for token in doc:
    print(f"Word: {token} | -> {token.lemma_}")

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Stop Words**</p>

In [None]:
# List only the tokens flagged as stop words or punctuation
for token in doc:
    if not token.is_stop and not token.is_punct:
        continue
    print(token)

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Preprocess Function**</p>

In [None]:
# Helper that turns a raw comment into the cleaned text used for vectorization
def preprocess(text):
    """Return `text` with stop words and punctuation dropped and the rest lemmatized.

    The text is run through the notebook-level spaCy pipeline `nlp`; the lemmas
    of the surviving tokens are joined with single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [None]:
# Show the example comment before and after preprocessing, one per line
procces_txt = preprocess(txt)
print(txt, procces_txt, sep="\n")

 ## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Apply preprocess function on dataframe**</p>

In [None]:
# Clean every comment with `preprocess` (one spaCy call per row) and keep the
# result in a new column next to the raw text
df['preprocessed_comment'] = df['Comment'].map(preprocess)

In [None]:
# Inspect the new `preprocessed_comment` column on a few rows only,
# rather than rendering the whole dataframe.
df.head(5)

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Encoding target column**</p>

In [None]:
# Encode the emotion labels as integers for the classifiers.
df['Emotion_num'] = df['Emotion'].map({'joy' : 0, 'fear': 1, 'anger': 2})

# Series.map yields NaN for any label missing from the mapping; fail here with a
# clear message instead of letting NaN targets reach the split / model cells.
unmapped_labels = df.loc[df['Emotion_num'].isna(), 'Emotion'].unique()
assert len(unmapped_labels) == 0, f"Unmapped emotion labels: {list(unmapped_labels)}"

df.head(5)

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Split data into train and test**</p>

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on the encoded label keeps
# the class proportions equal in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['preprocessed_comment'],
    df['Emotion_num'],
    test_size=0.2,
    random_state=42,
    stratify=df['Emotion_num'],
)

In [None]:
# Confirm the sizes of the train and test feature splits
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Convert text column to numeric vector**</p>

In [None]:
v = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training text only, then reuse
# them for the test text so no test information leaks into the features.
X_train_cv = v.fit_transform(X_train)
X_test_cv = v.transform(X_test)

# Summarise the learned vocabulary; printing the full term -> index dict would
# flood the cell output with one entry per term.
print("Vocabulary size:", len(v.vocabulary_))
print("Sample terms:", sorted(v.vocabulary_)[:10])

# <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Machine Learning Model**</p>

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">1.**Naive Bayes**</p>

In [None]:
# Multinomial Naive Bayes classifier on the TF-IDF features
NB_model = MultinomialNB()

# Fit on the training split
NB_model.fit(X=X_train_cv, y=y_train)

In [None]:
# Predict the encoded emotion for every held-out comment
y_pred = NB_model.predict(X=X_test_cv)

In [None]:
# Fraction of test comments classified correctly by Naive Bayes
print(accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Per-class precision, recall and F1 for Naive Bayes on the test split
print(classification_report(y_true=y_test, y_pred=y_pred))

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">2.**Random Forest**</p>

In [None]:
# Seed the forest so its randomised training, and therefore the reported
# scores, are identical on every Restart & Run All (same seed as the split).
RFC_model = RandomForestClassifier(random_state=42)

# Model training
RFC_model.fit(X_train_cv, y_train)

In [None]:
# Predict with the random forest; this replaces the Naive Bayes predictions held in y_pred
y_pred = RFC_model.predict(X=X_test_cv)

In [None]:
# Fraction of test comments classified correctly by the random forest
print(accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Per-class precision, recall and F1 for the random forest on the test split
print(classification_report(y_true=y_test, y_pred=y_pred))

# <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Test Model**</p>

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Get text**</p>

In [None]:
# Take one raw comment (row label 2000) to run through the full pipeline.
# NOTE(review): the row is taken from the full dataframe, so it may have been
# part of the training split rather than unseen data.
test_text = df.loc[2000, 'Comment']
test_text

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Apply preprocess**</p>

In [None]:
# Clean the comment with the same `preprocess` helper used on the dataframe,
# wrapped in a one-element list before it is passed to the vectorizer.
test_text_processed = [preprocess(test_text)]
test_text_processed

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Convert to vector**</p>

In [None]:
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# features line up with what the models were trained on.
test_text_vc = v.transform(test_text_processed)

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Get Prediction**</p>

In [None]:
test_text = RFC_model.predict(test_text_vc)

## <p style="font-family:JetBrains Mono; font-weight:normal; letter-spacing: 1px; color:#207d06; font-size:100%; text-align:left;padding: 0px; border-bottom: 3px solid #207d06;">**Output**</p>

In [None]:
print(f"{df['Emotion'][2000]} -> {df['Emotion_num'][2000]}")
print(test_text)