## Term Frequency - Inverse Document Frequency (TF-IDF)

#### Import Libraries

In [1]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV

#### Example

In [3]:
text = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

In [4]:
# Initiliaze the vectorizer
vectorizer = TfidfVectorizer()
output = vectorizer.fit_transform(text)
print(vectorizer.vocabulary_)


{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 26, 'tesla': 24, 'model': 19, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'grapes': 13}


In [152]:
# Print the idf of each word:
all_feature_names = vectorizer.get_feature_names_out()

for word in all_feature_names:
    
    # Get the index in the vocabulary
    indx = vectorizer.vocabulary_.get(word)
    
    # Get the score
    idf_score = vectorizer.idf_[indx]

    print(f"{word} : {idf_score}")

already : 2.386294361119891
am : 2.386294361119891
amazon : 2.386294361119891
and : 2.386294361119891
announcing : 1.2876820724517808
apple : 2.386294361119891
are : 2.386294361119891
ate : 2.386294361119891
biryani : 2.386294361119891
dot : 2.386294361119891
eating : 1.9808292530117262
eco : 2.386294361119891
google : 2.386294361119891
grapes : 2.386294361119891
iphone : 2.386294361119891
ironman : 2.386294361119891
is : 1.1335313926245225
loki : 2.386294361119891
microsoft : 2.386294361119891
model : 2.386294361119891
new : 1.2876820724517808
pixel : 2.386294361119891
pizza : 2.386294361119891
surface : 2.386294361119891
tesla : 2.386294361119891
thor : 2.386294361119891
tomorrow : 1.2876820724517808
you : 2.386294361119891


In [153]:
output.toarray()[0]

array([0.24266547, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.24266547, 0.        , 0.        ,
       0.40286636, 0.        , 0.        , 0.        , 0.        ,
       0.24266547, 0.11527033, 0.24266547, 0.        , 0.        ,
       0.        , 0.        , 0.72799642, 0.        , 0.        ,
       0.24266547, 0.        , 0.        ])

#### Emotion Classification

Link: https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp/data

In [2]:
train_dataframe = pd.read_csv(r"emotion_dataset\train.txt", header=None, names=["Text", "Emotion"], sep=";")
val_dataframe = pd.read_csv(r"emotion_dataset\val.txt", header=None, names=["Text", "Emotion"], sep=";")
test_dataframe = pd.read_csv(r"emotion_dataset\test.txt", header=None, names=["Text", "Emotion"], sep=";")
train_dataframe.head(10)

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
5,ive been feeling a little burdened lately wasn...,sadness
6,ive been taking or milligrams or times recomme...,surprise
7,i feel as confused about life as a teenager or...,fear
8,i have been with petronas for years i feel tha...,joy
9,i feel romantic too,love


### Preprocessing

"Love" and "surprise" appear to be underrepresented, most likely as a result of little data. Eliminating these feelings might improve the model's performance.

In [3]:
# Remove desired labels from dataset
train_dataframe = train_dataframe.drop(train_dataframe[(train_dataframe['Emotion'] == 'surprise') | (train_dataframe['Emotion'] == 'love')].index)
val_dataframe = val_dataframe.drop(val_dataframe[(val_dataframe['Emotion'] == 'surprise') | (val_dataframe['Emotion'] == 'love')].index)
test_dataframe = test_dataframe.drop(test_dataframe[(test_dataframe['Emotion'] == 'surprise') | (test_dataframe['Emotion'] == 'love')].index)
print("---TRAIN---")
print(train_dataframe['Emotion'].value_counts())
print("Train Shape: ", train_dataframe.shape)
print("---VALIDATION---")
print(val_dataframe['Emotion'].value_counts())
print("Validation Shape: ", val_dataframe.shape)
print("---TEST---")
print(test_dataframe['Emotion'].value_counts())
print("Test Shape: ", test_dataframe.shape)

---TRAIN---
Emotion
joy        5362
sadness    4666
anger      2159
fear       1937
Name: count, dtype: int64
Train Shape:  (14124, 2)
---VALIDATION---
Emotion
joy        704
sadness    550
anger      275
fear       212
Name: count, dtype: int64
Validation Shape:  (1741, 2)
---TEST---
Emotion
joy        695
sadness    581
anger      275
fear       224
Name: count, dtype: int64
Test Shape:  (1775, 2)


In [4]:
# Print the appearance percentage of the each label
print('---TRAIN---')
percentage_appearance = train_dataframe['Emotion'].value_counts(normalize=True) * 100
print(percentage_appearance)
print('-----------------------------------------------------------------------')
print('---VALIDATION---')
percentage_appearance = val_dataframe['Emotion'].value_counts(normalize=True) * 100
print(percentage_appearance)
print('-----------------------------------------------------------------------')
print('---TEST---')
percentage_appearance = test_dataframe['Emotion'].value_counts(normalize=True) * 100
print(percentage_appearance)

---TRAIN---
Emotion
joy        37.963750
sadness    33.035967
anger      15.286038
fear       13.714245
Name: proportion, dtype: float64
-----------------------------------------------------------------------
---VALIDATION---
Emotion
joy        40.436531
sadness    31.591040
anger      15.795520
fear       12.176910
Name: proportion, dtype: float64
-----------------------------------------------------------------------
---TEST---
Emotion
joy        39.154930
sadness    32.732394
anger      15.492958
fear       12.619718
Name: proportion, dtype: float64


In [5]:
### TRAIN ###
# Get unique values
unique_labels = train_dataframe.Emotion.unique()
numerical_labels = [x for x in range(len(unique_labels))]

# Convert text category into numerical category
numerical_target_dict = dict(zip(unique_labels, numerical_labels))

# Add the new numerical label into the dataframe
train_dataframe['Emotion_Labels'] = train_dataframe['Emotion'].map(numerical_target_dict)

### VALIDATION ###
# Get unique values
unique_labels = val_dataframe.Emotion.unique()
numerical_labels = [x for x in range(len(unique_labels))]

# Convert text category into numerical category
numerical_target_dict = dict(zip(unique_labels, numerical_labels))

# Add the new numerical label into the dataframe
val_dataframe['Emotion_Labels'] = val_dataframe['Emotion'].map(numerical_target_dict)

### TEST ###
# Get unique values
unique_labels = test_dataframe.Emotion.unique()
numerical_labels = [x for x in range(len(unique_labels))]

# Convert text category into numerical category
numerical_target_dict = dict(zip(unique_labels, numerical_labels))

# Add the new numerical label into the dataframe
test_dataframe['Emotion_Labels'] = test_dataframe['Emotion'].map(numerical_target_dict)

In [6]:
# Concatenate train and validation dataset
dataframe_merged = pd.concat([train_dataframe, val_dataframe], axis=0, ignore_index=True)
dataframe_merged.head(10)
print("---TRAIN---")
print(dataframe_merged.shape)
print(dataframe_merged['Emotion'].value_counts())

---TRAIN---
(15865, 3)
Emotion
joy        6066
sadness    5216
anger      2434
fear       2149
Name: count, dtype: int64


#### Undersampling

In [7]:
# Create empty dataframe
df_resampled = pd.DataFrame()
# Set minimum number of sample
min_samples = 2149 


for label in unique_labels:
    df_new = dataframe_merged[dataframe_merged.Emotion==label].sample(min_samples, random_state=319)
    df_resampled = pd.concat([df_resampled,df_new],axis=0)

df_resampled.head(5)

Unnamed: 0,Text,Emotion,Emotion_Labels
12696,i m feeling miserable serioulsy,sadness,0
2479,i knew i was shaking for many reasons a big on...,sadness,0
11997,ive been feeling so listless lately,sadness,0
3847,i feel sorry for the rest of us in second life...,sadness,0
9437,i am going to stop feeling sorry for myself,sadness,0


In [8]:
class TextPreprocessing(BaseEstimator, TransformerMixin):
    def __init__(self, nlp, stop_words):
        self.nlp = nlp
        self.stop_words = stop_words

    def apply_join(self, text):
        return " ".join(text)
    
    def remove_non_alphabetic(self, text):
        """Removes non-alphabetic characters"""
        doc = self.nlp(text)
        text = [token.text for token in doc if token.is_alpha]
        return text

    def lemmatization(self, text):
        """Reduces words to their dictionary form"""
        doc = self.nlp(text)
        text = [token.lemma_ for token in doc]
        return text

    
    def remove_stop_words(self, text):
        """Remove the stop words from text"""
        text = [word for word in text if word not in self.stop_words]
        return text
        
    def text_preprocess(self, text):
        text = self.remove_non_alphabetic(text)
        text = self.apply_join(text)
        text = self.lemmatization(text)
        text = self.remove_stop_words(text)
        text = self.apply_join(text)
        return text
    
    def transform(self, X):
        return [self.text_preprocess(text) for text in X]
    
    def fit(self, X, y=None):
        return self

In [9]:
# Initliaze the nlp model
nlp = spacy.load("en_core_web_sm")

# Get the stop words list
stop_words = nlp.Defaults.stop_words

# Initiliaze the class
text_preprocessing = TextPreprocessing(nlp, stop_words)

# Apply the text preprocessing to eleminate  the redundant words
dataframe_merged["Text_Adjusted"] = dataframe_merged["Text"].apply(text_preprocessing.text_preprocess)
df_resampled["Text_Adjusted"] = df_resampled["Text"].apply(text_preprocessing.text_preprocess)
test_dataframe["Text_Adjusted"] = test_dataframe["Text"].apply(text_preprocessing.text_preprocess)

In [10]:
df_resampled.head(5)

Unnamed: 0,Text,Emotion,Emotion_Labels,Text_Adjusted
12696,i m feeling miserable serioulsy,sadness,0,I m feel miserable serioulsy
2479,i knew i was shaking for many reasons a big on...,sadness,0,I know I shake reason big cyst drama start I c...
11997,ive been feeling so listless lately,sadness,0,I ve feel listless lately
3847,i feel sorry for the rest of us in second life...,sadness,0,I feel sorry rest second life understand suppo...
9437,i am going to stop feeling sorry for myself,sadness,0,I stop feel sorry


#### Trainning wo/Sampling

In [11]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(dataframe_merged["Text_Adjusted"], dataframe_merged['Emotion_Labels'], test_size=0.1, random_state=331, stratify=dataframe_merged.Emotion_Labels)

# svm = SVC()

emotion_pipeline = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        ("clf", SVC()),
    ]
)

parameters = {
    'tfidf__ngram_range': ((1, 1), (1, 2), (1,3)),
    'clf__C': (0.01, 0.05, 0.1, 1, 10, 100),
    'clf__kernel': ('linear', 'poly', 'rbf'),
    'clf__gamma': (0.001, 0.01, 0.1, 1.0, 10.0)
}

grid_search = GridSearchCV(emotion_pipeline, parameters, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

In [12]:
# Print best parameters and score
print(grid_search.best_params_)
print(grid_search.best_score_)

{'clf__C': 1, 'clf__gamma': 0.001, 'clf__kernel': 'linear', 'tfidf__ngram_range': (1, 2)}
0.8703594117156495


In [13]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(dataframe_merged["Text_Adjusted"], dataframe_merged['Emotion_Labels'], test_size=0.1, random_state=331, stratify=dataframe_merged.Emotion_Labels)

svm = SVC(kernel="linear",gamma=0.001, C=1, random_state=331)

pipe = Pipeline(
    [
        ("vectorizer", TfidfVectorizer(ngram_range=(1,2))),
        ("svm", svm),
    ]
)

pipe.fit(X_train, y_train) 

In [14]:
test_pred = pipe.predict(X_test)

#Print the classfication report
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

           0       0.89      0.94      0.92       522
           1       0.89      0.88      0.89       243
           2       0.87      0.57      0.69       264
           3       0.83      0.92      0.87       558

    accuracy                           0.86      1587
   macro avg       0.87      0.83      0.84      1587
weighted avg       0.87      0.86      0.86      1587



#### Training w/Undersampling

In [16]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df_resampled["Text_Adjusted"], df_resampled['Emotion_Labels'], test_size=0.1, random_state=331, stratify=df_resampled.Emotion_Labels)

# svm = SVC()

emotion_pipeline = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        ("clf", SVC()),
    ]
)

parameters = {
    'tfidf__ngram_range': ((1, 1), (1, 2), (1,3)),
    'clf__C': (0.01, 0.05, 0.1, 1, 10, 100),
    'clf__kernel': ('linear', 'poly', 'rbf'),
    'clf__gamma': (0.001, 0.01, 0.1, 1.0, 10.0)
}


grid_search = GridSearchCV(emotion_pipeline, parameters, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

In [17]:
# Print best parameters and score
print(grid_search.best_params_)
print(grid_search.best_score_)

{'clf__C': 1, 'clf__gamma': 0.001, 'clf__kernel': 'linear', 'tfidf__ngram_range': (1, 2)}
0.84707986951489


In [18]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df_resampled["Text_Adjusted"], df_resampled['Emotion_Labels'], test_size=0.1, random_state=331, stratify=df_resampled.Emotion_Labels)

svm = SVC(kernel="linear",gamma=0.001, C=1, random_state=331)

pipe = Pipeline(
    [
        ("vectorizer", TfidfVectorizer(ngram_range=(1,2))),
        ("svm", svm),
    ]
)

pipe.fit(X_train, y_train) 

In [19]:
test_pred = pipe.predict(X_test)

#Print the classfication report
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90       215
           1       0.94      0.92      0.93       215
           2       0.85      0.84      0.85       219
           3       0.81      0.83      0.82       211

    accuracy                           0.87       860
   macro avg       0.87      0.87      0.87       860
weighted avg       0.87      0.87      0.87       860

