In [71]:
import pandas as pd

df = pd.read_csv("Train.csv")
df

Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,2339,I charge it at night and skip taking the cord ...,cord,neutral,41,45
1,2339,I charge it at night and skip taking the cord ...,battery life,positive,74,86
2,1316,The tech guy then said the service center does...,service center,negative,27,41
3,1316,The tech guy then said the service center does...,"""sales"" team",negative,109,121
4,1316,The tech guy then said the service center does...,tech guy,neutral,4,12
...,...,...,...,...,...,...
2353,2272,We also use Paralles so we can run virtual mac...,Windows Server Enterprise 2003,neutral,104,134
2354,2272,We also use Paralles so we can run virtual mac...,Windows Server 2008 Enterprise,neutral,140,170
2355,848,"How Toshiba handles the repair seems to vary, ...",repair,conflict,24,30
2356,848,"How Toshiba handles the repair seems to vary, ...",repair,positive,130,136


Data Preprocessing

In [72]:
df['polarity'].unique()

array(['neutral', 'positive', 'negative', 'conflict'], dtype=object)

In [73]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sheth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sheth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sheth\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [74]:
class PreProcessData:
    def __init__(self) -> None:
        self.stop_words = set(stopwords.words('english'))
        self.word_lemmetizer = WordNetLemmatizer()

    def lower_text(self, text):
        return text.lower()

    def remove_stop_words(self, text):
        return [words for words in text if words.lower() not in self.stop_words]
    
    def remove_char(self, text):
        return re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    def remove_num(self, text):
        return re.sub(r'\d+', '', text)
    
    def tokenize_text(self, text):
        return word_tokenize(text)
    
    def lemmetize_text(self, text):
        return [self.word_lemmetizer.lemmatize(word) for word in text]
    
    def preprocess_data(self, text):  # Changed from preprocess_data to preprocess_data
        text = self.lower_text(text)
        text = self.remove_num(text)
        text = self.remove_char(text)
        text = self.tokenize_text(text)
        #text = self.remove_stop_words(text)
        text = self.lemmetize_text(text)

        preprocessed_text = " ".join(text)

        return preprocessed_text

In [75]:
preprocessor = PreProcessData()

df['Sentence'] = df['Sentence'].apply(preprocessor.preprocess_data)
df['Aspect Term'] = df['Aspect Term'].apply(preprocessor.preprocess_data)

print(df[['Sentence', 'Aspect Term', 'polarity']].head())

                                            Sentence     Aspect Term  polarity
0  i charge it at night and skip taking the cord ...            cord   neutral
1  i charge it at night and skip taking the cord ...    battery life  positive
2  the tech guy then said the service center doe ...  service center  negative
3  the tech guy then said the service center doe ...       sale team  negative
4  the tech guy then said the service center doe ...        tech guy   neutral


In [76]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
import requests
import zipfile
import io
import os

TF-IDF Vectorizer

In [77]:
# referred scikit-learn documentation

X = df['Sentence'] + ' ' + df['Aspect Term']
y = df['polarity'].fillna("unknown")
tfidf_vectorizer =  TfidfVectorizer()
chi_selector = SelectKBest(chi2, k=100)
X_tfidf = tfidf_vectorizer.fit_transform(X)
X_selected = chi_selector.fit_transform(X_tfidf, y)


X_tfidf_train, X_tfidf_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.25, random_state=5624)

GLoVe 

In [78]:
glove_path = 'glove.6B.300d.txt'
if not os.path.exists(glove_path):
    url = 'http://nlp.stanford.edu/data/glove.6B.zip'
    r = requests.get(url)
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall()

# referred https://gist.github.com/sebtheiler/84a0c5afac04f7e602de350ddca94859#file-loading_vectors_full-py
glove = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove[word] = vector

print(f"Loaded {len(glove)} word vectors.")

Loaded 400000 word vectors.


In [79]:
def text_to_embedding(text):
    words = text.split()
    embeddings = [glove.get(word) for word in words if word in glove]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(glove['the'].shape)
    
X_glove = X.apply(text_to_embedding)
X_glove_train, X_glove_test, y_train, y_test = train_test_split(X_glove, y, test_size=0.25, random_state=5624)

BERT Vectorizer

In [80]:
# referred the following link https://github.com/UKPLab/sentence-transformers
%pip install sentence-transformers

from sentence_transformers import SentenceTransformer
bert = SentenceTransformer('paraphrase-MiniLM-L6-v2')

X_bert = bert.encode(X.to_list(), show_progress_bar=True)
polarity_map = {'positive': 2, 'neutral': 1, 'negative': 0, 'conflict': 4}
y_bert = y.map(polarity_map)
X_bert_train, X_bert_test, y_train, y_test = train_test_split(X_bert, y, test_size=0.25, random_state=5624)

Note: you may need to restart the kernel to use updated packages.




Batches:   0%|          | 0/74 [00:00<?, ?it/s]

SVM, MLP and Random Forest on TF-IDF embeddings

In [81]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# SVM Classifier
svm_classifier = SVC(kernel='rbf', random_state=42)
svm_classifier.fit(X_tfidf_train, y_train)
svm_predictions = svm_classifier.predict(X_tfidf_test)
print("SVM Classification Report:")
print(classification_report(y_test, svm_predictions))

# Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_tfidf_train, y_train)
rf_predictions = rf_classifier.predict(X_tfidf_test)
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_predictions))

# MLP Classifier
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp_classifier.fit(X_tfidf_train, y_train)
mlp_predictions = mlp_classifier.predict(X_tfidf_test)
print("\nMLP Classification Report:")
print(classification_report(y_test, mlp_predictions))


SVM Classification Report:
              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        11
    negative       0.57      0.87      0.69       228
     neutral       0.84      0.13      0.23       123
    positive       0.70      0.68      0.69       228

    accuracy                           0.63       590
   macro avg       0.53      0.42      0.40       590
weighted avg       0.67      0.63      0.58       590


Random Forest Classification Report:
              precision    recall  f1-score   support

    conflict       0.33      0.09      0.14        11
    negative       0.59      0.72      0.65       228
     neutral       0.40      0.26      0.32       123
    positive       0.68      0.68      0.68       228

    accuracy                           0.60       590
   macro avg       0.50      0.44      0.45       590
weighted avg       0.58      0.60      0.58       590


MLP Classification Report:
              precision    recall  f1



Comparing SVM, MLP and Random Forest for TF-IDF embeddings

In [82]:
from prettytable import PrettyTable
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def get_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return accuracy, precision, recall, f1

# Get metrics for each classifier
svm_metrics = get_metrics(y_test, svm_predictions)
rf_metrics = get_metrics(y_test, rf_predictions)
mlp_metrics = get_metrics(y_test, mlp_predictions)

table = PrettyTable()
table.field_names = ["Classifier", "Accuracy", "Precision", "Recall", "F1-Score"]

table.add_row(["SVM"] + [f"{metric:.4f}" for metric in svm_metrics])
table.add_row(["Random Forest"] + [f"{metric:.4f}" for metric in rf_metrics])
table.add_row(["MLP"] + [f"{metric:.4f}" for metric in mlp_metrics])

print(table)

+---------------+----------+-----------+--------+----------+
|   Classifier  | Accuracy | Precision | Recall | F1-Score |
+---------------+----------+-----------+--------+----------+
|      SVM      |  0.6254  |   0.6671  | 0.6254 |  0.5796  |
| Random Forest |  0.5966  |   0.5792  | 0.5966 |  0.5809  |
|      MLP      |  0.6288  |   0.6342  | 0.6288 |  0.5923  |
+---------------+----------+-----------+--------+----------+


In [83]:
import numpy as np

X_glove_train = np.array(X_glove_train.tolist())
X_glove_test = np.array(X_glove_test.tolist())

SVM, MLP and Random Forest on GLoVe embeddings

In [84]:
# SVM Classifier
svm_classifier = SVC(kernel='rbf', random_state=42)
svm_classifier.fit(X_glove_train, y_train)
svm_predictions = svm_classifier.predict(X_glove_test)
print("SVM Classification Report:")
print(classification_report(y_test, svm_predictions))

# Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_glove_train, y_train)
rf_predictions = rf_classifier.predict(X_glove_test)
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_predictions))

# MLP Classifier
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp_classifier.fit(X_glove_train, y_train)
mlp_predictions = mlp_classifier.predict(X_glove_test)
print("\nMLP Classification Report:")
print(classification_report(y_test, mlp_predictions))


SVM Classification Report:
              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        11
    negative       0.64      0.86      0.73       228
     neutral       0.70      0.17      0.27       123
    positive       0.71      0.79      0.75       228

    accuracy                           0.67       590
   macro avg       0.51      0.45      0.44       590
weighted avg       0.67      0.67      0.63       590



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Random Forest Classification Report:
              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        11
    negative       0.71      0.75      0.73       228
     neutral       0.79      0.33      0.47       123
    positive       0.67      0.86      0.75       228

    accuracy                           0.69       590
   macro avg       0.54      0.49      0.49       590
weighted avg       0.70      0.69      0.67       590


MLP Classification Report:
              precision    recall  f1-score   support

    conflict       0.21      0.27      0.24        11
    negative       0.71      0.65      0.68       228
     neutral       0.54      0.50      0.52       123
    positive       0.71      0.79      0.75       228

    accuracy                           0.67       590
   macro avg       0.55      0.55      0.55       590
weighted avg       0.67      0.67      0.67       590





Comparing SVM, MLP and Random Forest for GLoVe embeddings

In [85]:
# Get metrics for each classifier
svm_metrics = get_metrics(y_test, svm_predictions)
rf_metrics = get_metrics(y_test, rf_predictions)
mlp_metrics = get_metrics(y_test, mlp_predictions)

table = PrettyTable()
table.field_names = ["Classifier", "Accuracy", "Precision", "Recall", "F1-Score"]

table.add_row(["SVM"] + [f"{metric:.4f}" for metric in svm_metrics])
table.add_row(["Random Forest"] + [f"{metric:.4f}" for metric in rf_metrics])
table.add_row(["MLP"] + [f"{metric:.4f}" for metric in mlp_metrics])

print("\nGloVe Embeddings Performance Comparison:")
print(table)


GloVe Embeddings Performance Comparison:
+---------------+----------+-----------+--------+----------+
|   Classifier  | Accuracy | Precision | Recall | F1-Score |
+---------------+----------+-----------+--------+----------+
|      SVM      |  0.6712  |   0.6658  | 0.6712 |  0.6280  |
| Random Forest |  0.6949  |   0.6968  | 0.6949 |  0.6716  |
|      MLP      |  0.6678  |   0.6678  | 0.6678 |  0.6663  |
+---------------+----------+-----------+--------+----------+


  _warn_prf(average, modifier, msg_start, len(result))


SVM, MLP and Random Forest on S-BERT embeddings

In [86]:
# SVM Classifier
svm_classifier = SVC(kernel='rbf', random_state=42)
svm_classifier.fit(X_bert_train, y_train)
svm_predictions = svm_classifier.predict(X_bert_test)
print("SVM Classification Report:")
print(classification_report(y_test, svm_predictions))

# Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_bert_train, y_train)
rf_predictions = rf_classifier.predict(X_bert_test)
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_predictions))

# MLP Classifier
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp_classifier.fit(X_bert_train, y_train)
mlp_predictions = mlp_classifier.predict(X_bert_test)
print("\nMLP Classification Report:")
print(classification_report(y_test, mlp_predictions))



SVM Classification Report:
              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        11
    negative       0.73      0.87      0.79       228
     neutral       0.75      0.33      0.46       123
    positive       0.71      0.82      0.76       228

    accuracy                           0.72       590
   macro avg       0.55      0.51      0.50       590
weighted avg       0.71      0.72      0.70       590



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Random Forest Classification Report:
              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        11
    negative       0.72      0.83      0.77       228
     neutral       0.69      0.34      0.46       123
    positive       0.72      0.83      0.77       228

    accuracy                           0.71       590
   macro avg       0.53      0.50      0.50       590
weighted avg       0.70      0.71      0.69       590


MLP Classification Report:
              precision    recall  f1-score   support

    conflict       0.00      0.00      0.00        11
    negative       0.74      0.78      0.76       228
     neutral       0.62      0.52      0.57       123
    positive       0.76      0.79      0.78       228

    accuracy                           0.72       590
   macro avg       0.53      0.52      0.52       590
weighted avg       0.71      0.72      0.71       590



Comparing SVM, MLP and Random Forest for S-BERT embeddings

In [87]:
svm_metrics = get_metrics(y_test, svm_predictions)
rf_metrics = get_metrics(y_test, rf_predictions)
mlp_metrics = get_metrics(y_test, mlp_predictions)

table = PrettyTable()
table.field_names = ["Classifier", "Accuracy", "Precision", "Recall", "F1-Score"]

table.add_row(["SVM"] + [f"{metric:.4f}" for metric in svm_metrics])
table.add_row(["Random Forest"] + [f"{metric:.4f}" for metric in rf_metrics])
table.add_row(["MLP"] + [f"{metric:.4f}" for metric in mlp_metrics])

print("\nBERT Embeddings Performance Comparison:")
print(table)


BERT Embeddings Performance Comparison:
+---------------+----------+-----------+--------+----------+
|   Classifier  | Accuracy | Precision | Recall | F1-Score |
+---------------+----------+-----------+--------+----------+
|      SVM      |  0.7237  |   0.7129  | 0.7237 |  0.6980  |
| Random Forest |  0.7136  |   0.6994  | 0.7136 |  0.6911  |
|      MLP      |  0.7169  |   0.7064  | 0.7169 |  0.7103  |
+---------------+----------+-----------+--------+----------+


  _warn_prf(average, modifier, msg_start, len(result))
