# **Group-5 Machine Learning Programming (PROG8245)**
# **Final Project Model Development**

### **Import the Required Packages**

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import pandas as pd

[nltk_data] Downloading package punkt to /home/billy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/billy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/billy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### **Importing the Data**

In [2]:
# Importing the tweets dataset

df = pd.read_csv(r"../data/hugginoutput.csv")
df.columns = ['tweets', 'sentiment']
df

Unnamed: 0,tweets,sentiment
0,"Ngoc Pham, a 83 yr old Vietnamese man, was one...",negative
1,I’m making the following announcement and form...,positive
2,“The most aggressive field operation in Califo...,positive
3,Thank you Supervisor @AaronPeskin for endorsin...,positive
4,#tbt sharing a meal w/my friend @narendramodi ...,positive
...,...,...
444,"Hey @united, I’m about to board one of your fl...",neutral
445,OFFICIAL WELCOME: @MikePenceVP let me be the f...,positive
446,BREAKING: A Republican Member of the Pennsylva...,negative
447,"Today, I'm joined by 8 of my colleagues in int...",negative


### **Basic Preprocessing of Tweets**

In [3]:
# Initialize NLP Preprocessing functions

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocess the "tweets" column in place

for index, row in df.iterrows():
    sentence = row['tweets']
    
    # Tokenize the sentence
    tokens = word_tokenize(sentence)
    
    # Remove punctuation and convert to lowercase
    tokens = [token.lower() for token in tokens if token.isalnum()]
    
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back into a preprocessed sentence
    preprocessed_sentence = ' '.join(lemmatized_tokens)
    
    # Update the "tweets" column with preprocessed sentence
    df.at[index, 'tweets'] = preprocessed_sentence

# Display the DataFrame
df

Unnamed: 0,tweets,sentiment
0,ngoc pham 83 yr old vietnamese man one 2 asian...,negative
1,making following announcement formal apology t...,positive
2,aggressive field operation california sander a...,positive
3,thank supervisor aaronpeskin endorsing senator...,positive
4,tbt sharing meal friend narendramodi wishing l...,positive
...,...,...
444,hey united board one flight sure hope given fl...,neutral
445,official welcome mikepencevp let first officia...,positive
446,breaking republican member pennsylvania house ...,negative
447,today joined 8 colleague introducing walk with...,negative


### **Word Embeddings Creation using tfidf vectors**

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer

tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed tweets

tfidf_matrix = tfidf_vectorizer.fit_transform(df['tweets'])

# Convert the TF-IDF matrix to a DataFrame

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the original DataFrame with the TF-IDF DataFrame

tfidf_df = pd.concat([df, tfidf_df], axis=1)

# Remove the sentence_embeddings column

#tfidf_df = tfidf_df.drop(columns=['sentence_embeddings'])

# Display the final output dataframe with embeddings

tfidf_df


Unnamed: 0,tweets,sentiment,10,100k,10am,11,13,14,15th,16,...,year,yes,yesterday,yet,york,young,youngest,yr,zelenskyy,zero
0,ngoc pham 83 yr old vietnamese man one 2 asian...,negative,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.186308,0.0,0.0,0.0,0.0,0.340949,0.0,0.00000
1,making following announcement formal apology t...,positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000
2,aggressive field operation california sander a...,positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000
3,thank supervisor aaronpeskin endorsing senator...,positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000
4,tbt sharing meal friend narendramodi wishing l...,positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,hey united board one flight sure hope given fl...,neutral,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000
445,official welcome mikepencevp let first officia...,positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000
446,breaking republican member pennsylvania house ...,negative,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000
447,today joined 8 colleague introducing walk with...,negative,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.137571,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.20922


### **Word Embeddings creation using Word2Vec Skipgram Model**

In [7]:
from gensim.models import Word2Vec

# Train the Word2Vec model

model = Word2Vec(sentences=df['tweets'], vector_size=500, window=5, sg=1, min_count=1)

# Create a list to store tweet embeddings

tweet_embeddings = []

# Calculate the average embedding for each token in each tweet

for tokens in df['tweets']:
    embeddings = [model.wv[token] for token in tokens if token in model.wv]
    if embeddings:
        tweet_embedding = sum(embeddings) / len(embeddings)
    else:
        tweet_embedding = [0] * model.vector_size  # Assigning zero vector for out-of-vocabulary words
        
    tweet_embeddings.append(tweet_embedding)

# Convert the list of embeddings to a DataFrame
tweet_embeddings_df = pd.DataFrame(tweet_embeddings, columns=[f'feature_{i}' for i in range(model.vector_size)])

# Concatenate the original DataFrame with the tweet embeddings DataFrame
word2vec_df = pd.concat([df, tweet_embeddings_df], axis=1)

# Remove the sentence_embeddings column
#word2vec_df = word2vec_df.drop(columns=['sentence_embeddings'])

# Display the modified DataFrame
word2vec_df

Unnamed: 0,tweets,sentiment,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_490,feature_491,feature_492,feature_493,feature_494,feature_495,feature_496,feature_497,feature_498,feature_499
0,ngoc pham 83 yr old vietnamese man one 2 asian...,negative,0.093001,0.035433,0.054359,0.014268,-0.092885,-0.095223,-0.009147,0.139149,...,0.045125,-0.014702,0.054941,0.056680,0.014922,0.008345,-0.009585,0.026059,-0.044788,0.017465
1,making following announcement formal apology t...,positive,0.089393,0.034264,0.052174,0.013288,-0.089512,-0.091571,-0.008762,0.133645,...,0.044464,-0.014586,0.054000,0.056121,0.014661,0.008307,-0.010544,0.025209,-0.043827,0.017269
2,aggressive field operation california sander a...,positive,0.092736,0.034983,0.053857,0.012971,-0.092095,-0.094059,-0.009199,0.137439,...,0.044533,-0.013958,0.054823,0.056129,0.015049,0.008209,-0.009405,0.025760,-0.044116,0.017266
3,thank supervisor aaronpeskin endorsing senator...,positive,0.093891,0.035553,0.055014,0.014186,-0.093231,-0.094941,-0.009237,0.139285,...,0.044838,-0.014460,0.054815,0.056399,0.014900,0.009279,-0.008537,0.025735,-0.045291,0.017585
4,tbt sharing meal friend narendramodi wishing l...,positive,0.096550,0.036453,0.055479,0.014723,-0.095258,-0.097958,-0.009584,0.144268,...,0.046146,-0.015021,0.056059,0.056950,0.015043,0.008538,-0.008489,0.026371,-0.045916,0.017412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,hey united board one flight sure hope given fl...,neutral,0.093097,0.035611,0.053706,0.013099,-0.092202,-0.094362,-0.008734,0.138789,...,0.044152,-0.014049,0.055138,0.055669,0.014859,0.007185,-0.009727,0.025783,-0.043274,0.016974
445,official welcome mikepencevp let first officia...,positive,0.092126,0.035323,0.053026,0.013210,-0.091903,-0.093963,-0.008935,0.137879,...,0.044515,-0.014161,0.054582,0.055867,0.014641,0.008411,-0.009974,0.025177,-0.043911,0.017349
446,breaking republican member pennsylvania house ...,negative,0.093538,0.036233,0.054586,0.013583,-0.093482,-0.095595,-0.008999,0.140415,...,0.043972,-0.014225,0.054108,0.055609,0.014792,0.007874,-0.009545,0.025064,-0.043506,0.017114
447,today joined 8 colleague introducing walk with...,negative,0.089949,0.033854,0.051854,0.012845,-0.089671,-0.091502,-0.009008,0.133570,...,0.044578,-0.014126,0.054495,0.056134,0.014803,0.008265,-0.010200,0.025779,-0.044040,0.017239


### **Word Embeddings Creation using BERT**

In [8]:
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd

# Load the BERT model and tokenizer
model_name = 'bert-base-uncased'
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)


# Initialize lists to store sentence embeddings
sentence_embeddings_list = []

# Loop through each row in the DataFrame
for index, row in df.iterrows():
    sentence = row['tweets']
    
    # Tokenize the sentence and convert to token IDs
    tokens = tokenizer.encode(sentence, add_special_tokens=True)
    token_ids = torch.tensor(tokens).unsqueeze(0)
    
    # Create attention mask
    attention_mask = token_ids != 0
    
    # Run a forward pass through BERT
    with torch.no_grad():
        outputs = model(token_ids, attention_mask)
    encoded_layer = outputs.last_hidden_state
    
    # Calculate sentence embedding (mean of all word embeddings)
    sentence_embedding = encoded_layer.mean(dim=1)
    sentence_embeddings_list.append(sentence_embedding[0].tolist())

# Convert sentence embeddings to a NumPy array
sentence_embeddings_array = torch.tensor(sentence_embeddings_list).numpy()

# Create a new DataFrame with original tweets, sentiment, and sentence embedding features
embedding_df = pd.concat([
    df[['tweets', 'sentiment']],
    pd.DataFrame(sentence_embeddings_array, columns=[f"feature_{i}" for i in range(sentence_embeddings_array.shape[1])])
], axis=1)

# Display the embedding DataFrame
embedding_df

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 3.00MB/s]
Downloading model.safetensors: 100%|██████████| 440M/440M [02:07<00:00, 3.45MB/s] 
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.96MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 50.2kB/s]


Unnamed: 0,tweets,sentiment,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_758,feature_759,feature_760,feature_761,feature_762,feature_763,feature_764,feature_765,feature_766,feature_767
0,ngoc pham 83 yr old vietnamese man one 2 asian...,negative,0.151141,0.182959,0.402899,-0.305875,0.315593,0.111646,0.240216,0.425379,...,-0.032087,-0.201522,0.287459,-0.229566,0.076743,0.075291,-0.147191,-0.154748,-0.033774,0.058169
1,making following announcement formal apology t...,positive,0.019772,-0.119633,0.667289,-0.449309,0.104519,-0.161934,0.494147,0.455437,...,0.321194,-0.011513,0.353712,-0.126999,0.142262,-0.122046,-0.075087,-0.075467,0.016284,0.045438
2,aggressive field operation california sander a...,positive,0.131454,-0.080869,0.196175,-0.031682,0.272633,-0.090877,-0.003943,0.249903,...,-0.086666,-0.123839,-0.198244,-0.371338,0.314498,0.145142,-0.303607,-0.183541,-0.109382,-0.375275
3,thank supervisor aaronpeskin endorsing senator...,positive,-0.179532,-0.021509,0.155459,0.038721,0.035100,-0.155800,-0.217436,-0.008971,...,-0.092380,-0.059386,0.015991,0.098745,0.338740,0.007371,-0.223407,0.085241,-0.032808,0.119657
4,tbt sharing meal friend narendramodi wishing l...,positive,-0.295412,-0.332681,0.301100,-0.227789,0.419574,-0.102872,0.474347,0.239754,...,-0.031311,0.160330,0.354961,-0.036209,-0.098433,0.072526,0.065839,-0.350751,0.079152,-0.117877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,hey united board one flight sure hope given fl...,neutral,0.160700,-0.260867,0.497267,-0.093050,0.434605,-0.042255,0.389752,0.375278,...,-0.004017,-0.408482,0.171101,-0.064507,-0.007335,0.129338,-0.141256,-0.311000,-0.030480,-0.302863
445,official welcome mikepencevp let first officia...,positive,0.375342,0.046244,0.656258,-0.028738,0.071722,-0.131102,0.444305,0.248182,...,0.204532,-0.196501,0.063307,-0.169355,0.228348,0.204946,-0.358443,-0.099012,-0.088422,-0.123316
446,breaking republican member pennsylvania house ...,negative,0.049125,-0.283249,-0.012152,0.157952,0.084787,-0.151598,0.047349,-0.334750,...,0.086165,0.273918,-0.085442,0.097800,0.475979,-0.040850,-0.036262,-0.077140,-0.142727,0.006540
447,today joined 8 colleague introducing walk with...,negative,0.151372,-0.096814,0.387434,-0.147987,0.238097,-0.200005,0.261640,0.234045,...,0.222576,0.150740,0.143877,-0.265047,0.493758,-0.082712,-0.207422,-0.457475,-0.221328,-0.357937


### **Creating the Random Forest Classification Model on tfidf word embeddings**

In [9]:
# Define the features and target

X = tfidf_df.iloc[:, 2:]
y = tfidf_df['sentiment']

In [10]:
# Splitting the data into train and test

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state=16)

In [11]:
# Create an SVM model
import sklearn
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=500)
model_tfidf = rfc.fit(X_train, y_train)
model_tfidf

In [12]:
y_pred_rfc_tfidf = rfc.predict(X_test)
y_pred_rfc_tfidf

array(['negative', 'neutral', 'negative', 'positive', 'negative',
       'negative', 'negative', 'negative', 'neutral', 'neutral',
       'neutral', 'positive', 'neutral', 'positive', 'neutral',
       'negative', 'positive', 'neutral', 'positive', 'negative',
       'negative', 'neutral', 'positive', 'negative', 'negative',
       'negative', 'negative', 'neutral', 'neutral', 'negative',
       'neutral', 'negative', 'negative', 'neutral', 'negative',
       'negative', 'negative', 'negative', 'negative', 'positive',
       'positive', 'neutral', 'positive', 'negative', 'negative',
       'negative', 'neutral', 'neutral', 'negative', 'negative',
       'neutral', 'neutral', 'negative', 'positive', 'negative',
       'negative', 'neutral', 'positive', 'neutral', 'neutral', 'neutral',
       'negative', 'negative', 'neutral', 'negative', 'negative',
       'negative', 'positive', 'negative', 'negative', 'negative',
       'neutral', 'negative', 'positive', 'negative', 'neutral',
       

In [13]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, precision_recall_curve, auc, f1_score

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_rfc_tfidf)
conf_matrix = confusion_matrix(y_test, y_pred_rfc_tfidf)
precision = precision_score(y_test, y_pred_rfc_tfidf, average='weighted')
recall = recall_score(y_test, y_pred_rfc_tfidf, average='weighted')
f1 = f1_score(y_test, y_pred_rfc_tfidf, average='weighted')
class_report = classification_report(y_test, y_pred_rfc_tfidf, zero_division='warn')

# Print metrics
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("Classification Report:")
print(class_report)

Accuracy: 0.72
Confusion Matrix:
[[28  5  1]
 [10 21  1]
 [ 4  4 16]]
Precision: 0.74
Recall: 0.72
F1-Score: 0.72
Classification Report:
              precision    recall  f1-score   support

    negative       0.67      0.82      0.74        34
     neutral       0.70      0.66      0.68        32
    positive       0.89      0.67      0.76        24

    accuracy                           0.72        90
   macro avg       0.75      0.72      0.73        90
weighted avg       0.74      0.72      0.72        90



### **Creating the Support Vector Machines Classification Model on tfidf word embeddings**

In [14]:
from sklearn.svm import SVC

# Instantiate the SVM model with a linear kernel
svm_model = SVC(kernel='linear')

# Fit the SVM model to the training data
svm_model_tfidf = svm_model.fit(X_train, y_train)

In [15]:
y_pred_svm_tfidf = svm_model_tfidf.predict(X_test)
y_pred_svm_tfidf

array(['negative', 'neutral', 'negative', 'positive', 'negative',
       'negative', 'negative', 'negative', 'neutral', 'neutral',
       'neutral', 'positive', 'negative', 'positive', 'negative',
       'negative', 'positive', 'positive', 'positive', 'negative',
       'negative', 'positive', 'positive', 'negative', 'negative',
       'negative', 'negative', 'negative', 'neutral', 'negative',
       'negative', 'negative', 'negative', 'neutral', 'negative',
       'negative', 'negative', 'negative', 'negative', 'positive',
       'positive', 'neutral', 'positive', 'negative', 'negative',
       'negative', 'neutral', 'neutral', 'negative', 'negative',
       'neutral', 'neutral', 'negative', 'positive', 'negative',
       'negative', 'neutral', 'positive', 'negative', 'neutral',
       'negative', 'negative', 'negative', 'neutral', 'negative',
       'negative', 'negative', 'positive', 'negative', 'positive',
       'negative', 'neutral', 'neutral', 'positive', 'negative',
       'neu

In [16]:
y_test

18     negative
30     positive
325    negative
281    positive
442    negative
         ...   
288     neutral
354     neutral
378    negative
343     neutral
427    positive
Name: sentiment, Length: 90, dtype: object

In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, precision_recall_curve, auc, f1_score

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_svm_tfidf)
conf_matrix = confusion_matrix(y_test, y_pred_svm_tfidf)
precision = precision_score(y_test, y_pred_svm_tfidf, average='weighted')
recall = recall_score(y_test, y_pred_svm_tfidf, average='weighted')
f1 = f1_score(y_test, y_pred_svm_tfidf, average='weighted')
class_report = classification_report(y_test, y_pred_svm_tfidf, zero_division='warn')

# Print metrics
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("Classification Report:")
print(class_report)

Accuracy: 0.71
Confusion Matrix:
[[30  3  1]
 [14 16  2]
 [ 3  3 18]]
Precision: 0.73
Recall: 0.71
F1-Score: 0.70
Classification Report:
              precision    recall  f1-score   support

    negative       0.64      0.88      0.74        34
     neutral       0.73      0.50      0.59        32
    positive       0.86      0.75      0.80        24

    accuracy                           0.71        90
   macro avg       0.74      0.71      0.71        90
weighted avg       0.73      0.71      0.70        90



### **Creating the Random Forest Classification Model on Word2Vec word embeddings**

In [18]:
# Define feature matrix and Target vector

X = word2vec_df.iloc[:, 2:]
y = word2vec_df['sentiment']

In [19]:
# Split the dataset to train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=16)

In [20]:
from sklearn.ensemble import RandomForestClassifier

rfc_model = RandomForestClassifier(n_estimators=500)
rfc_model_word2vec = rfc_model.fit(X_train, y_train)

In [21]:
y_pred_rfc_word2vec = rfc_model_word2vec.predict(X_test)
y_pred_rfc_word2vec

array(['positive', 'negative', 'negative', 'positive', 'negative',
       'negative', 'positive', 'negative', 'neutral', 'neutral',
       'neutral', 'positive', 'positive', 'positive', 'neutral',
       'negative', 'positive', 'neutral', 'positive', 'negative',
       'negative', 'positive', 'positive', 'negative', 'negative',
       'negative', 'negative', 'neutral', 'neutral', 'neutral',
       'positive', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'neutral', 'negative', 'positive',
       'positive', 'negative', 'positive', 'negative', 'negative',
       'positive', 'neutral', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'positive', 'negative',
       'negative', 'neutral', 'positive', 'neutral', 'neutral', 'neutral',
       'negative', 'neutral', 'neutral', 'negative', 'positive',
       'positive', 'neutral', 'negative', 'positive', 'negative',
       'neutral', 'positive', 'positive', 'negative', 'neutral',
  

In [22]:
y_test

18     negative
30     positive
325    negative
281    positive
442    negative
         ...   
288     neutral
354     neutral
378    negative
343     neutral
427    positive
Name: sentiment, Length: 90, dtype: object

In [23]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, precision_recall_curve, auc, f1_score

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_rfc_word2vec)
conf_matrix = confusion_matrix(y_test, y_pred_rfc_word2vec)
precision = precision_score(y_test, y_pred_rfc_word2vec, average='weighted')
recall = recall_score(y_test, y_pred_rfc_word2vec, average='weighted')
f1 = f1_score(y_test, y_pred_rfc_word2vec, average='weighted')
class_report = classification_report(y_test, y_pred_rfc_word2vec, zero_division='warn')

# Print metrics
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("Classification Report:")
print(class_report)

Accuracy: 0.68
Confusion Matrix:
[[26  2  6]
 [ 9 19  4]
 [ 7  1 16]]
Precision: 0.71
Recall: 0.68
F1-Score: 0.68
Classification Report:
              precision    recall  f1-score   support

    negative       0.62      0.76      0.68        34
     neutral       0.86      0.59      0.70        32
    positive       0.62      0.67      0.64        24

    accuracy                           0.68        90
   macro avg       0.70      0.68      0.68        90
weighted avg       0.71      0.68      0.68        90



### **Creating the Support Vector Machines Classification Model on Word2Vec word embeddings**

In [24]:
X = word2vec_df.iloc[:, 2:]
y = word2vec_df['sentiment']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 16)

In [26]:
from sklearn.svm import SVC

# Instantiate the SVM model with a linear kernel
svm_model = SVC(kernel='linear')

# Fit the SVM model to the training data
svm_model_word2vec = svm_model.fit(X_train, y_train)

In [27]:
y_train.value_counts()

sentiment
negative    140
neutral     115
positive    104
Name: count, dtype: int64

In [28]:
y_pred_svm_word2vec = svm_model_word2vec.predict(X_test)
y_pred_svm_word2vec

array(['negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negati

In [40]:
X_test

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_758,feature_759,feature_760,feature_761,feature_762,feature_763,feature_764,feature_765,feature_766,feature_767
18,0.205061,-0.245505,0.620967,-0.115574,-0.188906,-0.154950,0.655467,0.472972,-0.281374,-0.205236,...,0.306327,0.059130,0.343192,0.069979,0.144525,-0.118963,0.033147,-0.304546,0.103931,-0.175341
30,-0.047776,-0.004181,0.283527,0.046182,0.132087,-0.136705,0.171566,0.615851,-0.001066,-0.326898,...,0.124181,-0.170582,-0.110820,-0.042314,-0.064014,0.146247,-0.285388,-0.280629,0.184392,-0.105620
325,-0.380308,-0.390469,0.282545,-0.085962,0.435515,-0.588693,0.218925,0.445578,0.093114,-0.063507,...,-0.007023,-0.014008,0.372811,0.098985,-0.012932,-0.434729,0.103148,-0.565593,-0.227760,0.088061
281,-0.058206,-0.051789,0.786617,0.203038,0.141367,-0.062331,0.006285,0.303823,-0.363049,-0.386051,...,0.090308,-0.058273,0.246626,0.055448,0.187479,-0.120377,-0.167341,-0.212612,0.056514,0.159891
442,-0.157319,-0.291393,0.440695,0.406951,0.279871,0.027453,-0.178320,0.251929,-0.389319,-0.123286,...,0.008698,0.225937,0.193327,0.118261,0.231760,0.112386,0.056295,-0.211523,0.028133,0.189385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288,0.369056,-0.190862,-0.303469,0.043405,-0.111924,-0.074828,0.563038,-0.112159,0.148949,-0.263398,...,0.137504,-0.445872,0.021112,-0.316803,-0.090145,-0.162454,0.014657,0.118773,-0.282687,0.010809
354,-0.175246,-0.286230,0.291217,0.099451,0.173630,-0.204588,-0.024685,-0.048901,-0.590306,-0.173868,...,0.121849,0.110234,0.219813,-0.074870,0.068324,-0.387760,0.120919,-0.165434,0.023902,-0.185176
378,0.075999,0.283393,0.212851,0.069321,-0.044164,0.042958,0.196436,0.164088,-0.083676,-0.354801,...,0.129261,-0.171976,0.231709,-0.148343,0.301682,-0.151102,-0.341876,-0.337246,-0.121205,0.046173
343,0.313418,-0.189293,0.376974,-0.020832,0.279435,-0.549569,0.136697,0.194990,-0.502766,-0.320396,...,0.100685,-0.039492,-0.040186,-0.147796,0.167134,0.082111,-0.051230,-0.144026,-0.119377,0.048702


In [30]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, precision_recall_curve, auc, f1_score

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_svm_word2vec)
conf_matrix = confusion_matrix(y_test, y_pred_svm_word2vec)
precision = precision_score(y_test, y_pred_svm_word2vec, average='weighted')
recall = recall_score(y_test, y_pred_svm_word2vec, average='weighted')
f1 = f1_score(y_test, y_pred_svm_word2vec, average='weighted')
class_report = classification_report(y_test, y_pred_svm_word2vec)

# Print metrics
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("Classification Report:")
print(class_report)

Accuracy: 0.38
Confusion Matrix:
[[34  0  0]
 [32  0  0]
 [24  0  0]]
Precision: 0.14
Recall: 0.38
F1-Score: 0.21
Classification Report:
              precision    recall  f1-score   support

    negative       0.38      1.00      0.55        34
     neutral       0.00      0.00      0.00        32
    positive       0.00      0.00      0.00        24

    accuracy                           0.38        90
   macro avg       0.13      0.33      0.18        90
weighted avg       0.14      0.38      0.21        90



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **Creating the Random Forest Classification Model on BERT word embeddings**

In [31]:
X = embedding_df.iloc[:,2:]
y = embedding_df['sentiment']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 16)

In [33]:
from sklearn.ensemble import RandomForestClassifier

rfc_model = RandomForestClassifier(n_estimators=500)
rfc_model_BERT = rfc_model.fit(X_train, y_train)

In [34]:
y_pred_rfc_BERT = rfc_model_BERT.predict(X_test)
y_pred_rfc_BERT

array(['negative', 'positive', 'negative', 'positive', 'negative',
       'positive', 'negative', 'negative', 'negative', 'neutral',
       'neutral', 'positive', 'negative', 'positive', 'neutral',
       'negative', 'positive', 'neutral', 'positive', 'negative',
       'negative', 'negative', 'positive', 'negative', 'negative',
       'negative', 'negative', 'neutral', 'neutral', 'negative',
       'negative', 'negative', 'negative', 'neutral', 'negative',
       'negative', 'negative', 'negative', 'negative', 'positive',
       'positive', 'neutral', 'positive', 'negative', 'negative',
       'negative', 'neutral', 'neutral', 'negative', 'negative',
       'negative', 'negative', 'negative', 'positive', 'negative',
       'negative', 'neutral', 'positive', 'neutral', 'neutral', 'neutral',
       'negative', 'neutral', 'neutral', 'negative', 'negative',
       'negative', 'negative', 'negative', 'positive', 'positive',
       'neutral', 'negative', 'positive', 'negative', 'neutral',
 

In [35]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, precision_recall_curve, auc, f1_score

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_rfc_BERT)
conf_matrix = confusion_matrix(y_test, y_pred_rfc_BERT)
precision = precision_score(y_test, y_pred_rfc_BERT, average='weighted')
recall = recall_score(y_test, y_pred_rfc_BERT, average='weighted')
f1 = f1_score(y_test, y_pred_rfc_BERT, average='weighted')
class_report = classification_report(y_test, y_pred_rfc_BERT, zero_division='warn')

# Print metrics
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("Classification Report:")
print(class_report)

Accuracy: 0.82
Confusion Matrix:
[[34  0  0]
 [12 19  1]
 [ 1  2 21]]
Precision: 0.85
Recall: 0.82
F1-Score: 0.82
Classification Report:
              precision    recall  f1-score   support

    negative       0.72      1.00      0.84        34
     neutral       0.90      0.59      0.72        32
    positive       0.95      0.88      0.91        24

    accuracy                           0.82        90
   macro avg       0.86      0.82      0.82        90
weighted avg       0.85      0.82      0.82        90



### **Creating the SVM Classification Model on BERT word embeddings**

In [36]:
from sklearn.svm import SVC

# Instantiate the SVM model with a linear kernel
svm_model = SVC(kernel='linear')

# Fit the SVM model to the training data
svm_model_BERT = svm_model.fit(X_train, y_train)

In [37]:
y_pred_svm_BERT = svm_model_BERT.predict(X_test)
y_pred_svm_BERT

array(['negative', 'positive', 'negative', 'positive', 'negative',
       'positive', 'negative', 'negative', 'negative', 'neutral',
       'neutral', 'positive', 'neutral', 'positive', 'neutral',
       'negative', 'positive', 'positive', 'positive', 'negative',
       'negative', 'neutral', 'positive', 'negative', 'negative',
       'neutral', 'negative', 'neutral', 'neutral', 'negative', 'neutral',
       'negative', 'negative', 'neutral', 'negative', 'negative',
       'negative', 'neutral', 'neutral', 'positive', 'positive',
       'positive', 'positive', 'negative', 'negative', 'negative',
       'neutral', 'neutral', 'negative', 'negative', 'negative',
       'negative', 'negative', 'positive', 'negative', 'negative',
       'neutral', 'positive', 'neutral', 'neutral', 'negative',
       'negative', 'negative', 'neutral', 'negative', 'negative',
       'negative', 'neutral', 'negative', 'positive', 'positive',
       'neutral', 'neutral', 'positive', 'negative', 'neutral', 'neut

In [38]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, precision_recall_curve, auc, f1_score

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred_svm_BERT)
conf_matrix = confusion_matrix(y_test, y_pred_svm_BERT)
precision = precision_score(y_test, y_pred_svm_BERT, average='weighted')
recall = recall_score(y_test, y_pred_svm_BERT, average='weighted')
f1 = f1_score(y_test, y_pred_svm_BERT, average='weighted')
class_report = classification_report(y_test, y_pred_svm_BERT, zero_division='warn')

# Print metrics
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("Classification Report:")
print(class_report)

Accuracy: 0.79
Confusion Matrix:
[[30  4  0]
 [ 9 21  2]
 [ 1  3 20]]
Precision: 0.79
Recall: 0.79
F1-Score: 0.79
Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.88      0.81        34
     neutral       0.75      0.66      0.70        32
    positive       0.91      0.83      0.87        24

    accuracy                           0.79        90
   macro avg       0.80      0.79      0.79        90
weighted avg       0.79      0.79      0.79        90

