# Document Classification with Supervised Machine Learning Models

### Loading Libraries

In [1]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import random
import seaborn as sns
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer, precision_score, recall_score, precision_recall_fscore_support, f1_score 
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import NuSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

### Importing and Splitting Data

In [2]:
# Load the hand-labelled speeches and discard rows that have no topic label.
df = pd.read_csv("Labelled Sampled Speeches.csv").dropna(subset=['doc_topics'])

# Raw speech text is the model input; the topic code is the classification target.
X, y = df['speech_text'], df['doc_topics']

# Class balance: number of documents carrying each topic code.
category_counts = y.value_counts()
print(category_counts)

# Hold out 20% of the documents for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced;
# adding `stratify=y` would be safer but would change every downstream number.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

2    390
1    210
Name: doc_topics, dtype: int64


### Baseline Accuracy
##### Random Assignment of Class

In [3]:
# Baseline: guess one of the two class labels uniformly at random for every
# training document and measure how often the guess is right.
random.seed(1)  # fixed seed so the baseline figure is reproducible

# Class sizes in the training split (variable names suggest 1 = hostility, 2 = hospitality).
N_hostility = sum(1 for label in y_train if label == 1)
N_hospitality = sum(1 for label in y_train if label == 2)

# One uniform draw from {1, 2} per training document.
random_predictions = [random.choice([1, 2]) for _ in range(len(y_train))]

# Share of random guesses that match the true label, as a percentage with one decimal.
# Scaling before rounding avoids float artefacts such as 79.90000000000001.
# NOTE(review): this baseline is scored on the training labels, not the test labels.
accuracy_random = round(100 * np.mean(np.array(random_predictions) == np.array(y_train)), 1)
print(accuracy_random)

48.1


##### Dictionary Method

In [4]:
# Parliamentary boilerplate removed in addition to NLTK's English stopwords.
# NOTE(review): "rose—", "<" and ">" can never match because non-alphabetic tokens are
# dropped before the stopword check, and "friend" is listed twice; the list is kept as-is.
custom_stopwords = ["hon", "rose—", "rose", "government", "minister",
                   "gentleman", "speaker", "mr", "friend", "home", "secretary",
                   "friend", "right", "<", ">", "can", "lady", "people"]

# Combined stopword lookup, built once. `stopwords.words('english')` returns a list, so the
# previous per-token call rebuilt that list and scanned it linearly for every single token.
dict_stopword_set = set(stopwords.words('english')) | set(custom_stopwords)

def preprocess_text_dict(text):
    """Clean one speech for the dictionary classifier.

    Tokenises with NLTK, keeps purely alphabetic tokens, lowercases them and removes
    English and custom stopwords. No stemming is applied.

    Args:
        text: Raw speech text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    tokens = (token.lower() for token in nltk.word_tokenize(text) if token.isalpha())
    return ' '.join(token for token in tokens if token not in dict_stopword_set)

# Cleaned (unstemmed) text for the dictionary method.
X_train_dictmethod = list(map(preprocess_text_dict, X_train))
X_test_dictmethod = list(map(preprocess_text_dict, X_test))

# Binary targets: 1 when the original label is 2, otherwise 0.
y_train_binary = [int(label == 2) for label in y_train]
y_test_binary = [int(label == 2) for label in y_test]

# "Hostile" dictionary
# Entries are compared as exact whole tokens against unstemmed text, so e.g. "deport"
# only matches the literal word "deport".
host_words = [
    "illegal", "offend", "border", "control", "genuine",
    "model", "system", "deport", "point", "detention",
    "migrant", "bogus", "issue", "france", "smugglers",
]

# "Hospitable" dictionary
# NOTE(review): "hostile" appears in this list — confirm that is intentional.
hosp_words = [
    "welcome", "safe", "trauma", "flee", "visa",
    "help", "war", "support", "brutal", "hostile",
    "women", "children", "vulnerable", "refugee", "integrate",
]

def count_words(text, word_set):
    """Count the whitespace-separated tokens of `text` that occur in `word_set`.

    Repeated tokens are counted every time they appear.

    Args:
        text: A string; it is split on whitespace.
        word_set: Any container supporting `in` (a set gives constant-time lookups).

    Returns:
        The number of matching tokens as an int.
    """
    return sum(1 for token in text.split() if token in word_set)

def evaluate_model(y_true, y_pred):
    """Score predictions with scikit-learn's default (binary) metric settings.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with `y_true`.

    Returns:
        A tuple `(accuracy, precision, recall, f1)`.
    """
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )

# count occurrences of hostile and hospitable words in each document
# The lookup sets are built once here rather than once per document.
host_word_set = set(host_words)
hosp_word_set = set(hosp_words)

X_train_hostile_count = [count_words(text, host_word_set) for text in X_train_dictmethod]
X_train_hospitable_count = [count_words(text, hosp_word_set) for text in X_train_dictmethod]
X_test_hostile_count = [count_words(text, host_word_set) for text in X_test_dictmethod]
X_test_hospitable_count = [count_words(text, hosp_word_set) for text in X_test_dictmethod]

# classification function
def classify_documents(X_train_hostile_count, X_train_hospitable_count):
    """Label each document by comparing its two dictionary counts.

    Despite the parameter names, the function works on any pair of aligned count
    sequences (it is called for both the training and the testing split).

    Args:
        X_train_hostile_count: Hostile-word count per document.
        X_train_hospitable_count: Hospitable-word count per document.

    Returns:
        A list with 1 where the hospitable count is strictly greater than the hostile
        count and 0 otherwise (ties therefore map to 0).
    """
    return [
        int(hosp_count > hostile_count)
        for hostile_count, hosp_count in zip(X_train_hostile_count, X_train_hospitable_count)
    ]

# Dictionary-method predictions for both splits.
y_train_pred_dict = classify_documents(X_train_hostile_count, X_train_hospitable_count)
y_test_pred_dict = classify_documents(X_test_hostile_count, X_test_hospitable_count)

# Score the dictionary method on the training and the testing split.
(train_accuracy_dict, train_precision_dict,
 train_recall_dict, train_f1_dict) = evaluate_model(y_train_binary, y_train_pred_dict)
(test_accuracy_dict, test_precision_dict,
 test_recall_dict, test_f1_dict) = evaluate_model(y_test_binary, y_test_pred_dict)

# Report every metric as a percentage with one decimal place.
print("Testing Set:")
for metric_label, metric_value in (
    ("Accuracy", test_accuracy_dict),
    ("F1 Score", test_f1_dict),
    ("Precision", test_precision_dict),
    ("Recall", test_recall_dict),
):
    print(f"{metric_label}:", round(100*metric_value, 1))

print("\nTraining Set:")
for metric_label, metric_value in (
    ("Accuracy", train_accuracy_dict),
    ("F1 Score", train_f1_dict),
    ("Precision", train_precision_dict),
    ("Recall", train_recall_dict),
):
    print(f"{metric_label}:", round(100*metric_value, 1))

Testing Set:
Accuracy: 60.0
F1 Score: 62.5
Precision: 83.3
Recall: 50.0

Training Set:
Accuracy: 60.4
F1 Score: 61.5
Precision: 82.6
Recall: 49.0


### Pre-processing Text

In [5]:
stemmer = PorterStemmer()

# Parliamentary boilerplate removed in addition to NLTK's English stopwords.
# NOTE(review): "rose—", "<" and ">" can never match because non-alphabetic tokens are
# dropped before the stopword check, and "friend" is listed twice; the list is kept as-is.
custom_stopwords = ["hon", "rose—", "rose", "government", "minister",
                   "gentleman", "speaker", "mr", "friend", "home", "secretary",
                   "friend", "right", "<", ">", "can", "lady", "people"]

# Combined stopword lookup, built once. `stopwords.words('english')` returns a list, so the
# previous per-token call rebuilt that list and scanned it linearly for every single token.
stopword_set = set(stopwords.words('english')) | set(custom_stopwords)

def preprocess_text(text):
    """Clean and stem one speech for the bag-of-words models.

    Tokenises with NLTK, keeps purely alphabetic tokens, lowercases them, removes
    English and custom stopwords, then applies Porter stemming.

    Args:
        text: Raw speech text.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    kept = (token.lower() for token in nltk.word_tokenize(text) if token.isalpha())
    # Stopwords are filtered before stemming, matching the unstemmed stopword entries.
    return ' '.join(stemmer.stem(token) for token in kept if token not in stopword_set)

# Stemmed, stopword-free text for both splits.
X_train_preprocessed = list(map(preprocess_text, X_train))
X_test_preprocessed = list(map(preprocess_text, X_test))

# Feature representations: three vocabulary sizes for each of
# unigram counts, unigram+bigram counts, and TF-IDF weights (in that order).
# NOTE(review): only the unigram count vectorizers set max_df=0.95 — confirm this
# asymmetry is intended.
feature_sizes = (600, 900, 1200)
vectorizers = (
    [CountVectorizer(min_df=5, max_df=0.95, lowercase=True, max_features=size)
     for size in feature_sizes]
    + [CountVectorizer(min_df=5, lowercase=True, ngram_range=(1, 2), max_features=size)
       for size in feature_sizes]
    + [TfidfVectorizer(min_df=5, lowercase=True, max_features=size)
       for size in feature_sizes]
)

### Multinomial Naive Bayes

In [6]:
# creating a MNB cross validation function
# NOTE(review): `param_grid` is reassigned by later cells for other models, so this cell
# must be re-run before any cell that expects the Naive Bayes grid.
param_grid = {
    'alpha': [0.05,0.08,0.1,0.3,0.5,0.7,1.0],  # Smoothing parameter (alpha)
}

def MultinomialNB_train_and_evaluate(X_train, X_test, y_train, y_test, param_grid, cv=10, scoring='f1'):
    """Tune a Multinomial Naive Bayes model by grid search and score it on `X_test`.

    Args:
        X_train: Training feature matrix.
        X_test: Feature matrix to evaluate on.
        y_train: Training labels.
        y_test: Labels aligned with `X_test`.
        param_grid: Hyperparameter grid passed to `GridSearchCV`.
        cv: Cross-validation setting passed to `GridSearchCV` (default 10, as before).
        scoring: Model-selection metric passed to `GridSearchCV` (default 'f1', as before).

    Returns:
        A tuple `(accuracy, f1, precision, recall, best_params)` where f1, precision
        and recall are macro-averaged.
    """
    # NOTE(review): the default 'f1' scorer is the binary F1 of the class labelled 1,
    # while the metrics reported below are macro-averaged; pass scoring='f1_macro'
    # to select on the same quantity that is reported.
    grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=cv, scoring=scoring)
    grid_search.fit(X_train, y_train)

    # best_estimator_ is refit on the full training data by GridSearchCV's default refit.
    y_pred = grid_search.best_estimator_.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')

    return accuracy, f1, precision, recall, grid_search.best_params_

# Evaluate Multinomial Naive Bayes on the test split for every feature representation.
results = []
for vectorizer in vectorizers:
    # Vocabulary is learned from the training split only, then applied to the test split.
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    X_test_dtm = vectorizer.transform(X_test_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    # isinstance is used instead of matching on the class name string.
    # TfidfVectorizer must be tested first because it subclasses CountVectorizer.
    if isinstance(vectorizer, TfidfVectorizer):
        feature_type = "TF-IDF"
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        # ngram_range=(1, 2): unigrams and bigrams together.
        feature_type = "Bigram"

    accuracy, f1, precision, recall, chosen_parameters = MultinomialNB_train_and_evaluate(X_train_dtm, X_test_dtm, y_train, y_test, param_grid)

    # Percentages with one decimal; scaling before rounding avoids float artefacts
    # (e.g. 79.90000000000001) in the exported spreadsheet.
    results.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy": round(100 * accuracy, 1),
        "F1 Score": round(100 * f1, 1),
        "Precision": round(100 * precision, 1),
        "Recall": round(100 * recall, 1),
        "Parameters": chosen_parameters
    })

results_df = pd.DataFrame(results)
print(results_df)

results_df.to_excel("nb_results.xlsx", index=False)

  Feature Type  Number of Features  Accuracy  F1 Score  Precision  Recall  \
0      Unigram                 600      82.5      79.9       80.6    79.4   
1      Unigram                 900      83.3      81.2       81.2    81.2   
2      Unigram                1200      80.0      77.2       77.6    76.9   
3       Bigram                 600      81.7      79.6       79.3    80.0   
4       Bigram                 900      82.5      80.2       80.4    80.0   
5       Bigram                1200      82.5      80.4       80.3    80.6   
6       TF-IDF                 600      75.0      65.9       76.1    65.0   
7       TF-IDF                 900      77.5      70.9       77.8    69.4   
8       TF-IDF                1200      78.3      72.8       77.8    71.2   

       Parameters  
0  {'alpha': 0.7}  
1  {'alpha': 0.3}  
2  {'alpha': 0.5}  
3  {'alpha': 0.7}  
4  {'alpha': 0.5}  
5  {'alpha': 0.7}  
6  {'alpha': 0.1}  
7  {'alpha': 0.1}  
8  {'alpha': 0.1}  


### Multinomial Naive Bayes on Training Set (Testing for Overfitting)

In [7]:
# Overfitting check: tune Multinomial Naive Bayes as before but score it on the
# training split itself, so the figures can be compared with the test-set table.
# NOTE(review): relies on `param_grid` still holding the Naive Bayes grid.
results = []
for vectorizer in vectorizers:
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    # isinstance is used instead of matching on the class name string.
    # TfidfVectorizer must be tested first because it subclasses CountVectorizer.
    if isinstance(vectorizer, TfidfVectorizer):
        feature_type = "TF-IDF"
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        # ngram_range=(1, 2): unigrams and bigrams together.
        feature_type = "Bigram"

    # The training matrix is passed as both the fitting and the evaluation data.
    accuracy, f1, precision, recall, chosen_parameters = MultinomialNB_train_and_evaluate(X_train_dtm, X_train_dtm, y_train, y_train, param_grid)

    # Percentages with one decimal; scaling before rounding avoids float artefacts
    # (e.g. 79.90000000000001) in the exported spreadsheet.
    results.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy": round(100 * accuracy, 1),
        "F1 Score": round(100 * f1, 1),
        "Precision": round(100 * precision, 1),
        "Recall": round(100 * recall, 1),
        "Parameters": chosen_parameters
    })

results_df = pd.DataFrame(results)
print(results_df)

results_df.to_excel("nb_training_results.xlsx", index=False)

  Feature Type  Number of Features  Accuracy  F1 Score  Precision  Recall  \
0      Unigram                 600      85.8      85.0       84.3    86.2   
1      Unigram                 900      88.5      88.0       87.2    89.9   
2      Unigram                1200      89.8      89.2       88.4    90.8   
3       Bigram                 600      84.8      83.9       83.2    85.0   
4       Bigram                 900      87.7      87.0       86.3    88.6   
5       Bigram                1200      89.6      89.0       88.2    90.7   
6       TF-IDF                 600      89.2      87.7       89.9    86.3   
7       TF-IDF                 900      91.7      90.6       92.3    89.4   
8       TF-IDF                1200      94.8      94.2       95.1    93.4   

       Parameters  
0  {'alpha': 0.7}  
1  {'alpha': 0.3}  
2  {'alpha': 0.5}  
3  {'alpha': 0.7}  
4  {'alpha': 0.5}  
5  {'alpha': 0.7}  
6  {'alpha': 0.1}  
7  {'alpha': 0.1}  
8  {'alpha': 0.1}  


### Logistic Regression

In [8]:
def logit_train_and_evaluate(X_train, X_test, y_train, y_test, custom_Cs=None):
    """Fit a cross-validated logistic regression and score it on `X_test`.

    Args:
        X_train: Training feature matrix.
        X_test: Feature matrix to evaluate on.
        y_train: Training labels.
        y_test: Labels aligned with `X_test`.
        custom_Cs: Candidate inverse-regularisation strengths; a built-in grid is
            used when None.

    Returns:
        A tuple `(accuracy, f1, precision, recall, conf_matrix, chosen_parameters)`
        where f1, precision and recall are macro-averaged and `chosen_parameters`
        is `{"C": <selected C_ array>}`.
    """
    c_grid = [0.01, 0.02, 0.03, 0.04, 0.05, 1.0] if custom_Cs is None else custom_Cs

    # NOTE(review): no penalty is specified, so scikit-learn's default applies; the
    # earlier "lasso" variable name implied L1 — confirm which penalty is intended.
    # NOTE(review): cv is left at its default here while the other models pass cv=10.
    logit_cv_model = LogisticRegressionCV(Cs=c_grid, scoring="f1")
    logit_cv_model.fit(X_train, y_train)
    y_pred = logit_cv_model.predict(X_test)

    macro = {"average": "macro"}
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, **macro)
    conf_matrix = confusion_matrix(y_test, y_pred)
    # zero_division=0 keeps precision defined when a class is never predicted.
    precision = precision_score(y_test, y_pred, zero_division=0, **macro)
    recall = recall_score(y_test, y_pred, **macro)

    return accuracy, f1, precision, recall, conf_matrix, {"C": logit_cv_model.C_}

# Evaluate logistic regression on the test split for every feature representation.
results = []
for vectorizer in vectorizers:
    # Vocabulary is learned from the training split only, then applied to the test split.
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    X_test_dtm = vectorizer.transform(X_test_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    # isinstance is used instead of matching on the class name string.
    # TfidfVectorizer must be tested first because it subclasses CountVectorizer.
    if isinstance(vectorizer, TfidfVectorizer):
        feature_type = "TF-IDF"
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        # ngram_range=(1, 2): unigrams and bigrams together.
        feature_type = "Bigram"

    accuracy, f1, precision, recall, conf_matrix, chosen_parameters = logit_train_and_evaluate(X_train_dtm, X_test_dtm, y_train, y_test)

    # Percentages with one decimal; scaling before rounding avoids float artefacts
    # (e.g. 79.90000000000001) in the exported spreadsheet.
    results.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy": round(100 * accuracy, 1),
        "F1 Score": round(100 * f1, 1),
        "Precision": round(100 * precision, 1),
        "Recall": round(100 * recall, 1),
        "Parameters": chosen_parameters
    })

results_df = pd.DataFrame(results)
print(results_df)

results_df.to_excel("lr_results.xlsx", index=False)

  Feature Type  Number of Features  Accuracy  F1 Score  Precision  Recall  \
0      Unigram                 600      80.0      73.9       82.5    71.9   
1      Unigram                 900      80.8      74.7       84.6    72.5   
2      Unigram                1200      80.8      74.7       84.6    72.5   
3       Bigram                 600      79.2      72.5       81.7    70.6   
4       Bigram                 900      78.3      71.1       81.0    69.4   
5       Bigram                1200      80.0      73.3       84.0    71.2   
6       TF-IDF                 600      76.7      65.6       87.0    65.0   
7       TF-IDF                 900      76.7      65.6       87.0    65.0   
8       TF-IDF                1200      76.7      66.5       83.7    65.6   

      Parameters  
0  {'C': [0.05]}  
1  {'C': [0.04]}  
2  {'C': [0.04]}  
3  {'C': [0.05]}  
4  {'C': [0.04]}  
5  {'C': [0.04]}  
6   {'C': [1.0]}  
7   {'C': [1.0]}  
8   {'C': [1.0]}  


### Logistic Regression on Training Set

In [9]:
# Overfitting check: fit logistic regression as before but score it on the
# training split itself, so the figures can be compared with the test-set table.
results = []
for vectorizer in vectorizers:
    # Only the training matrix is needed here; the unused test transform was dropped.
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    # isinstance is used instead of matching on the class name string.
    # TfidfVectorizer must be tested first because it subclasses CountVectorizer.
    if isinstance(vectorizer, TfidfVectorizer):
        feature_type = "TF-IDF"
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        # ngram_range=(1, 2): unigrams and bigrams together.
        feature_type = "Bigram"

    # The training matrix is passed as both the fitting and the evaluation data.
    accuracy_train, f1_train, precision_train, recall_train, conf_matrix_train, chosen_parameters_train = logit_train_and_evaluate(X_train_dtm, X_train_dtm, y_train, y_train)

    # Percentages with one decimal; scaling before rounding avoids float artefacts
    # (e.g. 79.90000000000001) in the exported spreadsheet.
    results.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy (Train)": round(100 * accuracy_train, 1),
        "F1 Score (Train)": round(100 * f1_train, 1),
        "Precision (Train)": round(100 * precision_train, 1),
        "Recall (Train)": round(100 * recall_train, 1),
        "Parameters (Train)": chosen_parameters_train,
    })

results_df = pd.DataFrame(results)
print(results_df)
results_df.to_excel("lr_training_results.xlsx", index=False)

  Feature Type  Number of Features  Accuracy (Train)  F1 Score (Train)  \
0      Unigram                 600              92.3              91.1   
1      Unigram                 900              91.7              90.3   
2      Unigram                1200              91.9              90.6   
3       Bigram                 600              92.3              91.1   
4       Bigram                 900              91.9              90.6   
5       Bigram                1200              92.3              91.1   
6       TF-IDF                 600              89.4              87.5   
7       TF-IDF                 900              89.4              87.4   
8       TF-IDF                1200              89.0              86.9   

   Precision (Train)  Recall (Train) Parameters (Train)  
0               94.7            89.1      {'C': [0.05]}  
1               94.3            88.2      {'C': [0.04]}  
2               94.4            88.5      {'C': [0.04]}  
3               94.2       

### Support Vector Machine

In [10]:
# Hyperparameter grid for the Nu-SVM.
# NOTE(review): this reassigns the `param_grid` name used earlier for Naive Bayes, so
# the Naive Bayes cells must redefine their grid before being re-run.
param_grid = {
    'nu': [0.6,0.65,0.7],  # nu parameters
    'kernel': ['linear'],  # kernel function = linear
}

def SVM_train_and_evaluate(X_train, X_test, y_train, y_test, param_grid, cv=10, scoring='f1'):
    """Tune a NuSVC model by grid search and score it on `X_test`.

    Args:
        X_train: Training feature matrix.
        X_test: Feature matrix to evaluate on.
        y_train: Training labels.
        y_test: Labels aligned with `X_test`.
        param_grid: Hyperparameter grid passed to `GridSearchCV`.
        cv: Cross-validation setting passed to `GridSearchCV` (default 10, as before).
        scoring: Model-selection metric passed to `GridSearchCV` (default 'f1', as before).

    Returns:
        A tuple `(accuracy, f1, precision, recall, best_params)` where f1, precision
        and recall are macro-averaged.
    """
    # NOTE(review): the default 'f1' scorer is the binary F1 of the class labelled 1,
    # while the metrics reported below are macro-averaged; pass scoring='f1_macro'
    # to select on the same quantity that is reported.
    grid_search = GridSearchCV(NuSVC(), param_grid, cv=cv, scoring=scoring)
    grid_search.fit(X_train, y_train)

    # best_estimator_ is refit on the full training data by GridSearchCV's default refit.
    y_pred = grid_search.best_estimator_.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')

    return accuracy, f1, precision, recall, grid_search.best_params_

# Evaluate the Nu-SVM on the test split for every feature representation.
results = []
for vectorizer in vectorizers:
    # Vocabulary is learned from the training split only, then applied to the test split.
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    X_test_dtm = vectorizer.transform(X_test_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    # isinstance is used instead of matching on the class name string.
    # TfidfVectorizer must be tested first because it subclasses CountVectorizer.
    if isinstance(vectorizer, TfidfVectorizer):
        feature_type = "TF-IDF"
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        # ngram_range=(1, 2): unigrams and bigrams together.
        feature_type = "Bigram"

    accuracy, f1, precision, recall, chosen_parameters = SVM_train_and_evaluate(X_train_dtm, X_test_dtm, y_train, y_test, param_grid)

    # Percentages with one decimal; scaling before rounding avoids float artefacts
    # (e.g. 79.90000000000001) in the exported spreadsheet.
    results.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy": round(100 * accuracy, 1),
        "F1 Score": round(100 * f1, 1),
        "Precision": round(100 * precision, 1),
        "Recall": round(100 * recall, 1),
        "Parameters": chosen_parameters
    })

results_df = pd.DataFrame(results)
print(results_df)

results_df.to_excel("svm_results.xlsx", index=False)

  Feature Type  Number of Features  Accuracy  F1 Score  Precision  Recall  \
0      Unigram                 600      79.2      71.9       83.4    70.0   
1      Unigram                 900      80.0      73.3       84.0    71.2   
2      Unigram                1200      79.2      71.9       83.4    70.0   
3       Bigram                 600      78.3      70.4       82.7    68.8   
4       Bigram                 900      79.2      71.9       83.4    70.0   
5       Bigram                1200      80.8      74.7       84.6    72.5   
6       TF-IDF                 600      76.7      70.7       75.4    69.4   
7       TF-IDF                 900      79.2      73.6       79.5    71.9   
8       TF-IDF                1200      78.3      71.7       79.7    70.0   

                        Parameters  
0  {'kernel': 'linear', 'nu': 0.6}  
1  {'kernel': 'linear', 'nu': 0.6}  
2  {'kernel': 'linear', 'nu': 0.6}  
3  {'kernel': 'linear', 'nu': 0.6}  
4  {'kernel': 'linear', 'nu': 0.6}  
5  {'ke

### Support Vector Machine on Training Set

In [11]:
# Overfitting check: tune the Nu-SVM as before but score it on the training split
# itself, so the figures can be compared with the test-set table.
results = []
for vectorizer in vectorizers:
    # Only the training matrix is needed here; the unused test transform was dropped.
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    # isinstance is used instead of matching on the class name string.
    # TfidfVectorizer must be tested first because it subclasses CountVectorizer.
    if isinstance(vectorizer, TfidfVectorizer):
        feature_type = "TF-IDF"
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        # ngram_range=(1, 2): unigrams and bigrams together.
        feature_type = "Bigram"

    # The training matrix is passed as both the fitting and the evaluation data.
    accuracy_train, f1_train, precision_train, recall_train, chosen_parameters_train = SVM_train_and_evaluate(X_train_dtm, X_train_dtm, y_train, y_train, param_grid)

    # Percentages with one decimal; scaling before rounding avoids float artefacts
    # (e.g. 79.90000000000001) in the exported spreadsheet.
    results.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy (Train)": round(100 * accuracy_train, 1),
        "F1 Score (Train)": round(100 * f1_train, 1),
        "Precision (Train)": round(100 * precision_train, 1),
        "Recall (Train)": round(100 * recall_train, 1),
        "Parameters (Train)": chosen_parameters_train
    })

results_df = pd.DataFrame(results)
print(results_df)

results_df.to_excel("svm_training_results.xlsx", index=False)

  Feature Type  Number of Features  Accuracy (Train)  F1 Score (Train)  \
0      Unigram                 600              89.4              87.4   
1      Unigram                 900              90.2              88.5   
2      Unigram                1200              90.6              89.0   
3       Bigram                 600              88.8              86.6   
4       Bigram                 900              89.6              87.6   
5       Bigram                1200              90.4              88.7   
6       TF-IDF                 600              92.3              91.3   
7       TF-IDF                 900              94.2              93.4   
8       TF-IDF                1200              96.0              95.6   

   Precision (Train)  Recall (Train)               Parameters (Train)  
0               92.6            85.1  {'kernel': 'linear', 'nu': 0.6}  
1               93.1            86.3  {'kernel': 'linear', 'nu': 0.6}  
2               93.7            86.8  {'ker

## Using Word Embeddings

### Making Word Vectors 

In [12]:
np.random.seed(123)

# NOTE(review): this redefines `preprocess_text` from the earlier pre-processing cell;
# re-running the bag-of-words cells after this one would use this lighter version.
def preprocess_text(text):
    """Tokenise `text` with NLTK and re-join the tokens with single spaces.

    Unlike the earlier version, no lowercasing, stopword removal or stemming is done.
    """
    return ' '.join(nltk.word_tokenize(text))

# Tokenised (otherwise untouched) text for the embedding models.
# NOTE(review): this overwrites the stemmed `X_train_preprocessed` / `X_test_preprocessed`
# built for the bag-of-words models.
X_train_preprocessed = list(map(preprocess_text, X_train))
X_test_preprocessed = list(map(preprocess_text, X_test))

# Word2Vec model: vectors read from a local binary word2vec-format file.
word2vec_model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)
def encode_document(document, word2vec_model):
    """Look up a vector for every in-vocabulary word of `document`.

    Args:
        document: A string; it is split on whitespace.
        word2vec_model: Any mapping-like object supporting `word in model` and
            `model[word]`.

    Returns:
        A list of vectors in document order; words missing from the model are skipped.
    """
    word_vectors = []
    for word in document.split():
        if word in word2vec_model:
            word_vectors.append(word2vec_model[word])
    return word_vectors

# aggregate word vectors for document representation
def average_word_vectors(word_vectors):
    """Collapse a document's word vectors into one vector by element-wise averaging.

    Args:
        word_vectors: List of equally sized vectors for one document.

    Returns:
        The mean vector, or a zero vector of the model's dimensionality when the
        document contributed no vectors.
    """
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    # Empty document: fall back to zeros sized from the global Word2Vec model.
    return np.zeros(word2vec_model.vector_size)

# Per-document lists of word vectors (training split first, then testing split).
X_train_encoded, X_test_encoded = (
    [encode_document(doc, word2vec_model) for doc in docs]
    for docs in (X_train_preprocessed, X_test_preprocessed)
)

# One averaged vector per document.
X_train_avg, X_test_avg = (
    list(map(average_word_vectors, encoded))
    for encoded in (X_train_encoded, X_test_encoded)
)

### Logistic Regression with Word2Vec

In [13]:
# Candidate inverse-regularisation strengths for the Word2Vec features.
custom_Cs = [10, 15, 20, 30, 40]

logit_model = LogisticRegressionCV(
    Cs=custom_Cs,
    cv=10,
    random_state=123,
    max_iter=1000, # to ensure the model converges
    scoring='f1_macro'
)

logit_model.fit(X_train_avg, y_train) # training
best_C = logit_model.C_ # best hyperparameters

# Test-set performance.
# The F1 value is stored in `f1`, not `f1_score`: unpacking into `f1_score` would
# overwrite the imported sklearn function and break earlier cells on re-run.
# NOTE(review): average='binary' reports the class labelled 1 only, whereas the
# bag-of-words tables report macro averages — the figures are not directly comparable.
y_pred = logit_model.predict(X_test_avg) # predictions
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy * 100:.1f}")
print(f"F1 Score: {f1 * 100:.1f}")
print(f"Precision: {precision * 100:.1f}")
print(f"Recall: {recall * 100:.1f}")
print(f"Best Hyperparameter (C): {best_C}")

# testing for overfitting: the same fitted model scored on its own training data
y_pred_train = logit_model.predict(X_train_avg)
accuracy = accuracy_score(y_train, y_pred_train)  # accuracy
precision, recall, f1, _ = precision_recall_fscore_support(y_train, y_pred_train, average='binary')

print(" ")
print("Testing for Overfitting on the Training Set")
print(f"Accuracy: {accuracy * 100:.1f}")
print(f"F1 Score: {f1 * 100:.1f}")
print(f"Precision: {precision * 100:.1f}")
print(f"Recall: {recall * 100:.1f}")
print(f"Best Hyperparameter (C): {best_C}")

Accuracy: 77.5
F1 Score: 58.5
Precision: 76.0
Recall: 47.5
Best Hyperparameter (C): [30]
 
Testing for Overfitting on the Training Set
Accuracy: 86.2
F1 Score: 79.0
Precision: 86.1
Recall: 72.9
Best Hyperparameter (C): [30]


### SVM with Word2Vec

In [14]:
# Hyperparameter grid for the Nu-SVM on averaged Word2Vec features.
param_grid = {
    'nu': [0.4, 0.5, 0.6, 0.7],           # nu parameter
    'kernel': ['linear', 'rbf', 'poly'],  # kernel functions
}

# 10-fold cross-validated grid search.
nu_svm_model = NuSVC()
grid_search = GridSearchCV(nu_svm_model, param_grid, scoring="f1", cv=10)
grid_search.fit(X_train_avg, y_train)
best_nu, best_kernel = (grid_search.best_params_[key] for key in ('nu', 'kernel'))

# Test-set performance of the refit best model.
y_pred_nu_svm = grid_search.predict(X_test_avg)
accuracy_nu_svm = accuracy_score(y_test, y_pred_nu_svm)
precision_nu_svm, recall_nu_svm, f1_score_nu_svm, _ = precision_recall_fscore_support(
    y_test, y_pred_nu_svm, average='binary'
)

for line in (
    f"Accuracy: {accuracy_nu_svm * 100:.1f}",
    f"F1 Score: {f1_score_nu_svm * 100:.1f}",
    f"Precision: {precision_nu_svm * 100:.1f}",
    f"Recall: {recall_nu_svm * 100:.1f}",
    f"Best Hyperparameter (nu): {best_nu}",
    f"Best Hyperparameter (kernel): {best_kernel}",
):
    print(line)

# Overfitting check: the same model scored on its own training data.
y_pred_nu_svm_train = grid_search.predict(X_train_avg)
accuracy_nu_svm_train = accuracy_score(y_train, y_pred_nu_svm_train)
precision_nu_svm_train, recall_nu_svm_train, f1_score_nu_svm_train, _ = precision_recall_fscore_support(
    y_train, y_pred_nu_svm_train, average='binary'
)

for line in (
    " ",
    "Results on the Training Set:",
    f"Accuracy: {accuracy_nu_svm_train * 100:.1f}",
    f"F1 Score: {f1_score_nu_svm_train * 100:.1f}",
    f"Precision: {precision_nu_svm_train * 100:.1f}",
    f"Recall: {recall_nu_svm_train * 100:.1f}",
    f"Best Hyperparameter (nu): {best_nu}",
    f"Best Hyperparameter (kernel): {best_kernel}",
):
    print(line)

Accuracy: 78.3
F1 Score: 61.8
Precision: 75.0
Recall: 52.5
Best Hyperparameter (nu): 0.4
Best Hyperparameter (kernel): poly
 
Results on the Training Set:
Accuracy: 95.0
F1 Score: 92.7
Precision: 95.6
Recall: 90.0
Best Hyperparameter (nu): 0.4
Best Hyperparameter (kernel): poly


### Logistic Regression with BERT

In [15]:
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, BertModel
import torch

# Candidate inverse-regularisation strengths for the BERT features.
custom_Cs = [0.4,0.5,0.7,1.0]

np.random.seed(123)

# BERT tokeniser and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")


def bert_mean_embeddings(texts):
    """Embed each text as the mean of BERT's last-layer token vectors.

    Every text is encoded on its own (a batch of one) and truncated to 512 tokens, so
    no padding enters the average; text beyond the truncation point is ignored.

    Args:
        texts: Iterable of raw strings.

    Returns:
        A NumPy array with one row per text.
    """
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        # Average pooling of token embeddings
        embeddings.append(torch.mean(outputs.last_hidden_state, dim=1).squeeze().numpy())
    return np.array(embeddings)


# One helper call per split replaces the two copy-pasted embedding loops.
X_train_bert = bert_mean_embeddings(X_train)
X_test_bert = bert_mean_embeddings(X_test)

# Logistic Regression + BERT
logit_model = LogisticRegressionCV(
    Cs=custom_Cs,
    cv=10,
    random_state=123,
    max_iter=1000,
    scoring='f1_macro'
)

logit_model.fit(X_train_bert, y_train) # Training
best_C = logit_model.C_

# Test-set performance.
# The F1 value is stored in `f1`, not `f1_score`: unpacking into `f1_score` would
# overwrite the imported sklearn function and break earlier cells on re-run.
y_pred = logit_model.predict(X_test_bert) # Predictions
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy * 100:.1f}")
print(f"F1 Score: {f1 * 100:.1f}")
print(f"Precision: {precision * 100:.1f}")
print(f"Recall: {recall * 100:.1f}")
print(f"Best Hyperparameter (C): {best_C}")

# Testing for Overfitting: the same fitted model scored on its own training data
y_pred_train = logit_model.predict(X_train_bert)
accuracy = accuracy_score(y_train, y_pred_train)
precision, recall, f1, _ = precision_recall_fscore_support(y_train, y_pred_train, average='binary')
print("\nTesting for Overfitting on the Training Set")
print(f"Accuracy: {accuracy * 100:.1f}")
print(f"F1 Score: {f1 * 100:.1f}")
print(f"Precision: {precision * 100:.1f}")
print(f"Recall: {recall * 100:.1f}")
print(f"Best Hyperparameter (C): {best_C}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Accuracy: 76.7
F1 Score: 61.1
Precision: 68.8
Recall: 55.0
Best Hyperparameter (C): [0.5]

Testing for Overfitting on the Training Set
Accuracy: 89.4
F1 Score: 84.1
Precision: 89.4
Recall: 79.4
Best Hyperparameter (C): [0.5]


In [16]:
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, BertModel
import torch

# Candidate regularisation strengths explored by the grid search below.
custom_Cs = [0.05, 0.1, 0.5, 1]
np.random.seed(123)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")


def embed_texts(texts):
    """Return one mean-pooled BERT embedding per text, stacked into a NumPy array.

    Each text is tokenised on its own (truncated to at most 512 tokens), passed
    through ``model`` without gradient tracking, and the last hidden state is
    averaged over dim=1 (the token axis) to give a single vector per text.

    Parameters
    ----------
    texts : iterable of str
        Documents to embed, processed in iteration order.

    Returns
    -------
    numpy.ndarray
        Array built from the per-text embedding vectors, in input order.
    """
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
        with torch.no_grad():  # inference only: no gradients needed
            outputs = model(**inputs)
        # Average pooling of token embeddings
        pooled = torch.mean(outputs.last_hidden_state, dim=1).squeeze().numpy()
        embeddings.append(pooled)
    return np.array(embeddings)


def print_svm_report(accuracy, f1, precision, recall, C, kernel):
    """Print the metric block (as percentages) plus the selected hyperparameters."""
    print(f"Accuracy: {accuracy * 100:.1f}")
    print(f"F1 Score: {f1 * 100:.1f}")
    print(f"Precision: {precision * 100:.1f}")
    print(f"Recall: {recall * 100:.1f}")
    print(f"Best Hyperparameter (C): {C}")
    print(f"Best Kernel: {kernel}")


# get BERT embeddings for training and testing data
X_train_bert = embed_texts(X_train)
X_test_bert = embed_texts(X_test)

# SVM + BERT
svm_model = SVC()

param_grid = {
    'C': custom_Cs,
    'kernel': ['linear', 'rbf', 'poly'],  # kernel functions
}

# 4 values of C x 3 kernels x 10 folds = 120 fits; n_jobs=-1 spreads them over
# all available cores. The selected model is the same as with a serial search.
grid_search = GridSearchCV(svm_model, param_grid, cv=10, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X_train_bert, y_train)
best_C = grid_search.best_params_['C']
best_kernel = grid_search.best_params_['kernel']

# average='binary' reports precision/recall/F1 for the positive class only.
# pos_label=1 is scikit-learn's default, spelled out here so it is clear that
# the reported scores refer to class 1.
y_pred_svm = grid_search.predict(X_test_bert)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm, recall_svm, f1_score_svm, _ = precision_recall_fscore_support(
    y_test, y_pred_svm, average='binary', pos_label=1
)
print_svm_report(accuracy_svm, f1_score_svm, precision_svm, recall_svm, best_C, best_kernel)

# Testing for Overfitting: score the refit best estimator on the data it was trained on
y_pred_svm_train = grid_search.predict(X_train_bert)
accuracy_svm_train = accuracy_score(y_train, y_pred_svm_train)
precision_svm_train, recall_svm_train, f1_score_svm_train, _ = precision_recall_fscore_support(
    y_train, y_pred_svm_train, average='binary', pos_label=1
)
print("\nTesting for Overfitting on the Training Set")
print_svm_report(
    accuracy_svm_train, f1_score_svm_train, precision_svm_train, recall_svm_train, best_C, best_kernel
)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: 

## Chosen Model: Naive Bayes, Unigram, 900 Features

In [23]:
# Label the full debate corpus with a Naive Bayes model trained on the
# hand-labelled training split.
#
# NOTE(review): this cell fits a 900-feature vocabulary with MultinomialNB's
# default smoothing, while the evaluation cells further down use
# max_features=600 and alpha=0.7 -- confirm which configuration is the
# intended "chosen model".

# Load and preprocess every speech in the unlabelled corpus.
fulldf = pd.read_csv("debate_dataset.csv")
fullX = fulldf["speech_text"]
fullX_preprocessed = list(map(preprocess_text, fullX))

# Learn the vocabulary on the training speeches only, then project the full
# corpus onto that same vocabulary.
vectorizer = CountVectorizer(
    min_df=5,
    max_df=0.95,
    lowercase=True,
    max_features=900,
)
X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
X_full = vectorizer.transform(fullX_preprocessed)

# Fit on the labelled split and predict a topic for every speech.
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_dtm, y_train)
y_pred_full = naive_bayes_classifier.predict(X_full)

## Labelling the rest of the dataset

In [24]:
# Attach the model's predictions to the corpus and persist the labelled
# dataset (row index omitted from the CSV).
predictions_path = "fulldf_with_predictions.csv"
fulldf = fulldf.assign(predicted_label=y_pred_full)
fulldf.to_csv(predictions_path, index=False)

## Evaluating Model

#### Printing Correctly Classified Speeches

In [None]:
# Refit the Naive Bayes model on the training split and list the test speeches
# it classifies correctly.
#
# NOTE(review): this cell uses max_features=600 and alpha=0.7, whereas the
# labelling cell under "Chosen Model" uses 900 features and the default alpha
# -- confirm which configuration is intended.
vectorizers = CountVectorizer(min_df=5, max_df=0.95, lowercase=True, max_features=600)
X_train_dtm = vectorizers.fit_transform(X_train_preprocessed)
X_test_dtm = vectorizers.transform(X_test_preprocessed)

naive_bayes_classifier = MultinomialNB(alpha=0.7)
naive_bayes_classifier.fit(X_train_dtm, y_train)
x_pred_chosenmodel = naive_bayes_classifier.predict(X_test_dtm)

# Positions (within the test split) where prediction and label agree.
correct_indices = [i for i, (pred, actual) in enumerate(zip(x_pred_chosenmodel, y_test)) if pred == actual]

print("Correctly classified documents:")
for idx in correct_indices[:50]:
    # train_test_split shuffles the rows, so a test position offset by the
    # training-set size does not identify a document. y_test keeps the original
    # DataFrame row labels, so report the label at this position instead.
    print(f"Document {y_test.index[idx]}:")
    print(X_test_preprocessed[idx])
    print("Predicted Label:", x_pred_chosenmodel[idx])
    print("Actual Label:", y_test.iloc[idx])
    print("\n")

#### Getting the most important terms for each class

In [None]:
# Build the 600-term document-term matrices from the preprocessed splits.
vectorizer = CountVectorizer(
    min_df=5,
    max_df=0.95,
    lowercase=True,
    max_features=600,
)
X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
X_test_dtm = vectorizer.transform(X_test_preprocessed)

# Train Multinomial Naive Bayes classifier
naive_bayes_classifier = MultinomialNB(alpha=0.7)
naive_bayes_classifier.fit(X_train_dtm, y_train)

# Vocabulary terms and the per-class log-probability of each term.
feature_names = vectorizer.get_feature_names_out()
log_probabilities = naive_bayes_classifier.feature_log_prob_

# For each class, list the terms with the highest log-probability, best first.
top_n = 30
for i, target_class in enumerate(naive_bayes_classifier.classes_):
    class_probabilities = log_probabilities[i]
    descending_order = np.argsort(class_probabilities)[::-1]
    top_indices = descending_order[:top_n]
    top_features = [feature_names[index] for index in top_indices]
    print(f"Top {top_n} features for class {target_class}:")
    print(top_features)


#### Printing Incorrectly Classified Speeches

In [None]:
# List test speeches the model got wrong, using the predictions stored in
# x_pred_chosenmodel by the "correctly classified" cell above.
incorrect_indices = [i for i, (pred, actual) in enumerate(zip(x_pred_chosenmodel, y_test)) if pred != actual]
print("Incorrectly classified documents:")
for idx in incorrect_indices[:10]:
    # train_test_split shuffles the rows, so a test position offset by the
    # training-set size does not identify a document. y_test keeps the original
    # DataFrame row labels, so report the label at this position instead.
    print(f"Document {y_test.index[idx]}:")
    print(X_test_preprocessed[idx])
    print("Predicted Label:", x_pred_chosenmodel[idx])
    print("Actual Label:", y_test.iloc[idx])
    print("\n")