# Document Classification with Supervised Machine Learning Models

### Loading Libraries

In [1]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import random
import seaborn as sns
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer, precision_score, recall_score, precision_recall_fscore_support, f1_score 
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import NuSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

### Importing and Splitting Data

In [2]:
df = pd.read_csv("SAMPLED_SPEECHES_3.csv")
df = df.dropna(subset=['doc_topics']) # Remove rows with NA values in the 'doc_topics' column

X = df['speech_text']  # Features
y = df['doc_topics']   # Classification label

# Number of documents in each topic
category_counts = df['doc_topics'].value_counts()
print(category_counts)

# Split the data - 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123) 

2    390
1    210
Name: doc_topics, dtype: int64


### Baseline Accuracy

In [3]:
# Count the number of instances in each class
random.seed(1)
N_hostility = sum(1 for label in y_train if label == 1)
N_hospitality = sum(1 for label in y_train if label == 2)

# Generate random predictions
random_predictions = [random.choice([1, 2]) for doc_topics in range(len(y_train))]

# Calculate accuracy based on random predictions
accuracy_random = round(np.mean(np.array(random_predictions) == np.array(y_train)),3)*100
print(accuracy_random)

48.1


### Pre-processing Text

In [4]:
stemmer = PorterStemmer()
custom_stopwords = ["hon", "rose—", "rose", "government", "minister",
                   "gentleman", "speaker", "mr", "friend", "home", "secretary",
                   "friend", "right", "<", ">", "can", "lady", "people"]

def preprocess_text(text):
    tokens = nltk.word_tokenize(text) 
    tokens = [token.lower() for token in tokens if token.isalpha()]  # Remove non-alphabetic characters
    tokens = [token for token in tokens if token not in stopwords.words('english')]  # Remove English stopwords
    tokens = [token for token in tokens if token not in custom_stopwords]  # Remove custom stopwords
    tokens = [stemmer.stem(token) for token in tokens]  # Stemming
    return ' '.join(tokens)

# Apply preprocessing to your text data for both training and testing sets
X_train_preprocessed = [preprocess_text(text) for text in X_train]
X_test_preprocessed = [preprocess_text(text) for text in X_test]

# Defining feature representations (different max_feature value and feature types)
vectorizers = [
    CountVectorizer(min_df=5, max_df=0.95, lowercase=True, max_features=600),
    CountVectorizer(min_df=5, max_df=0.95, lowercase=True, max_features=900),
    CountVectorizer(min_df=5, max_df=0.95, lowercase=True, max_features=1200),
    CountVectorizer(min_df=5, lowercase=True, ngram_range=(1, 2), max_features=600),
    CountVectorizer(min_df=5, lowercase=True, ngram_range=(1, 2), max_features=900),
    CountVectorizer(min_df=5, lowercase=True, ngram_range=(1, 2), max_features=1200),
    TfidfVectorizer(min_df=5, lowercase=True, max_features=600),
    TfidfVectorizer(min_df=5, lowercase=True, max_features=900),
    TfidfVectorizer(min_df=5, lowercase=True, max_features=1200)]

### Multinomial Naive Bayes

In [5]:
# Creating a Multinomial Naive Bayes Cross Validation function
param_grid = {
    'alpha': [0.05,0.08,0.1,0.3,0.5,0.7,1.0],  # Smoothing parameter (alpha)
}

def MultinomialNB_train_and_evaluate(X_train, X_test, y_train, y_test, param_grid):
    grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=10, scoring='f1')
    grid_search.fit(X_train, y_train)
    best_nb_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    
    y_pred = best_nb_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    conf_matrix = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    
    return accuracy, f1, precision, recall, best_params


results = []
        
for vectorizer in vectorizers:
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    X_test_dtm = vectorizer.transform(X_test_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    if "TfidfVectorizer" in str(type(vectorizer)):
        feature_type = "TF-IDF"
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        feature_type = "Bigram"

    accuracy, f1, precision, recall, chosen_parameters = MultinomialNB_train_and_evaluate(X_train_dtm, X_test_dtm, y_train, y_test, param_grid)
    
    # appending the results
    results.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy": round(accuracy,3)*100,
        "F1 Score": round(f1,3)*100,
        "Precision": round(precision,3)*100,
        "Recall": round(recall,3)*100,
        "Parameters": chosen_parameters
    })

results_df = pd.DataFrame(results)
print(results_df)

results_df.to_excel("nb_results.xlsx", index=False)

  Feature Type  Number of Features  Accuracy  F1 Score  Precision  Recall  \
0      Unigram                 600      82.5      79.9       80.6    79.4   
1      Unigram                 900      83.3      81.2       81.2    81.2   
2      Unigram                1200      80.0      77.2       77.6    76.9   
3       Bigram                 600      81.7      79.6       79.3    80.0   
4       Bigram                 900      82.5      80.2       80.4    80.0   
5       Bigram                1200      82.5      80.4       80.3    80.6   
6       TF-IDF                 600      75.0      65.9       76.1    65.0   
7       TF-IDF                 900      77.5      70.9       77.8    69.4   
8       TF-IDF                1200      78.3      72.8       77.8    71.2   

       Parameters  
0  {'alpha': 0.7}  
1  {'alpha': 0.3}  
2  {'alpha': 0.5}  
3  {'alpha': 0.7}  
4  {'alpha': 0.5}  
5  {'alpha': 0.7}  
6  {'alpha': 0.1}  
7  {'alpha': 0.1}  
8  {'alpha': 0.1}  


### Multinomial Naive Bayes on Training Set (Testing for Overfitting)

In [6]:
results = []

for vectorizer in vectorizers:
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    if "TfidfVectorizer" in str(type(vectorizer)):
        feature_type = "TF-IDF"
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        feature_type = "Bigram"

    # Call the Multinomial Naive Bayes training and evaluation function on the training set
    accuracy, f1, precision, recall, chosen_parameters = MultinomialNB_train_and_evaluate(X_train_dtm, X_train_dtm, y_train, y_train, param_grid)
    
    # Append the results to the results list
    results.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy": round(accuracy, 3)*100,
        "F1 Score": round(f1, 3)*100,
        "Precision": round(precision, 3)*100,
        "Recall": round(recall, 3)*100,
        "Parameters": chosen_parameters
    })

results_df = pd.DataFrame(results)
print(results_df)

results_df.to_excel("nb_training_results.xlsx", index=False)

  Feature Type  Number of Features  Accuracy  F1 Score  Precision  Recall  \
0      Unigram                 600      85.8      85.0       84.3    86.2   
1      Unigram                 900      88.5      88.0       87.2    89.9   
2      Unigram                1200      89.8      89.2       88.4    90.8   
3       Bigram                 600      84.8      83.9       83.2    85.0   
4       Bigram                 900      87.7      87.0       86.3    88.6   
5       Bigram                1200      89.6      89.0       88.2    90.7   
6       TF-IDF                 600      89.2      87.7       89.9    86.3   
7       TF-IDF                 900      91.7      90.6       92.3    89.4   
8       TF-IDF                1200      94.8      94.2       95.1    93.4   

       Parameters  
0  {'alpha': 0.7}  
1  {'alpha': 0.3}  
2  {'alpha': 0.5}  
3  {'alpha': 0.7}  
4  {'alpha': 0.5}  
5  {'alpha': 0.7}  
6  {'alpha': 0.1}  
7  {'alpha': 0.1}  
8  {'alpha': 0.1}  


### Logistic Regression

In [7]:
def logit_train_and_evaluate(X_train, X_test, y_train, y_test, custom_Cs=None):
    if custom_Cs is None:
        custom_Cs = [0.01, 0.02, 0.03,0.04,0.05,1.0]

    lasso_logit_model = LogisticRegressionCV(Cs=custom_Cs, scoring="f1")  
    lasso_logit_model.fit(X_train, y_train) 
    y_pred = lasso_logit_model.predict(X_test)  
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    conf_matrix = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro',zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro')
    chosen_parameters = {
        "C": lasso_logit_model.C_,
    }

    return accuracy, f1, precision, recall, conf_matrix, chosen_parameters


results = []

for vectorizer in vectorizers:
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    X_test_dtm = vectorizer.transform(X_test_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    if "TfidfVectorizer" in str(type(vectorizer)):
        feature_type = "TF-IDF" 
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        feature_type = "Bigram"

    accuracy, f1, precision, recall, conf_matrix, chosen_parameters = logit_train_and_evaluate(X_train_dtm, X_test_dtm, y_train, y_test)

    results.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy": round(accuracy,3)*100,
        "F1 Score": round(f1,3)*100,
        "Precision": round(precision,3)*100,
        "Recall": round(recall,3)*100,
        "Parameters": chosen_parameters
    })

results_df = pd.DataFrame(results)
print(results_df)

results_df.to_excel("lr_results.xlsx", index=False)

  Feature Type  Number of Features  Accuracy  F1 Score  Precision  Recall  \
0      Unigram                 600      80.0      73.9       82.5    71.9   
1      Unigram                 900      80.8      74.7       84.6    72.5   
2      Unigram                1200      80.8      74.7       84.6    72.5   
3       Bigram                 600      79.2      72.5       81.7    70.6   
4       Bigram                 900      78.3      71.1       81.0    69.4   
5       Bigram                1200      80.0      73.3       84.0    71.2   
6       TF-IDF                 600      76.7      65.6       87.0    65.0   
7       TF-IDF                 900      76.7      65.6       87.0    65.0   
8       TF-IDF                1200      76.7      66.5       83.7    65.6   

      Parameters  
0  {'C': [0.05]}  
1  {'C': [0.04]}  
2  {'C': [0.04]}  
3  {'C': [0.05]}  
4  {'C': [0.04]}  
5  {'C': [0.04]}  
6   {'C': [1.0]}  
7   {'C': [1.0]}  
8   {'C': [1.0]}  


### Logistic Regression on Training Set

In [8]:
results = []

for vectorizer in vectorizers:
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    X_test_dtm = vectorizer.transform(X_test_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    if "TfidfVectorizer" in str(type(vectorizer)):
        feature_type = "TF-IDF" 
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        feature_type = "Bigram"

    accuracy_train, f1_train, precision_train, recall_train, conf_matrix_train, chosen_parameters_train = logit_train_and_evaluate(X_train_dtm, X_train_dtm, y_train, y_train)

    results.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy (Train)": round(accuracy_train, 3) * 100,
        "F1 Score (Train)": round(f1_train, 3) * 100,
        "Precision (Train)": round(precision_train, 3) * 100,
        "Recall (Train)": round(recall_train, 3) * 100,
        "Parameters (Train)": chosen_parameters_train,
    })

# Print a dataframe from the results list
results_df = pd.DataFrame(results)
print(results_df)

results_df.to_excel("lr_training_results.xlsx", index=False)

  Feature Type  Number of Features  Accuracy (Train)  F1 Score (Train)  \
0      Unigram                 600              92.3              91.1   
1      Unigram                 900              91.7              90.3   
2      Unigram                1200              91.9              90.6   
3       Bigram                 600              92.3              91.1   
4       Bigram                 900              91.9              90.6   
5       Bigram                1200              92.3              91.1   
6       TF-IDF                 600              89.4              87.5   
7       TF-IDF                 900              89.4              87.4   
8       TF-IDF                1200              89.0              86.9   

   Precision (Train)  Recall (Train) Parameters (Train)  
0               94.7            89.1      {'C': [0.05]}  
1               94.3            88.2      {'C': [0.04]}  
2               94.4            88.5      {'C': [0.04]}  
3               94.2       

### Support Vector Machine

In [9]:
# Define the SVM parameter grid
param_grid = {
    'nu': [0.6,0.65,0.7],  # nu parameter
    'kernel': ['linear'],  # kernel functions
}

def SVM_train_and_evaluate(X_train, X_test, y_train, y_test, param_grid):
    grid_search = GridSearchCV(NuSVC(), param_grid, cv=10, scoring='f1')
    grid_search.fit(X_train, y_train)
    best_nusvm_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    
    y_pred = best_nusvm_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    conf_matrix = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    
    return accuracy, f1, precision, recall, best_params

results = []

for vectorizer in vectorizers:
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    X_test_dtm = vectorizer.transform(X_test_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    if "TfidfVectorizer" in str(type(vectorizer)):
        feature_type = "TF-IDF"
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        feature_type = "Bigram"

    # Call the SVM training and evaluation function
    accuracy, f1, precision, recall, chosen_parameters = SVM_train_and_evaluate(X_train_dtm, X_test_dtm, y_train, y_test, param_grid)
    
    # Append the results to the results list
    results.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy": round(accuracy,3)*100,
        "F1 Score": round(f1,3)*100,
        "Precision": round(precision,3)*100,
        "Recall": round(recall,3)*100,
        "Parameters": chosen_parameters
    })

results_df = pd.DataFrame(results)
print(results_df)

results_df.to_excel("svm_results.xlsx", index=False)

  Feature Type  Number of Features  Accuracy  F1 Score  Precision  Recall  \
0      Unigram                 600      79.2      71.9       83.4    70.0   
1      Unigram                 900      80.0      73.3       84.0    71.2   
2      Unigram                1200      79.2      71.9       83.4    70.0   
3       Bigram                 600      78.3      70.4       82.7    68.8   
4       Bigram                 900      79.2      71.9       83.4    70.0   
5       Bigram                1200      80.8      74.7       84.6    72.5   
6       TF-IDF                 600      76.7      70.7       75.4    69.4   
7       TF-IDF                 900      79.2      73.6       79.5    71.9   
8       TF-IDF                1200      78.3      71.7       79.7    70.0   

                        Parameters  
0  {'kernel': 'linear', 'nu': 0.6}  
1  {'kernel': 'linear', 'nu': 0.6}  
2  {'kernel': 'linear', 'nu': 0.6}  
3  {'kernel': 'linear', 'nu': 0.6}  
4  {'kernel': 'linear', 'nu': 0.6}  
5  {'ke

### Support Vector Machine on Training Set

In [10]:
results = []

for vectorizer in vectorizers:
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    X_test_dtm = vectorizer.transform(X_test_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    if "TfidfVectorizer" in str(type(vectorizer)):
        feature_type = "TF-IDF"
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        feature_type = "Bigram"

    # Call the SVM training and evaluation function on the training set
    accuracy_train, f1_train, precision_train, recall_train, chosen_parameters_train = SVM_train_and_evaluate(X_train_dtm, X_train_dtm, y_train, y_train, param_grid)

    # Append the results to the results list
    results.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy (Train)": round(accuracy_train, 3) * 100,
        "F1 Score (Train)": round(f1_train, 3) * 100,
        "Precision (Train)": round(precision_train, 3) * 100,
        "Recall (Train)": round(recall_train, 3) * 100,
        "Parameters (Train)": chosen_parameters_train
    })

# Print a dataframe from the results list
results_df = pd.DataFrame(results)
print(results_df)

results_df.to_excel("svm_training_results.xlsx", index=False)

  Feature Type  Number of Features  Accuracy (Train)  F1 Score (Train)  \
0      Unigram                 600              89.4              87.4   
1      Unigram                 900              90.2              88.5   
2      Unigram                1200              90.6              89.0   
3       Bigram                 600              88.8              86.6   
4       Bigram                 900              89.6              87.6   
5       Bigram                1200              90.4              88.7   
6       TF-IDF                 600              92.3              91.3   
7       TF-IDF                 900              94.2              93.4   
8       TF-IDF                1200              96.0              95.6   

   Precision (Train)  Recall (Train)               Parameters (Train)  
0               92.6            85.1  {'kernel': 'linear', 'nu': 0.6}  
1               93.1            86.3  {'kernel': 'linear', 'nu': 0.6}  
2               93.7            86.8  {'ker

## Using Word Embeddings

### Making Word Vectors 

In [11]:
np.random.seed(1)

# Load the Word2Vec model
word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

# Encode documents using Word2Vec
def encode_document(document, word2vec_model):
    # Tokenize the document into words
    words = document.split()

    # Encode each word using the pre-trained Word2Vec model
    word_vectors = [word2vec_model[word] for word in words if word in word2vec_model]

    return word_vectors

# Aggregate word vectors for document representation
def average_word_vectors(word_vectors):
    if not word_vectors:
        return np.zeros(word2vec_model.vector_size)  # Return zeros if no word vectors are available
    return np.mean(word_vectors, axis=0)

# Encode training and testing documents
X_train_encoded = [encode_document(doc, word2vec_model) for doc in X_train_preprocessed]
X_test_encoded = [encode_document(doc, word2vec_model) for doc in X_test_preprocessed]

# Aggregate word vectors for document representation
X_train_avg = [average_word_vectors(doc) for doc in X_train_encoded]
X_test_avg = [average_word_vectors(doc) for doc in X_test_encoded]

### Logistic Regression with Word Vectors

In [12]:
custom_Cs = [0.01, 0.03, 0.05, 1, 5]

# Logistic Regression with cross-validated hyperparameter search
logit_model = LogisticRegressionCV(
    Cs=custom_Cs,
    cv=10,            # Number of cross-validation folds
    random_state=123, # Set a random seed for reproducibility
    max_iter=1000,     # max_iter is set at 1000 to ensure the model converges
    scoring='f1_macro'
)

# Training
logit_model.fit(X_train_avg, y_train)

# Predictions
y_pred = logit_model.predict(X_test_avg)

# Calculate precision, recall, and F1 score
accuracy = accuracy_score(y_test, y_pred) # accuracy
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

# Access the best hyperparameters
best_C = logit_model.C_

# Print results
print(f"Accuracy: {accuracy * 100:.1f}")
print(f"F1 Score: {f1_score* 100:.1f}")
print(f"Precision: {precision* 100:.1f}")
print(f"Recall: {recall* 100:.1f}")
print(f"Best Hyperparameter (C): {best_C}")


# TESTING FOR OVERFITTING
y_pred_train = logit_model.predict(X_train_avg)

# Calculate precision, recall, and F1 score on the training set
accuracy = accuracy_score(y_train, y_pred_train)  # accuracy
precision, recall, f1_score, _ = precision_recall_fscore_support(y_train, y_pred_train, average='binary')

# Print results
print(" ")
print("Testing for Overfitting on the Training Set")
print(f"Accuracy: {accuracy * 100:.1f}")
print(f"F1 Score: {f1_score * 100:.1f}")
print(f"Precision: {precision * 100:.1f}")
print(f"Recall: {recall * 100:.1f}")
print(f"Best Hyperparameter (C): {best_C}")

Accuracy: 75.8
F1 Score: 50.8
Precision: 78.9
Recall: 37.5
Best Hyperparameter (C): [5.]
 
Testing for Overfitting on the Training Set
Accuracy: 83.3
F1 Score: 71.8
Precision: 89.5
Recall: 60.0
Best Hyperparameter (C): [5.]


### SVM with Word Vectors

In [13]:
# Define the parameter grid
param_grid = {
    'nu': [0.1,0.2,0.3],  # nu parameter
    'kernel': ['linear', 'rbf', 'poly'],  # kernel functions
}

# NuSVM with grid search
nu_svm_model = NuSVC()

# Perform grid search with cross-validation
grid_search = GridSearchCV(nu_svm_model, param_grid, cv=10,scoring="f1")
grid_search.fit(X_train_avg, y_train)

# Access the best hyperparameters
best_nu = grid_search.best_params_['nu']
best_kernel = grid_search.best_params_['kernel']

# Predictions with the best model
y_pred_nu_svm = grid_search.predict(X_test_avg)

# Calculate precision, recall, and F1 score
accuracy_nu_svm = accuracy_score(y_test, y_pred_nu_svm)
precision_nu_svm, recall_nu_svm, f1_score_nu_svm, _ = precision_recall_fscore_support(y_test, y_pred_nu_svm, average='binary')

# Print results
print(f"Accuracy: {accuracy_nu_svm* 100:.1f}")
print(f"F1 Score: {f1_score_nu_svm* 100:.1f}")
print(f"Precision: {precision_nu_svm* 100:.1f}")
print(f"Recall: {recall_nu_svm* 100:.1f}")
print(f"Best Hyperparameter (nu): {best_nu}")
print(f"Best Hyperparameter (kernel): {best_kernel}")

# TESTING FOR OVERFITTING
y_pred_nu_svm_train = grid_search.predict(X_train_avg)

# Calculate precision, recall, and F1 score on the training set
accuracy_nu_svm_train = accuracy_score(y_train, y_pred_nu_svm_train)
precision_nu_svm_train, recall_nu_svm_train, f1_score_nu_svm_train, _ = precision_recall_fscore_support(
    y_train, y_pred_nu_svm_train, average='binary'
)

# Print results for the training set
print(" ")
print("Results on the Training Set:")
print(f"Accuracy: {accuracy_nu_svm_train * 100:.1f}")
print(f"F1 Score: {f1_score_nu_svm_train * 100:.1f}")
print(f"Precision: {precision_nu_svm_train * 100:.1f}")
print(f"Recall: {recall_nu_svm_train * 100:.1f}")
print(f"Best Hyperparameter (nu): {best_nu}")
print(f"Best Hyperparameter (kernel): {best_kernel}")

Accuracy: 77.5
F1 Score: 60.9
Precision: 72.4
Recall: 52.5
Best Hyperparameter (nu): 0.1
Best Hyperparameter (kernel): rbf
 
Results on the Training Set:
Accuracy: 100.0
F1 Score: 100.0
Precision: 100.0
Recall: 100.0
Best Hyperparameter (nu): 0.1
Best Hyperparameter (kernel): rbf


## Chosen Model: Naive Bayes, Unigram, 900 Features

In [14]:
fulldf = pd.read_csv("debate_dataset.csv")
fullX = fulldf['speech_text']  # Features
fullX_preprocessed = [preprocess_text(text) for text in fullX]

# Create and fit the vectorizer on your training data first
vectorizer = CountVectorizer(min_df=5, max_df=0.95, lowercase=True, max_features=900)
X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)

# Transform the full dataset
X_full = vectorizer.transform(fullX_preprocessed)

naive_bayes_classifier = MultinomialNB(alpha=0.7)
naive_bayes_classifier.fit(X_train_dtm, y_train)

y_pred_full = naive_bayes_classifier.predict(X_full)

## Labelling the rest of the dataset

In [15]:
fulldf['predicted_label'] = y_pred_full
fulldf.to_csv("fulldf_with_predictions.csv", index=False)

## Evaluating Model

#### Printing Correctly Classified Speeches

In [16]:
vectorizers = CountVectorizer(min_df=5, max_df=0.95, lowercase=True, max_features=600)
X_train_dtm = vectorizers.fit_transform(X_train_preprocessed)
X_test_dtm = vectorizers.transform(X_test_preprocessed)

naive_bayes_classifier = MultinomialNB(alpha=0.7)
naive_bayes_classifier.fit(X_train_dtm, y_train)
x_pred_chosenmodel = naive_bayes_classifier.predict(X_test_dtm)

# Printing correctly classified documents
correct_indices = [i for i, (pred, actual) in enumerate(zip(x_pred_chosenmodel, y_test)) if pred == actual]

print("Correctly classified documents:")
for idx in correct_indices[:50]: 
    print(f"Document {idx + len(X_train)}:")
    print(X_test_preprocessed[idx])  
    print("Predicted Label:", x_pred_chosenmodel[idx]) 
    print("Actual Label:", y_test.iloc[idx])  
    print("\n")

Correctly classified documents:
Document 480:
last time shall interven want interrupt flow last point accept offer put extra ground greec includ specialist abil support greek demonstr fact uk play role support greec thing call
Predicted Label: 2
Actual Label: 2


Document 481:
hope acknowledg discuss possibl futur visa liberalis involv schengen countri involv eu member state part schengen
Predicted Label: 1
Actual Label: 1


Document 482:
congratul member north thanet sir roger gale secur urgent question point lack consult local author health servic frankli appal want ask number question report parliamentari group immigr detent call end accommod asylum seeker describ fundament unsuit survivor war tortur seriou violenc offic ignor warn parliamentari colleagu offic previous ignor warn use napier barrack red cross public health england inevit result covid outbreak among held pandem enter anoth danger phase commit listen expert time follow health guidanc confirm part manston estat current 

In [17]:


vectorizer = CountVectorizer(min_df=5, max_df=0.95, lowercase=True, max_features=600)
X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
X_test_dtm = vectorizer.transform(X_test_preprocessed)

# Train Multinomial Naive Bayes classifier
naive_bayes_classifier = MultinomialNB(alpha=0.7)
naive_bayes_classifier.fit(X_train_dtm, y_train)

# Get feature names
feature_names = vectorizer.get_feature_names_out()

# Get the log probabilities of features given each class
log_probabilities = naive_bayes_classifier.feature_log_prob_

# Find the most important terms for each class
top_n = 30
for i, target_class in enumerate(naive_bayes_classifier.classes_):
    print(f"Top {top_n} features for class {target_class}:")
    class_probabilities = log_probabilities[i]  # Log probabilities for current class
    top_indices = class_probabilities.argsort()[-top_n:][::-1]  # Indices of top features
    top_features = [feature_names[index] for index in top_indices]  # Top features
    print(top_features)


Top 30 features for class 1:
['countri', 'asylum', 'would', 'immigr', 'member', 'illeg', 'say', 'make', 'come', 'system', 'take', 'want', 'work', 'issu', 'point', 'bill', 'border', 'need', 'go', 'year', 'said', 'mani', 'import', 'way', 'know', 'european', 'migrat', 'time', 'made', 'number']
Top 30 features for class 2:
['countri', 'refuge', 'member', 'work', 'children', 'asylum', 'support', 'make', 'need', 'would', 'uk', 'mani', 'come', 'one', 'year', 'take', 'famili', 'hous', 'offic', 'local', 'say', 'help', 'us', 'bill', 'also', 'respons', 'know', 'go', 'system', 'point']


#### Printing Incorrectly Classified Speeches

In [18]:
# Get indices of incorrectly classified documents
incorrect_indices = [i for i, (pred, actual) in enumerate(zip(x_pred_chosenmodel, y_test)) if pred != actual]

# Print 10 incorrectly classified documents
print("Incorrectly classified documents:")
for idx in incorrect_indices[:10]:  
    print(f"Document {idx + len(X_train)}:")
    print(X_test_preprocessed[idx])  
    print("Predicted Label:", x_pred_chosenmodel[idx]) 
    print("Actual Label:", y_test.iloc[idx]) 
    print("\n")

Incorrectly classified documents:
Document 485:
welcom statement agre whole hous prioriti prevent cross first place welcom said increas surveil air sea land franc nevertheless morn vessel left franc land dung constitu know often particularli treacher part coast need done prevent boat slip net case vessel detect local fish craft alert author first spot author
Predicted Label: 2
Actual Label: 1


Document 493:
much made safeguard illeg migrant think member hous would agre talk safeguard citizen thousand come know background forc put hotel nowher els go guarante give citizen live near hotel safe particularli hear go hotel
Predicted Label: 2
Actual Label: 1


Document 494:
happi give assur rightli say need chang time immigr system built futur flexibl meet need economi societi give one exampl look worker scheme look mani need economi includ cornwal need hospit industri season natur much demand happili give assur
Predicted Label: 2
Actual Label: 1


Document 497:
point clarif reason lot ques

# Sensitivity Test - Additional Classification Models

### Random Forest Classifier

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
import pandas as pd

# Define the Random Forest parameter grid
param_grid_rf = {
    'n_estimators': [30,50],
    'max_depth': [5, 10, 20],
}

def RandomForest_train_and_evaluate(X_train, X_test, y_train, y_test, param_grid):
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=10, scoring='f1')
    grid_search.fit(X_train, y_train)
    best_rf_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    
    y_pred = best_rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    conf_matrix = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    
    return accuracy, f1, precision, recall, best_params

results_rf = []

for vectorizer in vectorizers:
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    X_test_dtm = vectorizer.transform(X_test_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    if "TfidfVectorizer" in str(type(vectorizer)):
        feature_type = "TF-IDF"
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        feature_type = "Bigram"

    # Call the Random Forest training and evaluation function
    accuracy_rf, f1_rf, precision_rf, recall_rf, chosen_params_rf = RandomForest_train_and_evaluate(X_train_dtm, X_test_dtm, y_train, y_test, param_grid_rf)
    
    # Append the results to the results_rf list
    results_rf.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy": round(accuracy_rf, 3) * 100,
        "F1 Score": round(f1_rf, 3) * 100,
        "Precision": round(precision_rf, 3) * 100,
        "Recall": round(recall_rf, 3) * 100,
        "Parameters": chosen_params_rf
    })

results_rf_df = pd.DataFrame(results_rf)
print(results_rf_df)

results_rf_df.to_excel("rf_results.xlsx", index=False)

TypeError: 'CountVectorizer' object is not iterable

### Random Forest Classifier on Training Set

In [None]:
results = []

for vectorizer in vectorizers:
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    X_test_dtm = vectorizer.transform(X_test_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    if "TfidfVectorizer" in str(type(vectorizer)):
        feature_type = "TF-IDF"
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        feature_type = "Bigram"

    # Call the SVM training and evaluation function on the training set
    accuracy_train, f1_train, precision_train, recall_train, chosen_params_rf = RandomForest_train_and_evaluate(X_train_dtm, X_train_dtm, y_train, y_train, param_grid_rf)

    # Append the results to the results list
    results.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy (Train)": round(accuracy_train, 3) * 100,
        "F1 Score (Train)": round(f1_train, 3) * 100,
        "Precision (Train)": round(precision_train, 3) * 100,
        "Recall (Train)": round(recall_train, 3) * 100,
        "Parameters (Train)": chosen_params_rf
    })

# Print a dataframe from the results list
results_df = pd.DataFrame(results)
print(results_df)

results_df.to_excel("rf_training_results.xlsx", index=False)

### XGBoost

In [None]:
import xgboost as xgb
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

df = pd.read_csv("SAMPLED_SPEECHES.csv")
df = df.dropna(subset=['doc_topics'])  # Remove rows with NA values in the 'doc_topics' column

# Map labels: 2 -> 1, 1 -> 0
df['doc_topics'] = df['doc_topics'].map({2: 1, 1: 0})

# Split the data - 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(df['speech_text'], df['doc_topics'], test_size=0.2, random_state=123)


model = xgb.XGBClassifier(num_class=2)

# Creating an XGBoost Cross Validation function
param_grid_xgb = {
    'learning_rate': [0.01],
}


def XGBoost_train_and_evaluate(X_train, X_test, y_train, y_test, param_grid):
    model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)
    grid_search = GridSearchCV(model, param_grid, cv=10, scoring='f1')
    grid_search.fit(X_train, y_train)
    best_xgb_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    
    y_pred = best_xgb_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    conf_matrix = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    
    return accuracy, f1, precision, recall, best_params

# Results for XGBoost
results_xgb = []

for vectorizer in vectorizers:
    X_train_dtm = vectorizer.fit_transform(X_train_preprocessed)
    X_test_dtm = vectorizer.transform(X_test_preprocessed)
    num_features = len(vectorizer.get_feature_names_out())

    if "TfidfVectorizer" in str(type(vectorizer)):
        feature_type = "TF-IDF"
    elif vectorizer.ngram_range == (1, 1):
        feature_type = "Unigram"
    else:
        feature_type = "Bigram"

    accuracy, f1, precision, recall, chosen_parameters = XGBoost_train_and_evaluate(X_train_dtm, X_test_dtm, y_train, y_test, param_grid_xgb)
    
    # appending the results
    results_xgb.append({
        "Feature Type": feature_type,
        "Number of Features": num_features,
        "Accuracy": round(accuracy, 3) * 100,
        "F1 Score": round(f1, 3) * 100,
        "Precision": round(precision, 3) * 100,
        "Recall": round(recall, 3) * 100,
        "Parameters": chosen_parameters
    })

results_xgb_df = pd.DataFrame(results_xgb)
print(results_xgb_df)

results_xgb_df.to_excel("xgb_results.xlsx", index=False)