### Imports
All imports needed to run code.

In [None]:
# imports
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

## Text Preprocessing
Text is preprocessed using the spaCy library. SpaCy's pipeline tokenizes the text and lemmatizes the resulting tokens. Most named entities recognized by spaCy are replaced with their entity label, and punctuation and stop words are removed from the text.

In [None]:
# spaCy preprocessing
# Loads the "en_core_web_sm" spaCy pipeline once at cell level; preprocessText
# (defined below) calls this shared `nlp` object for every document.
nlp = spacy.load("en_core_web_sm")

# returns the text string resulting from spaCy preprocessing of text
def preprocessText(text):
  """Preprocess one document with the module-level spaCy pipeline ``nlp``.

  For every token produced by ``nlp(text)``:
    * a token that belongs to a named entity is replaced by its entity label
      (e.g. ``ORG``), except for ``CARDINAL`` and ``NORP`` entities, which are
      treated like ordinary tokens;
    * any other token is dropped when spaCy flags it as a stop word or as
      punctuation, and otherwise replaced by its lower-cased lemma.

  Args:
    text: the raw document string.

  Returns:
    The kept pieces, each preceded by a single space (so a non-empty result
    starts with a space); an empty string when nothing is kept.
  """
  # entity labels that are NOT substituted for the token text
  unreplaced_labels = ("CARDINAL", "NORP")

  doc = nlp(text)
  pieces = []

  for token in doc:
    # replaces most entities recognized by spaCy's ner
    # (ent_type_ is the string label; ent_type is an integer id and can never equal a string)
    if token.ent_type_ and token.ent_type_ not in unreplaced_labels:
      pieces.append(token.ent_type_)
    # removes stop words and punctuation as identified by spaCy
    elif not token.is_stop and not token.is_punct:
      pieces.append(token.lemma_.lower())

  # every piece keeps its leading space, matching the original " a b c" layout
  return "".join(" " + piece for piece in pieces)

In [None]:
# Performs text preprocessing on data
# Reads the tab-separated training file and replaces its 'text' column with the
# spaCy-preprocessed version of every document.
data = (
    pd.read_csv('orientation-gb-train.tsv', sep='\t')
    .assign(text=lambda frame: frame['text'].map(preprocessText))
)

data.head(10) # to check output of preprocessing

Unnamed: 0,id,speaker,sex,text,text_en,label
0,gb00000,af4af2005beaffb71b70b0c4dd2b4c1b,M,right hon friend accept problem illegal immig...,Does my right hon. Friend accept that the prob...,1
1,gb00001,77f2408f71f881219e1125e79b5a1ada,M,clearly progress net migration ORG ORG ORG co...,It is clearly progress that net migration from...,1
2,gb00002,d99fc5be5cd4dfa671fb49ecd8043522,M,CARDINAL plan meet independent chief inspecto...,5. When she next plans to meet the independent...,0
3,gb00003,234848ad13c121d6c1bb410e807d95a6,M,CARDINAL assistance ORG offer people DATE DAT...,9. What assistance her Department offers to pe...,0
4,gb00004,77f443ccf2fc1a2c0d7efa4739377ed3,F,CARDINAL NORP refugee resettle GPE ORG vulner...,10. How many Syrian refugees have been resettl...,0
5,gb00005,d2e83f30cdbe9d3fefbd7c33cdf7ae4b,M,honest perfectly understand chief constable <...,"To be honest, I perfectly understand that any ...",1
6,gb00006,1dde0ed1a6b16a0aff9bd2a47504069b,M,CARDINAL assessment implication policy findin...,18. What assessment she has made of the implic...,0
7,gb00007,03c46a784171a62e74b1cd33dd5e8e63,M,urgent question ask secretary ORG ORG ORG sta...,(Urgent Question): To ask the Secretary of Sta...,0
8,gb00008,704bb324d1e8c1cf067bce0147ee047e,M,right hon friend accept investment financial ...,Does my right hon. Friend accept that the inve...,1
9,gb00009,77592b72996ffcffdf6bd65abc54d7ac,F,south west cut GPE DATE DATE ORG ORG perform ...,The south-west was cut off from the UK last wi...,0


## Saving and Loading Preprocessed Text
Below is the code (commented out) to save and load the preprocessed text data, so that preprocessing steps would not need to be run every time.

In [None]:
# Loads the text data from 'orientation-gb-preprocessed.feather'

#from google.colab import drive
#drive.mount('/content/drive')
#data = pd.read_feather("orientation-gb-preprocessed.feather")
#data.head(10)

In [None]:
# Saves the text data to 'orientation-gb-preprocessed.feather'

#data.to_feather("orientation-gb-preprocessed.feather")


## TF-IDF
The preprocessed text data is converted into TF-IDF weight vectors for each document. Only the 100 most informative (K best) terms are used for the final vectors. We tested different K values and 100 seemed to result in the best overall performance.

In [None]:
# Separates the text and label data and performs train/test split

# Drops every row that has a missing value in any column, then uses the
# preprocessed 'text' column as model input (X) and 'label' as the target (y)
data = data.dropna()
X = data['text']
y = data['label']

# Splitting the dataset into 90% training and 10% testing
# (random_state is fixed so the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Sanity check: number of missing labels left in the training split
y_train.isnull().sum()

In [None]:
# TF-IDF

# Number of highest-scoring terms (chi-squared test against the labels) kept as features
K_BEST = 100

# Makes the TF-IDF vectors for all documents in the training data.
# The vocabulary is fitted on the training split only, and the matrix is kept
# sparse: converting the full-vocabulary matrix to a dense array is unnecessary
# because chi2 / SelectKBest accept sparse input.
tfidf = TfidfVectorizer(min_df=5)
tfidf_vects = tfidf.fit_transform(X_train)

# Extracts the K best terms to use as features for training data
featSelector = SelectKBest(score_func=chi2, k=K_BEST)
featSelector = featSelector.fit(tfidf_vects, y_train)
# The selector was fitted on an unnamed sparse matrix, so the vocabulary is
# passed in explicitly to recover the names of the selected terms
tfidf_features = featSelector.get_feature_names_out(tfidf.get_feature_names_out())

# Only the K selected columns are densified, for both the training and test documents
tfidf_X_train = pd.DataFrame(featSelector.transform(tfidf_vects).toarray(), columns=tfidf_features)
tfidf_X_test = pd.DataFrame(featSelector.transform(tfidf.transform(X_test)).toarray(), columns=tfidf_features)

tfidf_X_train.head(10)

Unnamed: 0,absolutely,animal,ask,assessment,austerity,bad,bank,black,business,cardinal,...,universal,urgent,vaccine,wage,wait,warn,welcome,woman,work,worker
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.121941,0.0,0.0,0.220706,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009979,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.072773,0.0,0.0,0.0,0.0,0.126564,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027281,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033579,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.050361,0.0,0.0,0.0,0.0,0.0,0.028193,0.25821,...,0.0,0.0,0.0,0.0,0.039499,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.122273,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.044802,0.051546,0.0,0.0,...,0.0,0.044529,0.0,0.0,0.0,0.0,0.0,0.036651,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186874,0.195599,...,0.0,0.0,0.0,0.0,0.0,0.0,0.022406,0.267886,0.031621,0.031413
9,0.184986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111255,0.0


## Tuning Hyperparameters
Logistic Regression and SVM models both have hyperparameters that need to be adjusted to achieve optimal results, which is what the functions below do. The optimal values are entered manually into the models used for testing so that these functions do not need to be run every time; the lines that call the functions can be commented out once the optimal values are known.

In [None]:
# LogReg Hyperparameter Tuning

# Compares cross-validations for all valid combinations of params given for log reg classifier to get optimal parameter values
def logreg_tune(X_train, y_train):
  """Grid-searches LogisticRegression hyperparameters with 5-fold cross-validation.

  Only penalty/solver pairs that LogisticRegression accepts are searched:
  'l2' with every listed solver, 'l1' with 'saga', and 'elasticnet' with
  'saga' plus an explicit ``l1_ratio``. Crossing every penalty with every
  solver produces candidates whose fits fail.

  Args:
    X_train: training feature matrix.
    y_train: training labels.

  Returns:
    None. The best parameter combination is printed.
  """
  # values searched for every penalty/solver pair
  shared = {
    'C' : np.logspace(-4, 4, 9),
    'max_iter' : [100,1000]
  }
  param_grid = [
    {'penalty' : ['l2'], 'solver' : ['lbfgs','newton-cg','sag','saga'], **shared},
    {'penalty' : ['l1'], 'solver' : ['saga'], **shared},
    # elasticnet needs a mixing ratio between the l1 and l2 penalties
    {'penalty' : ['elasticnet'], 'solver' : ['saga'], 'l1_ratio' : [0.25, 0.5, 0.75], **shared},
  ]

  # fixed seed: the sag/saga solvers shuffle the data
  estimator = LogisticRegression(random_state=0)
  clf = GridSearchCV(estimator, param_grid=param_grid, cv=5, n_jobs=-1)
  clf.fit(X_train, y_train)

  print("Best hyperparameters:", clf.best_params_)

# Runs the grid search on the selected TF-IDF training features and prints the best parameter combination
logreg_tune(tfidf_X_train, y_train)

Best hyperparameters: {'C': 10.0, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}


In [None]:
# SVM Hyperparameter + Kernel Tuning

# Compares cross-validations for all combination of params given for SVC to get optimal parameter and kernel values
def svm_tune(X_train, y_train):
  """Grid-searches the SVC kernel, C and gamma with 5-fold cross-validation.

  ``gamma`` is only searched for the rbf kernel. The linear kernel does not
  use it, so crossing it with 'linear' would fit each linear candidate once
  per gamma value for no benefit.

  Args:
    X_train: training feature matrix.
    y_train: training labels.

  Returns:
    None. The best parameter combination is printed.
  """
  c_values = [0.1, 1, 10, 100]
  param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': [1, 0.1, 0.01]},
  ]

  svc = SVC()
  clf = GridSearchCV(svc, param_grid, cv=5, refit=True, verbose=True, n_jobs=-1)
  clf.fit(X_train, y_train)

  print("Best hyperparameters:", clf.best_params_)

# Runs the grid search on the selected TF-IDF training features and prints the best parameter combination
svm_tune(tfidf_X_train, y_train)

## ML Models
Naive Bayes, Logistic Regression, and SVM models are trained on the training data for binary classification using the TF-IDF vectors.

In [None]:
# Naive Bayes Model
def naive_bayes(X_train, y_train):
  """Fit a GaussianNB classifier on the given features and labels.

  Returns whatever ``GaussianNB.fit`` returns (the fitted estimator).
  """
  return GaussianNB().fit(X_train, y_train)

In [None]:
# Log-Reg Model
def logreg(X_train, y_train):
    """Fit an L1-penalised LogisticRegressionCV (saga solver) and return it.

    The estimator cross-validates with 10 folds; ``Cs=10`` is passed through
    to LogisticRegressionCV, which treats an integer as the number of C
    values to try rather than as a single C value.
    """
    settings = {
        'cv': 10,
        'max_iter': 100,
        'random_state': 0,
        'Cs': 10,
        'penalty': 'l1',
        'solver': 'saga',
    }
    classifier = LogisticRegressionCV(**settings)
    classifier.fit(X_train, y_train)
    return classifier

In [None]:
# SVM Model
def svm(X_train, y_train):
  """Fit an RBF-kernel SVC (C=10, gamma=1) on the training data and return it."""
  settings = {'kernel': 'rbf', 'C': 10, 'gamma': 1}
  classifier = SVC(**settings)
  classifier.fit(X_train, y_train)
  return classifier

## Scoring
Makes a report of the precision, recall, F1, and accuracy scores for each of the models using ```classification_report``` from scikit-learn.

In [None]:
# Generates a classification report for each model
# Classification reports include precision, recall, F1, and accuracy scores.

def get_classreports(models, names, y_true=None):
    """Print a classification report per model and collect the per-label scores.

    Args:
        models: sequence of entries where ``entry[0]`` is a fitted model with a
            ``predict`` method and ``entry[1]`` is the feature matrix to predict on.
        names: display names, indexed in parallel with ``models``.
        y_true: true labels to score against. When omitted, the notebook-level
            ``y_test`` is used, which keeps existing two-argument calls working.

    Returns:
        One list per model holding, in order: precision, recall and F1 for
        label '0', then precision, recall and F1 for label '1'.
    """
    if y_true is None:
        y_true = y_test

    labels = ('0', '1')
    metrics = ('precision', 'recall', 'f1-score')

    reports = []
    # Gets and prints the classification report with all scores for each model
    for idx, entry in enumerate(models):
        model, X_eval = entry[0], entry[1]
        y_pred = model.predict(X_eval)
        print('Classification Report for:', names[idx])
        print(classification_report(y_true, y_pred))
        print()
        # Saves the scores for both labels to be added to a chart
        report = classification_report(y_true, y_pred, output_dict=True)
        reports.append([report[label][metric] for label in labels for metric in metrics])

    return reports

In [None]:
# Classification Reports (Scoring)

model_names = ['Naive Bayes', 'Logistic Regression', 'SVM']

# Gets all the trained models
# Each entry pairs a model fitted on the training features with the test features it is scored on
trainers = [naive_bayes, logreg, svm]
models = [[train(tfidf_X_train, y_train), tfidf_X_test] for train in trainers]

# Prints classification reports and makes a table of the scores for each label for each model
score_rows = get_classreports(models, model_names)
header = pd.MultiIndex.from_product([['Left', 'Right'], ['Precision', 'Recall', 'F1-score']])
results_df = pd.DataFrame(score_rows, index=model_names, columns=header)
display(results_df)

Classification Report for: Naive Bayes
              precision    recall  f1-score   support

           0       0.72      0.42      0.53      1213
           1       0.59      0.84      0.69      1211

    accuracy                           0.63      2424
   macro avg       0.65      0.63      0.61      2424
weighted avg       0.65      0.63      0.61      2424


Classification Report for: Logistic Regression
              precision    recall  f1-score   support

           0       0.73      0.68      0.71      1213
           1       0.70      0.75      0.73      1211

    accuracy                           0.72      2424
   macro avg       0.72      0.72      0.72      2424
weighted avg       0.72      0.72      0.72      2424


Classification Report for: SVM
              precision    recall  f1-score   support

           0       0.74      0.68      0.71      1213
           1       0.71      0.76      0.73      1211

    accuracy                           0.72      2424
   macro 

Unnamed: 0_level_0,Left,Left,Left,Right,Right,Right
Unnamed: 0_level_1,Precision,Recall,F1-score,Precision,Recall,F1-score
Naive Bayes,0.718927,0.419621,0.529932,0.589744,0.835673,0.691493
Logistic Regression,0.732568,0.684254,0.707587,0.703331,0.749794,0.725819
SVM,0.742832,0.68343,0.711894,0.706422,0.763006,0.733624
