In [1]:
import pandas as pd
import numpy as np

from sklearn import model_selection
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder

from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import matplotlib.pyplot as plt
import seaborn as sn

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer 
import PIL as pillow
import wordcloud
from operator import itemgetter

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
import scikitplot as skplt
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC



In [5]:
df =pd.read_csv('../part-03/tory_labour_snp.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,mp,person_id,debate_id,date,constituency,party,topic,text,word_count
0,0,Brandon Lewis,24879,2020-01-07a,2020-01-07,Great Yarmouth,Conservative,,let me finish the point it takes five to minut...,140
1,1,Brandon Lewis,24879,2020-01-07a,2020-01-07,Great Yarmouth,Conservative,,i will be brief i just want to respond to a co...,164
2,2,Lindsay Hoyle,10295,2020-01-07b,2020-01-07,Chorley,Labour,Speaker’s Statement,on behalf of the whole house i wish to express...,339
3,3,Mark Logan,25886,2020-01-07b,2020-01-07,Bolton North East,Conservative,Per Pupil Funding,whether he plans to increase the level of per ...,11
4,4,Sajid Javid,24854,2020-01-07b,2020-01-07,Bromsgrove,Conservative,Per Pupil Funding,first mr speaker may i associate myself with t...,83


In [4]:
df.party.value_counts(normalize = True)

Conservative               0.648980
Labour                     0.276583
Scottish National Party    0.074437
Name: party, dtype: float64

Baseline is 0.648980, so we are looking to improve on this as much as possible. (I will try to address the class imbalance later on )

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.text, df.party, stratify = df.party, random_state = 1)

Due to time restrictions for now I will use one model for all cvec and tfidf - a basic Logistic Regression model.

I will compare a range of ngrams and use a mindf limit of 20 words (makes the model generalise better as it cuts out words that appear only a couple of times) and use maxdf 95% to get rid of words that appear in more than 95% of my data. I am hoping that this will remove some of the more stuffy parliamentary language such as 'honerable friend' that I don't think will add much predictive power to my model. 

Ocne I have optimised cvec vs tfidf, I will pick the best nlp vectoriser and run gridsearch across various classification models, comparing their accuracy scores. 

In [8]:
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words= 'english')),
    ('logreg', LogisticRegression(solver='saga', multi_class = 'ovr'))
]) 

pipeline.fit(X_train, y_train)

print(pipeline.score(X_train, y_train))
print(cross_val_score(pipeline, X_train, y_train, cv=5).mean())
print(pipeline.score(X_test, y_test))

predictions = pipeline.predict(X_test)

print()
print(classification_report(y_test, predictions))

pd.DataFrame(confusion_matrix(y_test, predictions,
                              labels=y_test.unique()),
             columns=y_test.unique(),
             index=y_test.unique())



0.7445350323811121




0.7380272155065756
0.7393190606432142

                         precision    recall  f1-score   support

           Conservative       0.79      0.87      0.83     20643
                 Labour       0.61      0.57      0.59      8798
Scottish National Party       0.58      0.21      0.31      2368

               accuracy                           0.74     31809
              macro avg       0.66      0.55      0.58     31809
           weighted avg       0.73      0.74      0.72     31809



Unnamed: 0,Labour,Conservative,Scottish National Party
Labour,4976,3722,100
Conservative,2342,18034,267
Scottish National Party,876,985,507


In [11]:
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words= 'english', min_df = 20, max_df = 0.95)),
    ('logreg', LogisticRegression(solver='saga', multi_class = 'ovr', max_iter = 10000, n_jobs = 2))
]) 

pipeline.fit(X_train, y_train)

score1 = pipeline.score(X_train, y_train)
cvscore1 = cross_val_score(pipeline, X_train, y_train, cv=5).mean()
testscore1 = pipeline.score(X_test, y_test)

print(score1)
print(cvscore1)
print(testscore1)

predictions = pipeline.predict(X_test)

print()
print(classification_report(y_test, predictions))

pd.DataFrame(confusion_matrix(y_test, predictions,
                              labels=y_test.unique()),
             columns=y_test.unique(),
             index=y_test.unique())

0.8058076415232746
0.7711839177199155
0.7705052029299884

                         precision    recall  f1-score   support

           Conservative       0.80      0.91      0.85     20643
                 Labour       0.68      0.56      0.62      8798
Scottish National Party       0.67      0.30      0.41      2368

               accuracy                           0.77     31809
              macro avg       0.72      0.59      0.63     31809
           weighted avg       0.76      0.77      0.76     31809



Unnamed: 0,Labour,Conservative,Scottish National Party
Labour,4934,3709,155
Conservative,1577,18868,198
Scottish National Party,728,933,707


In [35]:
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words= 'english', min_df = 20, max_df = 0.95, ngram_range= (1,2))),
    ('logreg', LogisticRegression(solver='saga', multi_class = 'ovr', max_iter = 10000, n_jobs = 2))
]) 

pipeline.fit(X_train, y_train)

score2 = pipeline.score(X_train, y_train)
cvscore2 = cross_val_score(pipeline, X_train, y_train, cv=5).mean()
testscore2 = pipeline.score(X_test, y_test)

print(score2)
print(cvscore2)
print(testscore2)

predictions = pipeline.predict(X_test)

print()
print(classification_report(y_test, predictions))

pd.DataFrame(confusion_matrix(y_test, predictions,
                              labels=y_test.unique()),
             columns=y_test.unique(),
             index=y_test.unique())

0.8391738100727265
0.7861064490550624
0.7884246596875099

                         precision    recall  f1-score   support

           Conservative       0.82      0.92      0.87     20643
                 Labour       0.70      0.61      0.65      8798
Scottish National Party       0.68      0.33      0.44      2368

               accuracy                           0.79     31809
              macro avg       0.73      0.62      0.65     31809
           weighted avg       0.78      0.79      0.78     31809



Unnamed: 0,Labour,Conservative,Scottish National Party
Labour,5337,3277,184
Conservative,1493,18963,187
Scottish National Party,747,842,779


In [36]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words= 'english')),
    ('logreg', LogisticRegression(solver='saga', multi_class = 'ovr', max_iter = 10000, n_jobs = 2))]) 

pipeline.fit(X_train, y_train)

score2 = pipeline.score(X_train, y_train)
cvscore2 = cross_val_score(pipeline, X_train, y_train, cv=5).mean()
testscore2 = pipeline.score(X_test, y_test)

print(score2)
print(cvscore2)
print(testscore2)

predictions = pipeline.predict(X_test)

print()
print(classification_report(y_test, predictions))

pd.DataFrame(confusion_matrix(y_test, predictions,
                              labels=y_test.unique()),
             columns=y_test.unique(),
             index=y_test.unique())

0.8110892209670321
0.7742438866741939
0.7753151623754283

                         precision    recall  f1-score   support

           Conservative       0.80      0.92      0.86     20643
                 Labour       0.69      0.58      0.63      8798
Scottish National Party       0.72      0.23      0.35      2368

               accuracy                           0.78     31809
              macro avg       0.74      0.58      0.61     31809
           weighted avg       0.77      0.78      0.76     31809



Unnamed: 0,Labour,Conservative,Scottish National Party
Labour,5094,3617,87
Conservative,1496,19018,129
Scottish National Party,823,995,550


In [37]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words= 'english', min_df = 20, max_df = 0.95)),
    ('logreg', LogisticRegression(solver='saga', multi_class = 'ovr', max_iter = 10000, n_jobs = 2))]) 

pipeline.fit(X_train, y_train)

score2 = pipeline.score(X_train, y_train)
cvscore2 = cross_val_score(pipeline, X_train, y_train, cv=5).mean()
testscore2 = pipeline.score(X_test, y_test)

print(score2)
print(cvscore2)
print(testscore2)

predictions = pipeline.predict(X_test)

print()
print(classification_report(y_test, predictions))

pd.DataFrame(confusion_matrix(y_test, predictions,
                              labels=y_test.unique()),
             columns=y_test.unique(),
             index=y_test.unique())

0.8057971621989815
0.7741810166582976
0.7745920965764407

                         precision    recall  f1-score   support

           Conservative       0.81      0.92      0.86     20643
                 Labour       0.68      0.58      0.63      8798
Scottish National Party       0.71      0.24      0.35      2368

               accuracy                           0.77     31809
              macro avg       0.73      0.58      0.61     31809
           weighted avg       0.76      0.77      0.76     31809



Unnamed: 0,Labour,Conservative,Scottish National Party
Labour,5091,3614,93
Conservative,1519,18991,133
Scottish National Party,830,981,557


In [38]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words= 'english', min_df = 20, max_df = 0.95, ngram_range= (1,2))),
    ('logreg', LogisticRegression(solver='saga', multi_class = 'ovr', max_iter = 10000, n_jobs = 2))]) 

pipeline.fit(X_train, y_train)

score2 = pipeline.score(X_train, y_train)
cvscore2 = cross_val_score(pipeline, X_train, y_train, cv=5).mean()
testscore2 = pipeline.score(X_test, y_test)

print(score2)
print(cvscore2)
print(testscore2)

predictions = pipeline.predict(X_test)

print()
print(classification_report(y_test, predictions))

pd.DataFrame(confusion_matrix(y_test, predictions,
                              labels=y_test.unique()),
             columns=y_test.unique(),
             index=y_test.unique())

0.8324251252279253
0.7870915504524822
0.7888647866955893

                         precision    recall  f1-score   support

           Conservative       0.82      0.93      0.87     20643
                 Labour       0.71      0.61      0.65      8798
Scottish National Party       0.76      0.26      0.39      2368

               accuracy                           0.79     31809
              macro avg       0.76      0.60      0.64     31809
           weighted avg       0.78      0.79      0.77     31809



Unnamed: 0,Labour,Conservative,Scottish National Party
Labour,5328,3379,91
Conservative,1385,19154,104
Scottish National Party,817,940,611


In [7]:
tfidf = TfidfVectorizer(stop_words= 'english', min_df = 20, max_df = 0.95, ngram_range= (1,2))

This was the best performing model on a Logistic Regression model. 