The data was cleaned and saved to pickle files by p1_clean.ipynb; this notebook loads those files.


In [1]:
import logging, importlib, sys, tqdm
import spacy
import pandas as pd
import os, re
import pickle
import spacy
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from _pckle import save_pickle_object, load_pickle_object
from _logging import set_logging
from _graph import histplot_count, histplot_range_count
from _utility import gl

# spaCy English pipeline; used further down for tokenising (stemming cell) and lemmatising.
sp = spacy.load('en_core_web_md')
# NLTK Snowball stemmer for English, applied token by token in the stemming cell.
stemmer = SnowballStemmer(language='english')
# Project helper from _logging; presumably configures the log format seen in the outputs - TODO confirm.
set_logging(logging)
# Load the cleaned text and the class labels from their pickle files.
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
df_text = load_pickle_object(gl.pkl_df_text)
df_edInput = load_pickle_object(gl.pkl_df_edInput)


2023-01-06 12:19:28,430 | INFO : Loading pickle file from: pickle\pkl_df_text.pkl
2023-01-06 12:19:28,438 | INFO : Loading pickle file from: pickle\pkl_df_edInput.pkl


Find out how balanced the dataset is by computing the percentage of tweets that belong to class 2.

In [2]:
def get_balance_metric(df_edInput, target_class=2):
    """Return the percentage of rows whose label equals ``target_class``.

    Parameters
    ----------
    df_edInput : pandas.DataFrame
        Frame holding the class labels in its ``gl.edInput`` column.
    target_class : optional
        Label value to measure. Defaults to ``2``, the class this notebook
        reports on.

    Returns
    -------
    float
        Share of rows in ``target_class`` as a percentage in ``[0, 100]``.
        An empty frame yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    num_rows = len(df_edInput)
    if num_rows == 0:
        # Nothing to measure; avoid dividing by zero on an empty frame.
        return 0.0
    # Summing the boolean mask counts the matches without building a filtered copy.
    num_in_class = int((df_edInput[gl.edInput] == target_class).sum())
    return (num_in_class / num_rows) * 100


In [3]:
# Share of tweets labelled as class 2. The value is a percentage, not a count,
# so the log message says so explicitly.
perc_class2 = get_balance_metric(df_edInput)
logging.info(f"The percentage of tweets in class 2 is {round(perc_class2, 2)}%")

2023-01-06 12:19:28,544 | INFO : The number of tweets in the of class 2 are 40.59%


Now vectorise the tweets. This converts the text into a matrix of token counts.

In [4]:
# df_text is rebound under the same name from the loaded object to a one-column
# DataFrame. Guard the conversion so re-running this cell does not fail by calling
# .to_frame() on something that is already a DataFrame.
if not isinstance(df_text, pd.DataFrame):
    df_text = df_text.to_frame()
    df_text.columns = [gl.text]
# Bag-of-words counts with scikit-learn's built-in English stop-word list removed.
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(df_text[gl.text])
X_vec

<30024x25956 sparse matrix of type '<class 'numpy.int64'>'
	with 299816 stored elements in Compressed Sparse Row format>

In [5]:
# Re-weight the raw token counts in X_vec with TF-IDF; the result keeps the same
# sparse shape (documents x vocabulary) with float weights instead of integer counts.
tfidf = TfidfTransformer() # by default applies "l2" normalization
X_tfidf = tfidf.fit_transform(X_vec)
X_tfidf

<30024x25956 sparse matrix of type '<class 'numpy.float64'>'
	with 299816 stored elements in Compressed Sparse Row format>

In [6]:
# Persist the TF-IDF matrix to the pickle path held in gl.pkl_X_tfidf.
save_pickle_object(X_tfidf, gl.pkl_X_tfidf)

2023-01-06 12:19:29,173 | INFO : Saving pickle file from: pickle\pkl_X_tfidf.pkl


Wrap the steps above in a reusable function, so that the TF-IDF matrix can also be built from n-grams and from text that has been stemmed or lemmatized.

In [7]:
def create_tfidf_matrix(df, pickle_file, ngram=None):
    """Build a TF-IDF matrix from the text column of ``df`` and pickle it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents in its ``gl.text`` column.
    pickle_file
        Destination passed straight through to ``save_pickle_object``.
    ngram : tuple of (int, int), optional
        Forwarded to ``CountVectorizer`` as ``ngram_range``. When ``None``
        (the default) the argument is omitted, so scikit-learn's own default
        applies.

    Returns
    -------
    None
        The matrix is only written to ``pickle_file``.
    """
    # Only pass ngram_range when the caller asked for one; English stop words
    # are always removed.
    vectorizer_kwargs = {} if ngram is None else {'ngram_range': ngram}
    vectorizer = CountVectorizer(stop_words='english', **vectorizer_kwargs)
    X_vec = vectorizer.fit_transform(df[gl.text])
    tfidf = TfidfTransformer()  # by default applies "l2" normalization
    X_tfidf = tfidf.fit_transform(X_vec)
    save_pickle_object(X_tfidf, pickle_file)

Now try bigrams and trigrams to see if we get different results downstream.

In [9]:
# These tuples are passed to CountVectorizer as ngram_range=(min_n, max_n):
# (2, 2) keeps only bigrams and (3, 3) keeps only trigrams.
bigram = (2, 2)
trigram = (3, 3)
# Build and pickle one TF-IDF matrix per n-gram size from the text in df_text.
create_tfidf_matrix(df_text, gl.pkl_X_bigram_tfidf, bigram)
create_tfidf_matrix(df_text, gl.pkl_X_trigram_tfidf, trigram)

2023-01-06 12:20:09,343 | INFO : Saving pickle file from: pickle\pkl_X_bigram_tfidf.pkl
2023-01-06 12:20:10,302 | INFO : Saving pickle file from: pickle\pkl_X_trigram_tfidf.pkl


This time we will apply stemming to the text and see if we get different results downstream.<br>
First we tokenise the text with spaCy, then we stem each token with NLTK's Snowball stemmer.

In [10]:
# Tokenise each tweet with spaCy's tokenizer only (no tagging or parsing), stem
# every token with the Snowball stemmer, and re-join the stems with single spaces.
df_stem_text = (
    df_text[gl.text]
    .apply(
        lambda tweet: " ".join(
            stemmer.stem(token.text) for token in sp.tokenizer(tweet)
        )
    )
    .to_frame(name=gl.text)
)
df_stem_text

Unnamed: 0,text
12252,5402612 uk prime minist theresa may will face ...
14042,5402612 uk pm theresa may win confid vote with...
16954,705706292 the probe of the inaugur fund part a...
18004,25984418 the week brexit hit the brick wall : ...
18396,61183568 have watch these kind of pictur look ...
...,...
785779,4805771380 this bouquet of rose is complet edibl
785809,4805771380 this fanci mcdonald has a handwash ...
785813,2401975454 spoiler ahead finish # strangerth...
785829,4805771380 these cake are top with yogurt


In [11]:
# Build the TF-IDF matrix from the stemmed text and pickle it to gl.pkl_X_stem_tfidf.
create_tfidf_matrix(df_stem_text, gl.pkl_X_stem_tfidf)

2023-01-06 12:20:18,416 | INFO : Saving pickle file from: pickle\pkl_X_stem_tfidf.pkl


Now try lemmatization

In [12]:
# Lemmatising needs the full spaCy pipeline. sp.pipe() streams the tweets through it
# in batches, which avoids the per-row overhead of calling sp(x) inside Series.apply.
# The original index is re-attached so rows still line up with df_text.
# NOTE(review): lemmas are expected to match the per-row version; spot-check a sample
# if exact reproducibility of the pickled matrix matters.
df_lem_text = pd.DataFrame(
    {gl.text: [" ".join(token.lemma_ for token in doc) for doc in sp.pipe(df_text[gl.text])]},
    index=df_text.index,
)
df_lem_text

Unnamed: 0,text
12252,5402612 UK Prime Minister Theresa May will fac...
14042,5402612 UK PM Theresa May win confidence vote ...
16954,705706292 the probe of the inaugural fund part...
18004,25984418 the week Brexit hit the brick wall : ...
18396,61183568 have watch these kind of picture look...
...,...
785779,4805771380 this bouquet of rose be completely ...
785809,4805771380 this fancy McDonalds have a handwas...
785813,2401975454 spoiler ahead finished # Stranger...
785829,4805771380 these cake be top with yogurt


In [13]:
# Build the TF-IDF matrix from the lemmatized text and pickle it to gl.pkl_X_lem_tfidf.
create_tfidf_matrix(df_lem_text, gl.pkl_X_lem_tfidf)

2023-01-06 12:23:25,060 | INFO : Saving pickle file from: pickle\pkl_X_lem_tfidf.pkl
