The data was cleaned in p1_clean.ipynb and stored in pickle files, which this notebook loads.


In [1]:
import logging, importlib, sys, tqdm
import spacy
import pandas as pd
import os, re
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from _pckle import save_pickle_object, load_pickle_object
from _logging import set_logging
from _graph import histplot_count, histplot_range_count
from _utility import gl

# Configure logging via the project helper (the cell output shows timestamped INFO lines).
set_logging(logging)
# Load the pickled inputs; the file locations come from the project's `gl` settings object.
# NOTE(review): pickle files must come from a trusted source - unpickling can execute arbitrary code.
# df_text supplies the text column that is vectorised below; df_is_business supplies the
# flags summed for the class-balance figure.
df_text = load_pickle_object(gl.pkl_df_text)
df_is_business = load_pickle_object(gl.pkl_df_is_business)


2023-01-03 18:57:06,118 | INFO : Loading pickle file from: pickle\pkl_df_text.pkl
2023-01-03 18:57:06,390 | INFO : Loading pickle file from: pickle\pkl_df_is_business.pkl


Find out the class balance of the dataset, i.e. the percentage of tweets labelled as business.

In [2]:
def get_balance_metric(df_is_business: pd.DataFrame) -> float:
    """Return the percentage of rows flagged as business.

    Only the first column of ``df_is_business`` is used. Its values are summed,
    so they are expected to be 0/1 (or boolean) flags.

    Args:
        df_is_business: Frame whose first column holds the business flag,
            one row per sample.

    Returns:
        The share of flagged rows as a percentage (0-100 for 0/1 flags).

    Raises:
        ValueError: If the frame has no rows or no columns, because the
            percentage is undefined in that case.
    """
    # Guard explicitly: without it an empty frame ends in a 0/0 division or an
    # opaque IndexError, depending on which axis is empty.
    if df_is_business.empty:
        raise ValueError("df_is_business is empty; cannot compute the class balance")

    num_rows = len(df_is_business)
    # Sum only the column that is actually used instead of every column.
    number_in_business_category = df_is_business.iloc[:, 0].sum()
    # Cast so callers get a plain Python float rather than a numpy scalar.
    return float(number_in_business_category / num_rows * 100)


In [3]:
# Report the class balance; the value is a percentage, not a count.
perc_business = get_balance_metric(df_is_business)
logging.info(f"The percentage of tweets in the business category is {perc_business:.2f}%")

2023-01-03 18:57:06,464 | INFO : The number of tweets in the business category are 20.94%


Now vectorise the tweets: build bag-of-words term counts, then apply TF-IDF weighting.

In [10]:
# Extract the text column once; this same array is what the vectoriser is fitted on.
arr_text = df_text[gl.text].to_numpy()
# Bag-of-words term counts, dropping scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(arr_text)
# Display the sparse matrix summary (shape, dtype, number of stored elements).
X_vec

<785916x189828 sparse matrix of type '<class 'numpy.int64'>'
	with 7196831 stored elements in Compressed Sparse Row format>

In [11]:
tfidf = TfidfTransformer() # by default applies "l2" normalization
# Re-weight the raw term counts in X_vec into TF-IDF scores.
X_tfidf = tfidf.fit_transform(X_vec)
# Keep the result sparse: at the shape shown in this cell's output (785916 x 189828, float64),
# calling .todense() would need over 1 TB of memory.
X_tfidf

<785916x189828 sparse matrix of type '<class 'numpy.float64'>'
	with 7196831 stored elements in Compressed Sparse Row format>

In [13]:
# Persist the TF-IDF matrix via the project pickle helper so it can be reloaded without
# recomputing it. NOTE(review): presumably consumed by a later notebook - confirm.
save_pickle_object(X_tfidf, gl.pkl_X_tfidf)

2023-01-03 19:12:54,634 | INFO : Saving pickle file from: pickle\pkl_X_tfidf.pkl
