In [1]:
import pandas as pd
import codecs
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, precision_score,recall_score

import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec

from tqdm import tqdm
import pickle
import warnings
warnings.filterwarnings("ignore")

from sklearn_deltatfidf import DeltaTfidfVectorizer

# Data Loading and Preprocessing

In [2]:
# Read the knowledge base from disk and keep only its first column,
# wrapped back into a one-column DataFrame.
knowledge_base_raw = pd.read_csv('knowledge_base.csv')
covid_data = pd.DataFrame(knowledge_base_raw.iloc[:, 0])

In [3]:
# Display (rows, columns) of the one-column frame built from the CSV.
covid_data.shape

(408, 1)

In [4]:
# Give the single column a stable name and preview its first entry.
covid_data.columns = ['OriginalText']
first_entry = covid_data['OriginalText'].iloc[0]
print(first_entry)

# NOTE: this binds a second name to the same DataFrame (no copy is made),
# so columns added through `train_x_tf` also appear on `covid_data`.
train_x_tf = covid_data

What is a coronavirus


In [5]:
from bs4 import BeautifulSoup  # not used in this cell; kept in case other cells rely on the name
from tqdm import tqdm


def _clean_text(text):
    """Replace every non-ASCII-letter character with a space and trim the ends.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        `text` with each character outside a-z / A-Z turned into a single
        space, then stripped of leading and trailing whitespace. Interior
        runs of spaces are left as they are.

    Raises
    ------
    TypeError
        If `text` is not a string (for example a missing value read as NaN).
    """
    return re.sub('[^a-zA-Z]', ' ', text).strip()


# Clean EVERY row. The earlier loop iterated over range(0, n - 1) and therefore
# left the last row's CleanedText as an empty string; it also computed two
# intermediate substitutions whose result was overwritten before being stored.
# tqdm is for printing the status bar
train_x_tf['CleanedText'] = [
    _clean_text(text) for text in tqdm(train_x_tf['OriginalText'].tolist())
]
train_x_tf.head()


100%|███████████████████████████████████████████████████████████████████████████████| 407/407 [00:00<00:00, 717.60it/s]


Unnamed: 0,OriginalText,CleanedText
0,What is a coronavirus,What is a coronavirus
1,What kind of diseases are caused by corona virus,What kind of diseases are caused by corona virus
2,What is covid-19,What is covid
3,When does corona disease started,When does corona disease started
4,what are the symptoms of covid-19,what are the symptoms of covid


In [6]:
import nltk
import sklearn

# Record the library versions this notebook was run with.
nltk_version = nltk.__version__
sklearn_version = sklearn.__version__

print('The nltk version is {}.'.format(nltk_version))
print('The scikit-learn version is {}.'.format(sklearn_version))


The nltk version is 3.4.5.
The scikit-learn version is 0.22.2.


## TF-IDF Implementation

In [8]:
# TF-IDF features for the cleaned text, followed by scaling and persistence.

# Vectorizer over unigrams and bigrams; min_df=10 ignores terms that occur
# in fewer than 10 documents.
tf_idf_vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=10)

# Learn the vocabulary and IDF weights from the cleaned text. fit() returns the
# vectorizer itself, so `tf_idf_vect` is the fitted vectorizer.
cleaned_corpus = train_x_tf['CleanedText']
tf_idf_vect = tf_idf_vectorizer.fit(cleaned_corpus)

# Encode the same text with the fitted vectorizer (one row per document,
# one column per retained term).
final_xtr = tf_idf_vect.transform(cleaned_corpus)

# Persist the fitted vectorizer.
with open('covid_tf_idf_vect.pkl', 'wb') as f:
    pickle.dump(tf_idf_vect, f)


##-----------------Standardizing --- START

# Scaler for the TF-IDF matrix. with_mean=False disables centering, so only the
# per-feature scale is learned and applied.
# NOTE: despite its name, `final_xtr_std` is the fitted scaler object, not the
# scaled data.
final_xtr_std = StandardScaler(with_mean=False).fit(final_xtr)

print("~~~~ STANDARDIZATION : Training ~~~~~")
# Apply the learned per-feature scaling to the training matrix.
standardized_tfidf_train = final_xtr_std.transform(final_xtr)
print('Shape after standarizing:',standardized_tfidf_train.shape)
print(type(standardized_tfidf_train))

# Persist the fitted scaler and the scaled training matrix, in that order.
for pkl_name, pkl_obj in (
    ('covid_final_xtr_std.pkl', final_xtr_std),
    ('covid_standardized_tfidf_train.pkl', standardized_tfidf_train),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(pkl_obj, f)
    


~~~~ STANDARDIZATION : Training ~~~~~
Shape after standarizing: (408, 105)
<class 'scipy.sparse.csr.csr_matrix'>


***