# 2020_10_17 Preprocessing Revisited

The purpose of this notebook is to revisit the preprocessing steps from start to finish in order to make a unified pipeline. The aim is to reduce the dimensionality of the TDFIDF/CountVectorizer matrix to the greatest extent possible.

In [2]:
DSET_FOLDER_PATH = './dataset/quora/'
CORPUS_FOLDER_PATH = './corpi/'
import nltk 
import re 
import contractions 
import jamspell
import pandas as pd
import wordninja
from tqdm import tqdm 
tqdm.pandas()

In [3]:
train_dset_df = pd.read_csv(DSET_FOLDER_PATH + "train.csv")
test_dset_df = pd.read_csv(DSET_FOLDER_PATH + "test.csv")

In [4]:
class Preprocessor_2020_10_17:
    def __init__(self, jamspell_corpus,word_term=0, freq_term=1, separator=" ", stemmer="snowball"):
        '''
        Parameters:
            symspell_corpus: path to textfile of word-frequency pairs.
        '''
        self.tokenizer = nltk.tokenize.WhitespaceTokenizer()
        self.spellChecker = jamspell.TSpellCorrector()
        self.spellChecker.LoadLangModel(jamspell_corpus) 
        self.stopwordCorpus = set(nltk.corpus.stopwords.words())
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        self.nltk_tag_to_wordnet_tag = {'J':nltk.corpus.wordnet.ADJ, 'V':nltk.corpus.wordnet.VERB, 'N':nltk.corpus.wordnet.NOUN, 'R':nltk.corpus.wordnet.ADJ}
        if(stemmer == "porter"):
            self.stemmer = nltk.stem.PorterStemmer()
        elif(stemmer == "snowball"):
            self.stemmer = nltk.SnowballStemmer("english")
        elif(stemmer == "lancaster"):
            self.stemmer = nltk.LancasterStemmer()
        else:
            print("Error. Incorrect keyword passed for stemmer.")
            raise Exception
    def preprocess(self, sentence, spellcheck= True, stopword_removal = True, lemmatization=True, stemming=True):
        '''
        A string
        '''
        sentence= sentence.lower() #1
        if(spellcheck):
            sentence = self.spellChecker.FixFragment(sentence)
        sentence= contractions.fix(sentence) #2 
        tokenized_sentence= self.tokenizer.tokenize(sentence) #3
        tokenized_sentence= [''.join([i for i in s if i.isalpha()])for s in tokenized_sentence] #4
        if(spellcheck):
            new_sentence = []
            for word in tokenized_sentence:
                new_sentence += wordninja.split(word)
            tokenized_sentence = new_sentence
        tokenized_sentence= [i for i in tokenized_sentence if len(i) > 0] #4
        if(stopword_removal):
            tokenized_sentence= [word for word in tokenized_sentence if not word in self.stopwordCorpus]
        if(lemmatization):
            tokenized_sentence = nltk.pos_tag(tokenized_sentence)
            tokenized_sentence = [(word[0], self.nltk_tag_to_wordnet_tag.get(word[1][0] if len(word[1]) > 0 else None, nltk.corpus.wordnet.NOUN)) for word in tokenized_sentence]
            tokenized_sentence = [self.lemmatizer.lemmatize(word[0], pos=word[1]) for word in tokenized_sentence]
        if(stemming):
            tokenized_sentence = [self.stemmer.stem(word) for word in tokenized_sentence]
        return tokenized_sentence






In [5]:
pp = Preprocessor_2020_10_17(CORPUS_FOLDER_PATH + "en.bin", stemmer="snowball")

In [6]:
train_dset_df["preprocessed"] = train_dset_df["question_text"].progress_apply(lambda x: pp.preprocess(x, lemmatization=True, stemming=True, stopword_removal=True))

100%|██████████| 783673/783673 [27:40<00:00, 472.06it/s]


In [7]:
test_dset_df["preprocessed"] = test_dset_df["question_text"].progress_apply(lambda x: pp.preprocess(x, lemmatization=True, stemming=True, stopword_removal=True))

100%|██████████| 522449/522449 [18:22<00:00, 473.74it/s]


In [8]:
train_dset_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 783673 entries, 0 to 783672
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   qid            783673 non-null  object
 1   question_text  783673 non-null  object
 2   target         783673 non-null  int64 
 3   preprocessed   783673 non-null  object
dtypes: int64(1), object(3)
memory usage: 23.9+ MB


In [10]:
train_dset_df["preprocessed_joined"] =  train_dset_df.preprocessed.progress_apply(" ".join)

100%|██████████| 783673/783673 [00:00<00:00, 1312118.37it/s]


In [11]:
test_dset_df["preprocessed_joined"] =  test_dset_df.preprocessed.progress_apply(" ".join)

100%|██████████| 522449/522449 [00:00<00:00, 1284054.68it/s]


In [12]:
train_dset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 783673 entries, 0 to 783672
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   qid                  783673 non-null  object
 1   question_text        783673 non-null  object
 2   target               783673 non-null  int64 
 3   preprocessed         783673 non-null  object
 4   preprocessed_joined  783673 non-null  object
dtypes: int64(1), object(4)
memory usage: 29.9+ MB


In [13]:
test_dset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522449 entries, 0 to 522448
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   qid                  522449 non-null  object
 1   question_text        522449 non-null  object
 2   preprocessed         522449 non-null  object
 3   preprocessed_joined  522449 non-null  object
dtypes: object(4)
memory usage: 15.9+ MB


In [14]:
train_dset_df.drop(inplace=True, axis="columns", labels =["question_text", "preprocessed"])

In [15]:
test_dset_df.drop(inplace=True, axis="columns", labels =["question_text", "preprocessed"])

In [16]:
train_dset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 783673 entries, 0 to 783672
Data columns (total 3 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   qid                  783673 non-null  object
 1   target               783673 non-null  int64 
 2   preprocessed_joined  783673 non-null  object
dtypes: int64(1), object(2)
memory usage: 17.9+ MB


In [17]:
test_dset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522449 entries, 0 to 522448
Data columns (total 2 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   qid                  522449 non-null  object
 1   preprocessed_joined  522449 non-null  object
dtypes: object(2)
memory usage: 8.0+ MB


In [18]:
train_dset_df.to_csv("2020_10_18_train_dset_df.csv", index=False)

In [19]:
test_dset_df.to_csv("2020_10_18_test_dset_df.csv", index=False)

# Building a dummy model for comparison

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
vectorizer = CountVectorizer()

In [39]:
vectorizer.fit(train_dset_df["preprocessed_joined"])

CountVectorizer()

In [40]:
sparse_x_train = vectorizer.transform(train_dset_df["preprocessed_joined"])

In [1]:
sparse_x_train

NameError: name 'sparse_x_train' is not defined

In [42]:
from sklearn.naive_bayes import BernoulliNB

In [77]:
mnb = BernoulliNB(class_prior=(0.95, 0.05))

In [78]:
mnb.fit(sparse_x_train, train_dset_df["target"])

BernoulliNB(class_prior=(0.95, 0.05))

In [79]:
yhat = mnb.predict(sparse_x_train)

In [80]:
from sklearn.metrics import f1_score

In [81]:
y = train_dset_df["target"]

In [82]:
f1_score(y, yhat)

0.5378409739745454

In [83]:
train_dset_df["yhat"] = yhat

In [84]:
wrongs = train_dset_df[train_dset_df["yhat"] != train_dset_df["target"]]

In [85]:
[(name, len(x)) for name, x in wrongs.groupby(by="target")]

[(0, 30934), (1, 19250)]

In [86]:
[(name, len(x)) for name, x in train_dset_df.groupby(by="target")]

[(0, 735222), (1, 48451)]

In [87]:
train_dset_df.to_csv("2020_10_18_train_dset_df.csv")

In [88]:
test_dset_df.to_csv("2020_10_18_test_dset_df.csv")

# Building an SVM Model for further testing

In [89]:
from sklearn.svm import LinearSVC

In [90]:
svm = LinearSVC(verbose=2, max_iter = 4000, class_weight= {0:1, 1:8})

In [91]:
svm.fit(sparse_x_train, y)

[LibLinear]

LinearSVC(class_weight={0: 1, 1: 4}, max_iter=4000, verbose=2)

In [92]:
yhat = svm.predict(sparse_x_train)

In [93]:
y = train_dset_df["target"]

In [94]:
train_dset_df["yhat"] = yhat

In [95]:
wrongs = train_dset_df[train_dset_df["yhat"] != train_dset_df["target"]]

In [96]:
f1_score(y, yhat)

0.6448837041837691

In [97]:
[(name, len(x)) for name, x in wrongs.groupby(by="target")]

[(0, 26823), (1, 12629)]

In [98]:
[(name, len(x)) for name, x in train_dset_df.groupby(by="target")]

[(0, 735222), (1, 48451)]

# Testing

In [99]:
sparse_x = vectorizer.transform(test_dset_df["preprocessed_joined"])

In [100]:
yhat = svm.predict(sparse_x)

In [101]:
test_dset_df.question_text = yhat

In [105]:
test_dset_df = test_dset_df.drop(axis="columns", labels="preprocessed")
test_dset_df.head()

Unnamed: 0,qid,target
0,f56a9a31974dc66186e8,0
1,d957c3758060f45da303,0
2,ad822d5abaedb9e247b9,0
3,4e979c23eeb6a4bd1f2e,0
4,333cc031262566b8da49,0


In [103]:
test_dset_df = test_dset_df.rename(columns={"qid":"qid", "question_text":"target"})
test_dset_df.target = test_dset_df.target.apply(round)

In [106]:
test_dset_df.to_csv("./outputs/2020_10_18_a_testset_output.csv", index=False)