# Optimizing preprocessing quality

In [60]:
DSET_FOLDER_PATH = './dataset/quora/'
GLOVE_FOLDER_PATH = './embeddings/glove/'
CORPUS_FOLDER_PATH = './corpi/'
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt  
import wordcloud as wc 
import seaborn as sns 
import nltk
import re
import contractions
import symspellpy
train_dset_df = pd.read_csv(DSET_FOLDER_PATH + "train.csv")
from tqdm import tqdm 
tqdm.pandas()
import gc

In [61]:
# class Preprocessor:
#     def __init__(self, path_to_words_corpus):
#         self.sym_spell = symspellpy.SymSpell()
#         self.sym_spell.create_dictionary(path_to_words_corpus)
#         self.tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
#         self.stopwords_corpus = set(nltk.corpus.stopwords.words())
#         self.stemmer = nltk.stem.PorterStemmer()
#     def preprocess(self,sentence, remove_stopwords=True, stem_reduce=True):
#         sentence = sentence.lower()
#         sentence = re.sub(r"\d+", "", sentence)
#         sentence = contractions.fix(sentence)
#         sentence = self.tokenizer.tokenize(sentence)
#         if(remove_stopwords):
#             sentence = [word for word in sentence if not word in self.stopwords_corpus]
#         if(stem_reduce):
#             sentence = [self.stemmer.stem(word) for word in sentence]
#         sentence = [self.sym_spell.lookup(word, 0, include_unknown=True)[0].term for word in sentence]
#         return sentence

In [62]:
# preprocessor = Preprocessor(CORPUS_FOLDER_PATH + "words_alpha.txt")



In [63]:
# train_dset_df["preprocessed"] = train_dset_df.question_text.progress_apply(lambda x: preprocessor.preprocess(x, stem_reduce=False))
# train_dset_df["preprocessed_joined"] = train_dset_df.preprocessed.progress_apply(lambda x: " ".join(x))
# string_of_all_words = " ".join(train_dset_df.preprocessed_joined.to_list())
# list_of_all_words = string_of_all_words.split()
# set_of_all_words = set(list_of_all_words)
# len(set_of_all_words)
# string_of_all_words = " ".join(sorted(list(set_of_all_words)))
# file = open("2020_10_04_unique_words_0.txt", "w")
# file.write(string_of_all_words)
# file.close()

Result: The preprocessing is generally good, but lemmatization is needed. In order to do this lemmatization, let us do the tagging.

In [64]:
class Preprocessor:
    def __init__(self, path_to_words_corpus):
        self.sym_spell = symspellpy.SymSpell()
        self.sym_spell.create_dictionary(path_to_words_corpus)
        self.tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
        self.stopwords_corpus = set(nltk.corpus.stopwords.words())
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        self.nltk_tag_to_wordnet_tag = {'J':nltk.corpus.wordnet.ADJ, 'V':nltk.corpus.wordnet.VERB, 'N':nltk.corpus.wordnet.NOUN, 'R':nltk.corpus.wordnet.ADJ}
    def preprocess(self,sentence, remove_stopwords=True, lemmatize=True):
        sentence = re.sub(r"\d+", "", sentence)
        sentence = contractions.fix(sentence)
        sentence = self.tokenizer.tokenize(sentence)
        if(remove_stopwords):
            sentence = [word for word in sentence if not word in self.stopwords_corpus]
        
        if(lemmatize):
            sentence = nltk.pos_tag(sentence)
            sentence = [(word[0], self.nltk_tag_to_wordnet_tag.get(word[1][0] if len(word[1]) > 0 else None, nltk.corpus.wordnet.NOUN)) for word in sentence]
            sentence = [self.lemmatizer.lemmatize(word[0], pos=word[1]) for word in sentence]
        sentence = [word.lower() for word in sentence]
        sentence = [self.sym_spell.lookup(word, 0, include_unknown=True)[0].term for word in sentence]
        return sentence
preprocessor = Preprocessor(CORPUS_FOLDER_PATH + "words_alpha.txt")

In [65]:
sample_sentence = train_dset_df["question_text"][0]

In [66]:
preprocessor.preprocess(sample_sentence)

[&#39;how&#39;,
 &#39;i&#39;,
 &#39;reply&#39;,
 &#39;comment&#39;,
 &#39;india&#39;,
 &#39;poor&#39;,
 &#39;it&#39;,
 &#39;fact&#39;,
 &#39;i&#39;,
 &#39;understand&#39;,
 &#39;unnecessary&#39;,
 &#39;criticism&#39;,
 &#39;snapshot&#39;,
 &#39;cero&#39;,
 &#39;statement&#39;]

In [67]:
train_dset_df["preprocessed"] = train_dset_df.question_text.progress_apply(lambda x: preprocessor.preprocess(x))
train_dset_df.to_csv("2020_10_04_preprocessed_train.csv")

100%|██████████| 783673/783673 [09:03&lt;00:00, 1442.64it/s]


In [68]:
test_dset_df = pd.read_csv(DSET_FOLDER_PATH + "test.csv")
test_dset_df["preprocessed"] = test_dset_df.question_text.progress_apply(lambda x: preprocessor.preprocess(x))
test_dset_df.to_csv("2020_10_04_preprocessed_test.csv")

100%|██████████| 522449/522449 [05:47&lt;00:00, 1501.79it/s]
