In [5]:
# Folder holding the dataset's train.csv / test.csv files (read further down with pd.read_csv).
DSET_FOLDER_PATH = './dataset/quora/'
# Folder holding the "en.bin" file that is handed to the preprocessor's spell checker.
CORPUS_FOLDER_PATH = './corpi/'
import nltk 
import re 
import contractions 
import jamspell
import pandas as pd
import wordninja
from tqdm import tqdm 
# Register tqdm with pandas so that `progress_apply` (used on the Series below) is available.
tqdm.pandas()

In [6]:
# Read the train and test splits from the dataset folder into DataFrames.
train_dset_df = pd.read_csv(f"{DSET_FOLDER_PATH}train.csv")
test_dset_df = pd.read_csv(f"{DSET_FOLDER_PATH}test.csv")

In [7]:
class Preprocessor_2020_10_17:
    '''
    Text-normalisation pipeline that turns a raw sentence into a list of tokens.

    Steps, in order: lowercase, optional JamSpell correction, contraction
    expansion, whitespace tokenisation, removal of non-alphabetic characters,
    optional wordninja splitting, optional stop-word removal, optional WordNet
    lemmatisation and optional stemming.
    '''

    def __init__(self, jamspell_corpus, word_term=0, freq_term=1, separator=" ", stemmer="snowball"):
        '''
        Parameters:
            jamspell_corpus: path to the JamSpell language-model file passed to
                `LoadLangModel`.
            word_term, freq_term, separator: accepted but not used by this class.
            stemmer: which stemmer `preprocess` applies; one of "porter",
                "snowball" or "lancaster".
        Raises:
            ValueError: if `stemmer` is not one of the supported keywords.
            RuntimeError: if JamSpell reports that the model could not be loaded.
        '''
        stemmer_factories = {
            "porter": nltk.stem.PorterStemmer,
            "snowball": lambda: nltk.stem.SnowballStemmer("english"),
            "lancaster": nltk.stem.LancasterStemmer,
        }
        # Validate the cheap argument first so a typo fails before the model is loaded.
        if stemmer not in stemmer_factories:
            raise ValueError(
                "Error. Incorrect keyword passed for stemmer: %r (expected one of %s)."
                % (stemmer, ", ".join(sorted(stemmer_factories)))
            )

        self.tokenizer = nltk.tokenize.WhitespaceTokenizer()
        self.spellChecker = jamspell.TSpellCorrector()
        # LoadLangModel signals failure through its return value, not an exception;
        # ignoring it would leave a spell checker that silently has no model.
        if self.spellChecker.LoadLangModel(jamspell_corpus) is False:
            raise RuntimeError("JamSpell could not load language model: %r" % (jamspell_corpus,))
        # Restrict to English: words() without an argument returns the stop words
        # of every language in the corpus, which would drop valid English words.
        self.stopwordCorpus = set(nltk.corpus.stopwords.words("english"))
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        # First letter of the POS tag returned by nltk.pos_tag -> WordNet POS.
        # 'R' tags are adverbs, so they must map to ADV (not ADJ).
        self.nltk_tag_to_wordnet_tag = {
            'J': nltk.corpus.wordnet.ADJ,
            'V': nltk.corpus.wordnet.VERB,
            'N': nltk.corpus.wordnet.NOUN,
            'R': nltk.corpus.wordnet.ADV,
        }
        self.stemmer = stemmer_factories[stemmer]()

    def preprocess(self, sentence, spellcheck=True, stopword_removal=True, lemmatization=True, stemming=True):
        '''
        Convert one sentence into a list of normalised tokens.

        Parameters:
            sentence: the text to process. None or a float NaN (e.g. what
                pandas.read_csv yields for an empty cell) gives an empty list.
            spellcheck: run JamSpell's FixFragment on the sentence and, after
                tokenisation, split every token with wordninja.
            stopword_removal: drop tokens found in the English stop-word set.
            lemmatization: POS-tag the tokens and lemmatise them with WordNet;
                tags without a mapping are treated as nouns.
            stemming: apply the stemmer chosen in the constructor.
        Returns:
            list of str tokens (possibly empty).
        '''
        # Missing text has no tokens; NaN is the only float that differs from itself.
        if sentence is None or (isinstance(sentence, float) and sentence != sentence):
            return []

        sentence = sentence.lower()  # 1: case folding
        if spellcheck:
            sentence = self.spellChecker.FixFragment(sentence)
        sentence = contractions.fix(sentence)  # 2: expand contractions
        tokens = self.tokenizer.tokenize(sentence)  # 3: whitespace tokenisation
        # 4: keep only alphabetic characters inside each token
        tokens = [''.join(ch for ch in token if ch.isalpha()) for token in tokens]
        if spellcheck:
            # Break run-together words into their parts.
            tokens = [piece for token in tokens for piece in wordninja.split(token)]
        # 5: tokens that consisted solely of non-letters are now empty; drop them
        tokens = [token for token in tokens if token]
        if stopword_removal:
            tokens = [token for token in tokens if token not in self.stopwordCorpus]
        if lemmatization:
            default_pos = nltk.corpus.wordnet.NOUN
            tokens = [
                # tag[:1] is '' for an empty tag, which falls through to the default.
                self.lemmatizer.lemmatize(word, pos=self.nltk_tag_to_wordnet_tag.get(tag[:1], default_pos))
                for word, tag in nltk.pos_tag(tokens)
            ]
        if stemming:
            tokens = [self.stemmer.stem(token) for token in tokens]
        return tokens






In [8]:
# Build the shared preprocessor from the "en.bin" file in the corpus folder.
pp = Preprocessor_2020_10_17(
    f"{CORPUS_FOLDER_PATH}en.bin",
    stemmer="snowball",
)

In [9]:
# Tokenise every training question: lemmatised, but neither stemmed nor stop-word filtered.
train_dset_df["preprocessed"] = train_dset_df["question_text"].progress_apply(
    lambda question: pp.preprocess(
        question,
        stopword_removal=False,
        lemmatization=True,
        stemming=False,
    )
)

100%|██████████| 783673/783673 [27:47<00:00, 469.98it/s]


In [10]:
# Tokenise every test question with the same settings used for the training split.
test_dset_df["preprocessed"] = test_dset_df["question_text"].progress_apply(
    lambda question: pp.preprocess(
        question,
        stopword_removal=False,
        lemmatization=True,
        stemming=False,
    )
)

100%|██████████| 522449/522449 [18:30<00:00, 470.55it/s]


In [11]:
# Collapse each training token list back into one space-separated string.
train_dset_df["preprocessed_joined"] = train_dset_df["preprocessed"].progress_apply(
    lambda tokens: " ".join(tokens)
)

100%|██████████| 783673/783673 [00:00<00:00, 1121098.34it/s]


In [12]:
# Collapse each test token list back into one space-separated string.
test_dset_df["preprocessed_joined"] = test_dset_df["preprocessed"].progress_apply(
    lambda tokens: " ".join(tokens)
)

100%|██████████| 522449/522449 [00:00<00:00, 1140127.94it/s]


In [13]:
# Remove the raw text and the token-list column from the training frame (in place).
train_dset_df.drop(
    columns=["question_text", "preprocessed"],
    inplace=True,
)

In [14]:
# Remove the raw text and the token-list column from the test frame (in place).
test_dset_df.drop(
    columns=["question_text", "preprocessed"],
    inplace=True,
)

In [15]:
# Write the processed training frame to CSV without the index column.
train_dset_df.to_csv(
    "2020_10_19_train_dset_df_nostem_nostoprem.csv",
    index=False,
)

In [16]:
# Write the processed test frame to CSV without the index column.
test_dset_df.to_csv(
    "2020_10_19_test_dset_df_nostem_nostoprem.csv",
    index=False,
)