In [1]:
# --- Imports: standard library first, then third-party packages -------------
import re

import contractions
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import symspellpy
import wordcloud as wc
from tqdm import tqdm

# --- Configuration: folders (relative to the notebook) for inputs -----------
DSET_FOLDER_PATH = './dataset/quora/'
GLOVE_FOLDER_PATH = './embeddings/glove/'
CORPUS_FOLDER_PATH = './corpi/'

# Enable the tqdm integration for pandas so `progress_apply` is available.
tqdm.pandas()

# Training split. NOTE(review): no cell visible in this notebook reads
# `train_dset_df` afterwards; confirm whether this load is still needed.
train_dset_df = pd.read_csv(DSET_FOLDER_PATH + "train.csv")

In [2]:
class Preprocessor:
    """Turn a raw sentence into a list of normalised tokens.

    ``preprocess`` applies, in order: lower-casing, digit removal,
    contraction expansion, tokenisation into runs of word characters,
    stop-word removal, Porter stemming and a SymSpell dictionary lookup.
    """

    def __init__(self, path_to_words_corpus, stopwords_language=None):
        """Load the spelling dictionary and build the NLTK helpers.

        Args:
            path_to_words_corpus: Path of the text corpus passed to
                ``SymSpell.create_dictionary``.
            stopwords_language: Optional NLTK stop-word file id (or list of
                ids), e.g. ``"english"``. The default ``None`` is forwarded
                unchanged, which is equivalent to the previous bare
                ``stopwords.words()`` call and therefore loads the stop-word
                lists of every language shipped with the corpus.

        Raises:
            FileNotFoundError: If SymSpell reports that the corpus could not
                be loaded.
        """
        self.sym_spell = symspellpy.SymSpell()
        # create_dictionary reports failure through its return value only;
        # ignoring it would leave an empty dictionary and make the spelling
        # step a silent no-op.
        if not self.sym_spell.create_dictionary(path_to_words_corpus):
            raise FileNotFoundError(
                f"SymSpell could not load a dictionary from {path_to_words_corpus!r}"
            )
        self.tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
        # A set gives constant-time membership tests in preprocess().
        self.stopwords_corpus = set(nltk.corpus.stopwords.words(stopwords_language))
        self.stemmer = nltk.stem.PorterStemmer()

    def preprocess(self, sentence):
        """Normalise one sentence.

        Args:
            sentence: The raw text; must be a ``str``.

        Returns:
            A list of strings, one per token that survived stop-word
            removal, after stemming and dictionary lookup.
        """
        text = sentence.lower()
        text = re.sub(r"\d+", "", text)
        text = contractions.fix(text)

        tokens = []
        for word in self.tokenizer.tokenize(text):
            if word in self.stopwords_corpus:
                continue
            stemmed = self.stemmer.stem(word)
            # 0 is passed positionally as lookup's verbosity argument.
            # include_unknown=True makes SymSpell return the input itself when
            # nothing matches, so indexing [0] cannot fail on an empty list.
            # NOTE(review): 0 looks like the raw value of Verbosity.TOP;
            # confirm suggestions are unchanged before switching to the enum.
            suggestions = self.sym_spell.lookup(stemmed, 0, include_unknown=True)
            tokens.append(suggestions[0].term)
        return tokens

In [3]:
# Load the test split (test.csv) from the dataset folder into a DataFrame.
test_dset_df = pd.read_csv(DSET_FOLDER_PATH + "test.csv")

In [4]:
# Build the text preprocessor; words_alpha.txt in the corpus folder is the
# word list handed to its SymSpell dictionary.
preprocessor = Preprocessor(CORPUS_FOLDER_PATH + "words_alpha.txt")

In [5]:
# Run every question through Preprocessor.preprocess and keep the resulting
# token list in a new "preprocessed" column. progress_apply behaves like
# Series.apply but reports progress with a tqdm bar (several minutes on the
# full test set, per the recorded output).
test_dset_df["preprocessed"] = test_dset_df["question_text"].progress_apply(preprocessor.preprocess)

100%|██████████| 522449/522449 [04:43<00:00, 1840.47it/s]


In [6]:
# Project-local helper module that provides the GloVe embedder.
# NOTE(review): this import lives mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
import utils.PreprocessingEmbedding20200928 as pped 
# Embedder initialised from the glove.6B.50d.txt vectors file
# (presumably 50-dimensional vectors, going by the file name — confirm).
embedder = pped.Glove_Embedder(GLOVE_FOLDER_PATH + "glove.6B.50d.txt")

In [7]:
# Convert each token list into an embedding via the GloVe embedder and store
# it in a new "vectorized" column.
test_dset_df["vectorized"] = test_dset_df["preprocessed"].progress_apply(embedder.get_embedding_for_sentence)

# Collect the per-question embeddings into a single NumPy array for the model.
# assumes every embedding has the same shape, so the result is a regular
# (n_questions, dim) array rather than an object array — TODO confirm
X = np.array(test_dset_df.vectorized.to_list())

100%|██████████| 522449/522449 [00:06<00:00, 82609.84it/s]


In [8]:
import pickle 
import joblib

In [9]:
# The model file is serialised twice: joblib.load returns an object that is
# then passed to pickle.loads, so the joblib payload is itself pickled bytes.
# SECURITY: unpickling can execute arbitrary code; only load model files from
# a trusted source.
pickled_model = joblib.load("./2020_10_02_new_model.joblib")
# Second deserialisation step yields the fitted estimator used for prediction.
svc = pickle.loads(pickled_model)

In [10]:
# Predict a label for every row of the embedded test set.
y = svc.predict(X)

In [11]:
# Display the predictions; as the cell's last expression the array is rendered
# as output.
y

array([0, 0, 1, ..., 1, 0, 1])

In [12]:
# Reduce the frame to the submission layout in one re-runnable step:
# attach the predictions as a "target" column and drop the raw text and the
# intermediate feature columns.
#
# Why a single chained expression:
#   * `assign` always creates/overwrites a real column, whereas attribute
#     assignment (`df.question_text = ...`) silently sets a plain Python
#     attribute when the column does not exist.
#   * `errors="ignore"` makes the drop a no-op when the columns are already
#     gone, so running this cell again does not raise KeyError.
test_dset_df = (
    test_dset_df
    .assign(target=y)
    .drop(columns=["question_text", "preprocessed", "vectorized"], errors="ignore")
)

In [16]:
# Write the frame to CSV in the working directory; index=False leaves the
# DataFrame's row index out of the file.
test_dset_df.to_csv("2020_10_03_testset_output.csv", index=False)