# Initial data preprocessing steps, mark one

In [1]:
DSET_FOLDER_PATH = './dataset/quora/'
GLOVE_FOLDER_PATH = './embeddings/glove/'

## 1. Collecting NLTK and the dataset

In [2]:
import nltk
# nltk.download()

In [3]:
import numpy as np 
import matplotlib.pyplot as plt  
import pandas as pd 
import seaborn as sns 
import contractions

In [4]:
train_dset_df = pd.read_csv(DSET_FOLDER_PATH + 'train.csv')

In [5]:
train_dset_df.head()

Unnamed: 0,qid,question_text,target
0,6f47b0f60633c2056455,"How can I reply to this comment, ""India is poo...",0
1,d49b3966070b27bf07fc,What did they use for transportation in Ancien...,0
2,6d5faa49380557c8ca7b,What are the most important provisions of Obam...,0
3,cebea75faa47388edcf5,At what age do most Finns master English today?,0
4,2a7b76a679cadb0a016e,What is cheapest place to live in India for on...,0


## 2. Steps of pre-embedding preprocessing:


In [6]:
sample_sentence = train_dset_df.question_text[0]
sample_sentence

'How can I reply to this comment, "India is poor. It is a fact. I don\'t understand the unnecessary criticism of Snapchat CEO\'s statement"?'

### 2.1. Lowercasing

In [7]:
sample_sentence = sample_sentence.lower()
sample_sentence

'how can i reply to this comment, "india is poor. it is a fact. i don\'t understand the unnecessary criticism of snapchat ceo\'s statement"?'

### 2.2. Contractions removal

In [8]:
sample_sentence = contractions.fix(sample_sentence) 
sample_sentence

'how can i reply to this comment, "india is poor. it is a fact. i do not understand the unnecessary criticism of snapchat ceo\'s statement"?'

### 2.3. Tokenization

In [9]:
basic_tok = nltk.tokenize.RegexpTokenizer(r"\w+")
sample_sentence = basic_tok.tokenize(sample_sentence)
sample_sentence

['how',
 'can',
 'i',
 'reply',
 'to',
 'this',
 'comment',
 'india',
 'is',
 'poor',
 'it',
 'is',
 'a',
 'fact',
 'i',
 'do',
 'not',
 'understand',
 'the',
 'unnecessary',
 'criticism',
 'of',
 'snapchat',
 'ceo',
 's',
 'statement']

### 2.4. Stop-word removal

In [10]:
stopwords_corpus = nltk.corpus.stopwords
sample_sentence = [word for word in sample_sentence if not word in stopwords_corpus.words()]
sample_sentence

['reply',
 'comment',
 'india',
 'poor',
 'fact',
 'understand',
 'unnecessary',
 'criticism',
 'snapchat',
 'ceo',
 'statement']

### 2.5 Lemmatization

In [11]:
lemmatizer = nltk.stem.WordNetLemmatizer()
sample_sentence = [lemmatizer.lemmatize(word) for word in sample_sentence]
sample_sentence

['reply',
 'comment',
 'india',
 'poor',
 'fact',
 'understand',
 'unnecessary',
 'criticism',
 'snapchat',
 'ceo',
 'statement']

In [12]:
class Preprocessor_AKB:
    def __init__(self):
        import nltk
        import contractions 
        self.tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
        self.stopwords_corpus = set(nltk.corpus.stopwords.words())
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
    def preprocess(self,sentence):
        sentence = sentence.lower()
        sentence = contractions.fix(sentence)
        sentence = self.tokenizer.tokenize(sentence)
        sentence = [word for word in sentence if not word in self.stopwords_corpus]
        sentence = sentence = [self.lemmatizer.lemmatize(word) for word in sentence]
        return sentence

In [13]:
preprocessor = Preprocessor_AKB()
sample_sentence_2 = train_dset_df.question_text[0]
sample_sentence_2

'How can I reply to this comment, "India is poor. It is a fact. I don\'t understand the unnecessary criticism of Snapchat CEO\'s statement"?'

In [14]:
preprocessor.preprocess(sample_sentence_2)

['reply',
 'comment',
 'india',
 'poor',
 'fact',
 'understand',
 'unnecessary',
 'criticism',
 'snapchat',
 'ceo',
 'statement']

## 3. GloVe Embeddings

### 3.1. Importing the embeddings

In [15]:
class Glove_Embedder:
    def __init__(self, PATH_TO_TEXTFILE):
        self.glove_embeddings_dict = {}
        glove_embeddings_file = open(PATH_TO_TEXTFILE, 'r')
        firstTime = True
        while True:
            line = glove_embeddings_file.readline()
            if not line:
                break
            splitted = line.split()
            key = splitted[0]
            value = np.array([float(i) for i in splitted[1:]])
            if(firstTime):
                firstTime = False 
                self.embedding_vector_size = value.size
            self.glove_embeddings_dict[key] = value
        glove_embeddings_file.close()
    def get_embedding_for_sentence(self, sentence_list):
        '''
        The sentence should be lowercased and free of special characters and numbers. Ideally, it should be lemmatized, too. The sentence should be a list of words.
        '''
        number_of_words = len(sentence_list)
        embedding = np.zeros((self.embedding_vector_size, ))
        if(number_of_words == 0):
            return embedding 
        for word in sentence_list:
            if word in self.glove_embeddings_dict:
                embedding += self.glove_embeddings_dict[word]
        embedding /= number_of_words
        return embedding.tolist()
            
        

In [16]:
embedder = Glove_Embedder(GLOVE_FOLDER_PATH + "glove.6B.50d.txt")

In [17]:
sample_sentence_3 = train_dset_df.question_text[101]
sample_sentence_3

'Has Greek life changed over years? If yes, how?'

In [18]:
sample_sentence_3 = preprocessor.preprocess(sample_sentence_3)
sample_sentence_3

['greek', 'life', 'changed', 'year', 'yes']

In [19]:
sample_embedding = embedder.get_embedding_for_sentence(sample_sentence_3)
sample_embedding

[0.020713399999999983,
 0.40521799999999997,
 -0.166104,
 -0.36722,
 0.4106754,
 0.1663996,
 -0.5283260000000001,
 -0.29789000000000004,
 -0.6921976800000001,
 0.11451759999999997,
 0.12571399999999996,
 0.196332,
 -0.09415899999999999,
 -0.06583960000000001,
 0.832236,
 0.10686359999999999,
 -0.19629939999999999,
 0.051657600000000005,
 -0.3432042,
 0.043220000000000015,
 0.022201000000000005,
 0.16160839999999999,
 -0.01245400000000001,
 -0.032322800000000006,
 0.410208,
 -1.43652,
 -0.6133586,
 -0.14951799999999998,
 0.033224,
 0.08150400000000005,
 2.86002,
 0.1380098,
 -0.23056519999999997,
 -0.04970600000000002,
 -0.03637840000000001,
 -0.266058,
 0.123546,
 -0.09892679999999997,
 -0.08487340000000002,
 -0.264732,
 -0.41872119999999996,
 -0.11833960000000002,
 0.1870414,
 0.028474200000000005,
 -0.323266,
 0.17178480000000002,
 -0.21671899999999997,
 0.12431800000000001,
 0.073934,
 -0.08107039999999996]

## 4. Putting it all together to obtain ndarrays 

In [20]:
from tqdm import tqdm 
tqdm.pandas()

In [21]:
train_dset_df.question_text = train_dset_df.question_text.progress_apply(preprocessor.preprocess)

100%|██████████| 783673/783673 [00:25<00:00, 30980.09it/s]


In [22]:
train_dset_df.question_text = train_dset_df.question_text.progress_apply(embedder.get_embedding_for_sentence)

100%|██████████| 783673/783673 [00:10<00:00, 72238.83it/s]


In [23]:
X = np.array(train_dset_df.question_text.to_list())

In [24]:
Y = train_dset_df.target.to_numpy()

In [28]:
train_X = X[::2,:]
train_Y = Y[::2]
test_X = X[1::2,:]
test_Y = Y[1::2]

## 5. Training a polynomial-kernel SVM on the dataset

In [29]:
import sklearn.metrics 
import sklearn.linear_model

In [30]:
log_reg = sklearn.linear_model.LogisticRegression()
log_reg.fit(train_X, train_Y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [31]:
train_Yhat = log_reg.predict(train_X)

In [32]:
train_accuracy = sklearn.metrics.accuracy_score(train_Y, train_Yhat)
train_accuracy

0.9369457197763356

In [33]:
test_Yhat = log_reg.predict(test_X)
test_accuracy = sklearn.metrics.accuracy_score(test_Y, test_Yhat)

In [34]:
test_accuracy

0.9371931114037505

In [36]:
import pickle
import joblib

In [37]:
pickled_model = pickle.dumps(log_reg)

In [38]:
joblib.dump(pickled_model,'2020_09_28_pickled_model.joblib')

['2020_09_28_pickled_model.joblib']