# Preprocessing

## Import libraries

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import SnowballStemmer,WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from collections import defaultdict
from nltk.corpus import stopwords
from string import punctuation
from nltk import pos_tag
import numpy as np
import pandas as pd
import pickle
import nltk
import sys
import os
import re

## Preprocessing Keys

In [2]:
# Shared NLP helpers used by clean_text().
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')
# English stop words, stored as a set for membership tests.
stop_words = set(stopwords.words('english'))
# Keyed by the first character of a POS tag: 'J' -> "a", 'V' -> "v", 'R' -> "r";
# every other key falls back to "n". The values are passed to lemmatizer.lemmatize()
# as its part-of-speech argument.
tag_map = defaultdict(lambda: "n", {'J': "a", 'V': "v", 'R': "r"})

# Ordered (pattern, replacement) pairs. clean_text() passes each pair to re.sub() in
# list order, so every pattern is a regular expression and later entries operate on
# the output of earlier ones. Matching is case-sensitive.
#
# Short tokens that are also fragments of ordinary words ("hes" in "these", "arent"
# in "parents", "cant" in "significant", "tution" in "institution", "1st" in "21st",
# ...) are wrapped in \b so that only the standalone token is rewritten.
replacements = [
    # Irregular contractions that the generic n't rule below would turn into
    # "ca not" / "wo not".
    (r"\b([Cc])an't\b", r"\1an not"), (r"\b([Ww])on't\b", r"\1ill not"),
    # Generic apostrophe contractions.
    (r"\b([A-Za-z]+)'s\b", '\\1 is'), (r"\b([A-Za-z]+)'re\b", '\\1 are'),
    (r"\b([A-Za-z]+)'ve\b", '\\1 have'), (r"\b([A-Za-z]+)'ll\b", '\\1 will'),
    (r"\b([A-Za-z]+)n't\b", '\\1 not'),
    # Contractions typed without the apostrophe (whole words only).
    (r"\bwhats\b", "what is"), (r"\bwhos\b", "who is"), (r"\bwheres\b", "where is"),
    (r"\bwhens\b", "when is"), (r"\bhows\b", "how is"), (r"\bim\b", "i am"),
    (r"\bhes\b", "he is"), (r"\bshes\b", "she is"), (r"\bthats\b", "that is"),
    (r"\btheres\b", "there is"), (r"\bisnt\b", "is not"), (r"\bwasnt\b", "was not"),
    (r"\barent\b", "are not"), (r"\bwerent\b", "were not"), (r"\bcant\b", "can not"),
    ("cannot", "can not"), (r"\bcouldnt\b", "could not"), (r"\bdont\b", "do not"),
    (r"\bdidnt\b", "did not"), (r"\bshouldnt\b", "should not"), (r"\bwouldnt\b", "would not"),
    (r"\bdoesnt\b", "does not"), (r"\bhavent\b", "have not"), (r"\bhasnt\b", "has not"),
    (r"\bhadnt\b", "had not"),
    (r'\s+', ' '),  # replace multi space with one single space
    (" J K ", " JK "), ("banglore", "Banglore"), ("bangalore", "Banglore"), ("bengaluru", "Banglore"),
    ("Find", "find"), ("Method", "method"), ("Astrology", "astrology"),
    # Space-delimited patterns consume the spaces around the token, so the
    # replacement has to put them back or the neighbouring words get fused.
    ("bestfriend", "best friend"), (" bf ", " boy friend "), (" gf ", " girl friend "),
    ("boyfriend", " boy friend "), ("girlfriend", "girl friend"),
    ("programing", "programming"), ("calender", "calendar"), ("intially", "initially"),
    ("quikly", "quickly"), ("imrovement", "improvement"), ("demonitization", "demonetization"),
    (" dms ", " direct messages "), ("upvote", "up vote"), (" downvotes ", " down votes "),
    (r"\bios\b", "operating system"), (" iPhone ", " phone "), (" iphone ", " phone "),
    (" i phone ", " phone "), (" cs ", " computer science "), (" cse ", " computer science "),
    (" CS ", " computer science "), (" CSE ", " computer science "),
    ("KMs", " kilometers "), ("kms", " kilometers "), ("actived", "active"),
    (" UK ", " England "), (" uk ", " England "), (" u s ", " America "), (" USA ", " America "),
    (" US ", " America "), (r"\bthe US\b", "America"), (" usa ", " America "),
    ("e-mail", "email"), (" 9 11 ", " 911 "), (" b g ", " bg "), (r"\b60k\b", " 60000"),
    ('₹', ' rupee '), (' txt ', " text "), (" OS ", " operating system "), ("Wi-Fi", "wifi"),
    ("cgpa", "gpa"), ("watsapp", "whatsapp"), (r"\btution\b", "tuition"),
    (" II ", " two "), (" III ", " three "), (" V ", " five "),
    (r"\b1st\b", " one "), (r"\b2nd\b", " two "), (r"\b3rd\b", " three "), (r"\b4th\b", " four "),
    (" 10th ", " ten "),
    (" 12th ", " twelve "), (" 00 ", " 0 "), (" 000 ", " 0 "), (" 0000 ", " 0 "), (" 0 ", " zero "),
    (" 1 ", " one "), (" 01 ", " one "), (" 2 ", " two "), (" 3 ", " three "), (" 4 ", " four "),
    (" 10 ", " ten "), (" 20 ", " twenty "), (" 50 ", " fifty "), (" 100 ", " hundred "),
    (" 1000 ", " thousand "),
    # NOTE(review): in Python's re syntax "\0" is the octal escape for a NUL character,
    # so the two "\0..." patterns below only match text containing "\x00". They look
    # like they were meant to match a literal "0" ("100rs", "1990s") - confirm the
    # intended replacement before changing them.
    (r"\0rs ", " rs "), (r"\'s", " "), (r"\'ve", " have "),
    (r"\'d", " would "), (r"\'ll", " will "), (r"\0s", "0"),
    # Final tidy-up: collapse whitespace runs, then blank out every remaining
    # character that is not an ASCII letter or digit.
    (r"\s{2,}", " "), (r"[^A-Za-z0-9]", " "),
]

## Cleaning Function

In [3]:
def clean_text(text, to_lowercase=True, remove_stop_words=False, lemmatize=True, stem_words=False):
    """Normalise a question string and return the cleaned string.

    The steps run in this fixed order:
      1. apply every (pattern, replacement) pair of the module-level ``replacements``
         list with ``re.sub``;
      2. delete all ``string.punctuation`` characters;
      3. lowercase the text (if ``to_lowercase``);
      4. split on whitespace;
      5. lemmatize each token, choosing the part of speech from the first character
         of its ``pos_tag`` tag via ``tag_map`` (if ``lemmatize``);
      6. drop tokens found in ``stop_words`` (if ``remove_stop_words``);
      7. stem each token with ``stemmer`` (if ``stem_words``);
      8. join the tokens with single spaces.

    Args:
        text: The string to clean.
        to_lowercase: Lowercase the text before tokenising.
        remove_stop_words: Remove stop words after lemmatizing.
        lemmatize: Lemmatize each token.
        stem_words: Stem each token as the last token-level step.

    Returns:
        The cleaned text as a single space-separated string.
    """
    # Pattern substitutions are applied sequentially, each on the previous result.
    for pattern, substitute in replacements:
        text = re.sub(pattern, substitute, text)

    # Strip punctuation characters in one pass.
    text = text.translate(str.maketrans('', '', punctuation))

    if to_lowercase:
        text = text.lower()

    tokens = text.split()

    if lemmatize:
        tokens = [lemmatizer.lemmatize(token, tag_map[tag[0]]) for token, tag in pos_tag(tokens)]

    if remove_stop_words:
        tokens = [token for token in tokens if token not in stop_words]

    if stem_words:
        tokens = [stemmer.stem(token) for token in tokens]

    return " ".join(tokens)

## Load Training Data

In [4]:
# Read the question-pair table, then replace every missing value with an empty string.
data = pd.read_csv('cleaned-train.csv')
data = data.fillna("")

## Clean the Data

In [5]:
# Disabled cleaning pass over both question columns. The positional arguments to
# clean_text are to_lowercase, remove_stop_words, lemmatize, stem_words (all True).
# NOTE(review): the load cell reads 'cleaned-train.csv', so this presumably ran once
# on the raw data and its result was saved - confirm before re-enabling.
# data['question1']=data['question1'].apply(lambda x:clean_text(x,True,True,True,True))
# print('Question 1 done.')
# data['question2']=data['question2'].apply(lambda x:clean_text(x,True,True,True,True))
# print('Question 2 done.')

Question 1 done.
Question 2 done.


## Save The Cleaned Data

In [7]:
# Disabled: would overwrite 'cleaned-train.csv', the same file the load cell reads.
# data.to_csv('cleaned-train.csv', index=False)

## Extract Questions with QID

In [8]:
# Stack the two sides of every pair into one (qid, question) table, order it by qid,
# keep a single row per qid, and hold the result as a two-column array.
q1 = data[['qid1', 'question1']].rename(columns={'qid1': 'qid', 'question1': 'question'})
q2 = data[['qid2', 'question2']].rename(columns={'qid2': 'qid', 'question2': 'question'})
question_data = (
    pd.concat((q1, q2), axis=0)
    .fillna("")
    .sort_values(by='qid')
    .drop_duplicates('qid')
    .values
)

## Train-Test Split

In [9]:
# Stratified split on the is_duplicate label; 75% of the rows go to the test part,
# the remaining 25% to the train part. Seeded for reproducibility.
X_train, X_test = train_test_split(
    data,
    test_size=0.75,
    random_state=10,
    stratify=data[['is_duplicate']],
)

## Extract Train Questions with QID

In [10]:
# Same (qid, question) table as for the full data, restricted to questions that
# appear in the training pairs: ordered by qid, one row per qid, as a two-column array.
q1 = X_train[['qid1', 'question1']].rename(columns={'qid1': 'qid', 'question1': 'question'})
q2 = X_train[['qid2', 'question2']].rename(columns={'qid2': 'qid', 'question2': 'question'})
train_question_data = (
    pd.concat((q1, q2), axis=0)
    .fillna("")
    .sort_values(by='qid')
    .drop_duplicates('qid')
    .values
)

## TF-IDF Learning

In [11]:
# Disabled experiment: for max_features = 128, 256, ..., 8192, fit TF-IDF on all
# questions and record the median number of non-zero entries per question vector.
# vals = []
# for i in range(128, 8197, 128):
#     vectorizer = TfidfVectorizer(stop_words='english', strip_accents = 'unicode',
#                                  max_features = i, norm='l1')
#     x = vectorizer.fit_transform(question_data[:,1]).todense()
#     vals.append(np.median(np.count_nonzero(x, axis=1), axis=0)[0,0])

# Learn the vocabulary and IDF weights from the training questions only ...
vectorizer = TfidfVectorizer(
    stop_words='english',
    strip_accents='unicode',
    max_features=256,
    norm='l1',
).fit(train_question_data[:,1])

# ... then vectorise every question (train and test) with that fitted model.
# Row k of q_vectors corresponds to row k of question_data.
q_vectors = vectorizer.transform(question_data[:,1]).todense()

## TF-IDF Vectorization of Train Data

In [12]:
np_data = X_train.values

# Row k of q_vectors is the TF-IDF vector of question_data[k], and question_data is
# sorted by qid with one row per qid. Resolve each pair's qids to row numbers by
# lookup instead of assuming "row == qid - 1": that shortcut silently selects another
# question's vector as soon as the qid sequence has a gap.
known_qids = question_data[:, 0].astype(np.int64)
pair_rows = []
for qid_column in (1, 2):  # positional columns of np_data holding the two qids of a pair
    wanted = np_data[:, qid_column].astype(np.int64)
    if not np.isin(wanted, known_qids).all():
        raise KeyError('X_train references a qid that is missing from question_data')
    # known_qids is sorted and unique, so a left binary search lands on the exact row.
    pair_rows.append(np.searchsorted(known_qids, wanted))

# Plain-ndarray view, so fancy indexing behaves the same whether todense() returned
# an np.matrix or an ndarray.
dense_vectors = np.asarray(q_vectors)

# Each output row is [vector(q1) - vector(q2), is_duplicate label].
train_vectors = np.zeros((np_data.shape[0], dense_vectors.shape[1] + 1))
# Two in-place steps keep only one gathered temporary alive at a time.
train_vectors[:, :-1] = dense_vectors[pair_rows[0]]
train_vectors[:, :-1] -= dense_vectors[pair_rows[1]]
train_vectors[:, -1] = np_data[:, -1].astype(float)

## TF-IDF Vectorization of Test Data

In [13]:
np_data = X_test.values

# Row k of q_vectors is the TF-IDF vector of question_data[k], and question_data is
# sorted by qid with one row per qid. Resolve each pair's qids to row numbers by
# lookup instead of assuming "row == qid - 1": that shortcut silently selects another
# question's vector as soon as the qid sequence has a gap.
known_qids = question_data[:, 0].astype(np.int64)
pair_rows = []
for qid_column in (1, 2):  # positional columns of np_data holding the two qids of a pair
    wanted = np_data[:, qid_column].astype(np.int64)
    if not np.isin(wanted, known_qids).all():
        raise KeyError('X_test references a qid that is missing from question_data')
    # known_qids is sorted and unique, so a left binary search lands on the exact row.
    pair_rows.append(np.searchsorted(known_qids, wanted))

# Plain-ndarray view, so fancy indexing behaves the same whether todense() returned
# an np.matrix or an ndarray.
dense_vectors = np.asarray(q_vectors)

# Each output row is [vector(q1) - vector(q2), is_duplicate label].
test_vectors = np.zeros((np_data.shape[0], dense_vectors.shape[1] + 1))
# Two in-place steps keep only one gathered temporary alive at a time.
test_vectors[:, :-1] = dense_vectors[pair_rows[0]]
test_vectors[:, :-1] -= dense_vectors[pair_rows[1]]
test_vectors[:, -1] = np_data[:, -1].astype(float)

## PCA

In [14]:
# Fit an 8-component PCA on the training feature columns only; the last column of
# train_vectors is the label and is excluded.
pca = PCA(n_components=8, random_state=10).fit(train_vectors[:, :-1])

In [15]:
# Project the feature columns with the fitted PCA and re-attach the label column
# (kept 2-D via the -1: slice) as the last column of each result.
train_labels, test_labels = train_vectors[:, -1:], test_vectors[:, -1:]
train_pca = np.concatenate([pca.transform(train_vectors[:, :-1]), train_labels], axis=1)
test_pca = np.concatenate([pca.transform(test_vectors[:, :-1]), test_labels], axis=1)

In [17]:
# Wrap the projected arrays in DataFrames: columns 'feature 1' .. 'feature 8'
# followed by 'is_duplicate', with the label column cast to int.
column_names = ['feature {}'.format(k) for k in range(1, 9)] + ['is_duplicate']
label_dtype = {'is_duplicate': int}
train_data = pd.DataFrame(train_pca, columns=column_names).astype(label_dtype)
test_data = pd.DataFrame(test_pca, columns=column_names).astype(label_dtype)

## Save Models and Data

In [18]:
# Persist the reduced train/test tables as CSV files.
train_data.to_csv('pca_train_2.csv', index=False)
test_data.to_csv('pca_test_2.csv', index=False)

# Pickle the fitted models. The files are opened in `with` blocks so each handle is
# flushed and closed as soon as the dump finishes, instead of being left open until
# garbage collection.
with open('pca_obj_2.pkl', 'wb') as pca_file:
    pickle.dump(pca, pca_file)
with open('tf-idf_obj_2', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)