In [1]:
import pandas as pd
import numpy as np
import re
from IPython.display import display # Allows the use of display() for DataFrames

In [2]:
def tokenize_str(string):
    '''
    Tokenization/string cleaning
    
    '''
    string = re.sub(r"[^A-Za-z0-9()[],?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\’s", " \'s", string)
    string = re.sub(r"\'m", " am", string)
    string = re.sub(r"\'ve", " have", string)
    string = re.sub(r"can\'t", " cannot", string)
    string = re.sub(r"n\'t", " not", string)
    string = re.sub(r"\'re", " are", string)
    string = re.sub(r"\'d", " had", string)
    string = re.sub(r"\'ll", " will", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"[...]", " ", string)
    string = re.sub(r"/", " or ", string)
    string = re.sub(r"-", " ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def load_data_and_preprocess(filename):
    '''
    Read csv file into a DataFrame and tokenize Q1 and Q2 strings
    '''
    df = pd.read_csv(filename)
    print(filename + " loaded. Preprocessing...")
    df["q1"] = df["question1"].apply(lambda row: tokenize_str(str(row)))
    df["q2"] = df["question2"].apply(lambda row: tokenize_str(str(row)))
    print("Preprocess done!")
    return df

df_train = load_data_and_preprocess("train.csv")

train.csv loaded. Preprocessing...
Preprocess done!


In [3]:
# Inspect several samples of preprocessed text
a = 80
for i in range(a,a+10):
    print(i, df_train["q1"][i])
    print(i, df_train["q2"][i])
    print("")

(80, "when will the bjp government strip all the muslims and the christians of the indian citizenship and put them on boats like the rohingya 's of burma \\?")
(80, 'why india does not apply the "burma rohingya model" to deport illegal bangladeshis \\?')

(81, 'what is the right etiquette for wishing a jehovah witness happy birthday \\?')
(81, 'how important is it to be the first person to wish someone a happy birthday \\?')

(82, 'if someone wants to open a commercial fm radio station in any city of india , how much does it cost and what is the procedure \\?')
(82, 'i want to make a travel commercial or clip video hd , for india and new zealand how much will it cost \\?')

(83, 'why do swiss despise asians \\?')
(83, 'why do technical employees despise sales people so much \\?')

(84, 'what are some of the high salary income jobs in the field of biotechnology \\?')
(84, 'what are some high paying jobs for a fresher with an m tech in biotechnology \\?')

(85, 'how can i increase my hei

In [4]:
# Merge Q1 and Q2 as one feature
df_train["merged"] = df_train["q1"] + " " + df_train["q2"]

print(df_train["merged"][0])

what is the step by step guide to invest in share market in india \? what is the step by step guide to invest in share market \?


In [5]:
# Split into stratified training and validation set
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    df_train, 
    df_train["is_duplicate"], 
    test_size=0.01, 
    random_state=42)

print(X_train.shape)
print(X_val.shape)

(400247, 9)
(4043, 9)


In [19]:
# Random oversampling

df_pos = X_train[X_train["is_duplicate"] == 1]
df_neg = X_train[X_train["is_duplicate"] == 0]

print("Total positive pair examples: ", len(df_pos))
print("Total negative pair examples: ", len(df_neg))

def pos_neg_stats(df_pos, df_neg, pct):
    '''
    Calculates stats on class imbalance
    '''
    # Pos / Pos + Neg * 100
    pos_neg_ratio = float(len(df_pos)) / (len(df_pos) + len(df_neg)) * 100
    print("Percentage duplicates in dataset: " + str(np.round(pos_neg_ratio, 2)) + " %")

    # Pos / Pos + Neg = 16.5%, then: Neg = Pos * 100 / 16.5 - Pos
    neg_class = (len(df_pos) * 100 / pct) - len(df_pos)
    print("Neg class needs to be: " +
          str(np.round(neg_class, 0)) + " to balance to " + str(pct) + " % duplicates.")

def random_oversample(pos, neg, size):
    '''
    Oversamples majority class by an amount
    '''
    pos = pos.sample(frac=1).reset_index(drop=True)
    neg = neg.sample(frac=1).reset_index(drop=True)
    
    df = pd.DataFrame()
    df["q1"] = pos["q1"]
    df["q2"] = neg["q2"]
    print(len(df))
    
    df2 = pd.DataFrame()
    df2["q1"] = pos["q2"]
    df2["q2"] = neg["q1"]
    print(len(df2))
    
    df = df.append(df2)
    
    pos = pos.sample(frac=1).reset_index(drop=True)
    neg = neg.sample(frac=1).reset_index(drop=True)
    
    df3 = pd.DataFrame()
    df3["q1"] = neg["q2"]
    df3["q2"] = neg["q1"]
    print(len(df3))
    
    df = df.append(df3)
    
    pos = pos.sample(frac=1).reset_index(drop=True)
    neg = neg.sample(frac=1).reset_index(drop=True)
    
    df4 = pd.DataFrame()
    df4["q1"] = pos["q2"]
    df4["q2"] = neg["q2"]
    print(len(df4))
    
    df = df.append(df4)
    
    pos = pos.sample(frac=1).reset_index(drop=True)
    neg = neg.sample(frac=1).reset_index(drop=True)
    
    df5 = pd.DataFrame()
    df5["q1"] = pos["q1"]
    df5["q2"] = neg["q1"]
    print(len(df5))
    
    df = df.append(df5)
    df = df.reset_index(drop=True)
    print("Total oversampled examples to choose from: ", len(df))
    
    df_sample_ind = np.random.choice(df.index.values, size=size, replace=False)
    df_sample = df.loc[df_sample_ind]
    df_sample = df_sample.reset_index(drop=True)
    df_sample["is_duplicate"] = 0 
    
    print("Total examples in rebalanced dataset: ", len(df_sample))
    display(df_sample.head())
    display(df_sample.tail())
    display(df_sample[["q1", "q2"]].describe())
    
    return df_sample
    
pos_neg_stats(df_pos, df_neg, 16.5)
df_sample = random_oversample(df_pos, df_neg, 747796)

('Total positive pair examples: ', 147768)
('Total negative pair examples: ', 252479)
Percentage duplicates in dataset: 36.92 %
Neg class needs to be: 747796.0 to balance to 16.5 % duplicates.
147768
147768
252479
147768
147768
('Total oversampled examples to choose from: ', 843551)
('Total examples in rebalanced dataset: ', 747796)


Unnamed: 0,q1,q2,is_duplicate
0,how do i know if my phone is tapped \?,what does it feel like to awaken from a coma \?,0
1,"i like quora , but what is the difference betw...",which are the worst engineering colleges in in...,0
2,is the indian currency has gps nano chip \?,why mustache grows in young girls face \?,0
3,linguistics: how to self study linguistics \?,why do i get anxiety after a night of drinking \?,0
4,does gary johnson have any chance left at winn...,where are the ideal places to install security...,0


Unnamed: 0,q1,q2,is_duplicate
747791,is proof of god 's existence overwhelming \?,what are some of the strongest proofs of god '...,0
747792,will you computers be able to have dreams or a...,are there any cultures in which feminine men a...,0
747793,what are the sizes of pizza hut pizzas \?,what 's the size of a domino 's pizza \( mediu...,0
747794,i have written a proof of a proposition about ...,is “please see attached document” correct \?,0
747795,how do i stop growing taller \?,what is the university of guelph good for \?,0


Unnamed: 0,q1,q2
count,747796,747796
unique,330555,361495
top,how do i lose weight \?,what are the best ways to lose weight \?
freq,198,127


In [29]:
# Merge Q1 and Q2 as one feature
df_sample["merged"] = df_sample["q1"] + " " + df_sample["q2"]

# Set feature and value
X_train_sample = df_sample["merged"]
X_val_sample = df_sample["is_duplicate"]

print(len(X_train_sample))
print(len(X_val_sample))

747796
747796


In [30]:
# Saves preprocessed data into a pickle file
import pickle

with open('preprocess.pickle', 'wb') as f:
    pickle.dump(X_train_sample, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(X_val_sample, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(y_train, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(y_val, f, protocol=pickle.HIGHEST_PROTOCOL)
    
print("File saved.")

File saved.
