In [76]:
import pandas as pd
import numpy as np
import re
from IPython.display import display # Allows the use of display() for DataFrames

In [2]:
def tokenize_str(string):
    '''
    Tokenization/string cleaning
    
    '''
    string = re.sub(r"[^A-Za-z0-9()[],?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\’s", " \'s", string)
    string = re.sub(r"\'m", " am", string)
    string = re.sub(r"\'ve", " have", string)
    string = re.sub(r"can\'t", " cannot", string)
    string = re.sub(r"n\'t", " not", string)
    string = re.sub(r"\'re", " are", string)
    string = re.sub(r"\'d", " had", string)
    string = re.sub(r"\'ll", " will", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"[...]", " ", string)
    string = re.sub(r"/", " or ", string)
    string = re.sub(r"-", " ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def load_data_and_preprocess(filename):
    '''
    Read csv file into a DataFrame and tokenize Q1 and Q2 strings
    '''
    df = pd.read_csv(filename)
    print(filename + " loaded. Preprocessing...")
    df["q1"] = df["question1"].apply(lambda row: tokenize_str(str(row)))
    df["q2"] = df["question2"].apply(lambda row: tokenize_str(str(row)))
    print("Preprocess done!")
    return df

df_train = load_data_and_preprocess("train.csv")

train.csv loaded. Preprocessing...
Preprocess done!


In [9]:
# Inspect several samples of preprocessed text
a = 80
for i in range(a,a+10):
    print(i, df_train["q1"][i])
    print(i, df_train["q2"][i])
    print("")

(80, "when will the bjp government strip all the muslims and the christians of the indian citizenship and put them on boats like the rohingya 's of burma \\?")
(80, 'why india does not apply the "burma rohingya model" to deport illegal bangladeshis \\?')

(81, 'what is the right etiquette for wishing a jehovah witness happy birthday \\?')
(81, 'how important is it to be the first person to wish someone a happy birthday \\?')

(82, 'if someone wants to open a commercial fm radio station in any city of india , how much does it cost and what is the procedure \\?')
(82, 'i want to make a travel commercial or clip video hd , for india and new zealand how much will it cost \\?')

(83, 'why do swiss despise asians \\?')
(83, 'why do technical employees despise sales people so much \\?')

(84, 'what are some of the high salary income jobs in the field of biotechnology \\?')
(84, 'what are some high paying jobs for a fresher with an m tech in biotechnology \\?')

(85, 'how can i increase my hei

In [10]:
# Merge Q1 and Q2 as one feature
df_train["merged"] = df_train["q1"] + " " + df_train["q2"]

print(df_train["merged"][0])

what is the step by step guide to invest in share market in india \? what is the step by step guide to invest in share market \?


In [20]:
# Split into stratified training and validation set
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    df_train["merged"], 
    df_train["is_duplicate"], 
    test_size=0.01, 
    random_state=42)

print(X_train.shape)
print(X_val.shape)

(400247L,)
(4043L,)


In [None]:
# Random oversampling

df_pos = df_train[df_train["is_duplicate"] == 1]
df_neg = df_train[df_train["is_duplicate"] == 0]

print(len(df_pos))
print(len(df_neg))

def pos_neg_stats(df_pos, df_neg, pct):
    '''
    Calculates stats on class imbalance
    '''
    # Pos / Pos + Neg * 100
    pos_neg_ratio = float(len(df_pos)) / (len(df_pos) + len(df_neg)) * 100
    print("Percentage duplicates in dataset: " + str(np.round(pos_neg_ratio, 2)) + " %")

    # Pos / Pos + Neg = 16.5%, then: Neg = Pos * 100 / 16.5 - Pos
    neg_class = (len(df_pos) * 100 / pct) - len(df_pos)
    print("Neg class needs to be: " +
          str(np.round(neg_class, 0)) + " to balance to " + str(pct) + " % duplicates.")

def random_oversample(pos, neg, amt):
    '''
    Oversamples majority class by an amount
    '''
    # Create empty DF
    df = pd.DataFrame(columns=("q1", "q2", "is_duplicate"))
    # Randomly pick Q1 and Q2 to form synthetic rows. One from Pos and one from Neg
    for i in range(amt):
        ques1 = np.random.choice(["q1", "q2"])
        rand_pos = np.random.choice(pos[ques1])
        ques2 = np.random.choice(["q1", "q2"])
        rand_neg = np.random.choice(neg[ques2])
        df.loc[i, "q1"] = rand_pos
        df.loc[i, "q2"] = rand_neg
        df.loc[i, "is_duplicate"] = 0
     
    # Merge Q1 and Q2
    df["merged"] = df["q1"] + " " + df["q2"]
    return df
    
pos_neg_stats(df_pos, df_neg, 16.5)
df_add = random_oversample(df_pos, df_neg, 500334)
display(df_add.head())

149263
255027
Percentage duplicates in dataset: 36.92 %
Neg class needs to be: 755361.0 to balance to 16.5 % duplicates.


In [6]:
# Check avaiable columns
print(df_train.columns)

# Drop unused columns
'''df_train = df_train[[5, 8]]

print(df_train.columns)'''

Index([u'id', u'qid1', u'qid2', u'question1', u'question2', u'is_duplicate',
       u'q1', u'q2', u'merged'],
      dtype='object')


'df_train = df_train[[5, 8]]\n\nprint(df_train.columns)'

In [7]:
# Split into stratified training and validation set
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN

'''X_train, X_val, y_train, y_val = train_test_split(
    df_train["merged"], 
    df_train["is_duplicate"], 
    test_size=0.1, 
    random_state=42)'''

# Oversample the majority class (non-duplicates) to match 16.5% of duplicates in the test.csv set
'''rom = ADASYN(random_state=42, ratio = 0.165) # ratio: minority class / majority class
X_train_res, y_train_res = rom.fit_sample(X_train, y_train)

print(y_train.value_counts())'''



'rom = ADASYN(random_state=42, ratio = 0.165) # ratio: minority class / majority class\nX_train_res, y_train_res = rom.fit_sample(X_train, y_train)\n\nprint(y_train.value_counts())'

In [8]:
# Saves preprocessed data into a pickle file
'''import pickle

with open('preprocess_x_1.pickle', 'wb') as handle:
    pickle.dump(x_shuffled, handle, protocol=pickle.HIGHEST_PROTOCOL)'''

"import pickle\n\nwith open('preprocess_x_1.pickle', 'wb') as handle:\n    pickle.dump(x_shuffled, handle, protocol=pickle.HIGHEST_PROTOCOL)"