In [1]:
import gc
import re
import os
import pandas as pd
import numpy as np
import random
from sklearn import metrics
import string
import math
import operator
import time
from keras.preprocessing import text, sequence
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.utils.data
from gensim import utils

Using TensorFlow backend.


In [55]:
def set_seed(seed=42):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random`` module, NumPy, and PyTorch (through both
    ``torch.manual_seed`` and ``torch.cuda.manual_seed``), exports
    ``PYTHONHASHSEED`` into the process environment, and sets the cuDNN
    ``deterministic`` flag.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The individual seeding calls are independent of each other.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True

# GENERAL HYPERPARAMS
num_folds = 2  # presumably the number of cross-validation folds — TODO confirm against the training loop
seed = 42  # passed to set_seed() at the bottom of this cell

# HYPERPARAMS FOR TEXT PROCESSING
max_features = 120000  # presumably the vocabulary size cap — TODO confirm where it is consumed
maxlen = 100  # presumably the padded sequence length — TODO confirm where it is consumed

# HYPERPARAMS FOR NN
batch_size = 1024  # also used further down to size the (batch_size, 1) weight tensors
epochs_fixed = 1  # NOTE(review): looks like epochs with frozen embeddings — confirm
epochs_trainable = 1  # NOTE(review): looks like epochs with trainable embeddings — confirm
embed_size = 300  # presumably the embedding dimension — TODO confirm
early_stopping_patience = 2
hidden_size = 60

# Seed all RNGs once, right after the configuration is defined.
set_seed(seed)

In [5]:
# Directory prefix for the input CSVs; train.csv and test.csv are read from it
# with pd.read_csv(PATH + ...), so it must end with a path separator.
PATH = "./input/"

# Characters that clean_text() isolates by surrounding them with spaces.
puncts = {'\u200b', ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√'}

# Translation table built once: each character in `puncts` maps to itself
# padded with one space on each side. (The previous code mapped every
# character to the literal text ' {punct} ' because the placeholder was never
# formatted, and it rebuilt the table on every call.)
_PUNCT_TABLE = str.maketrans({punct: ' %s ' % punct for punct in puncts})

def clean_text(x):
    """Surround every character listed in ``puncts`` with spaces.

    Args:
        x: Value to clean; it is coerced with ``str()`` first, so non-string
            inputs (for example NaN) are accepted.

    Returns:
        The string form of ``x`` in which each character from ``puncts`` is
        replaced by that same character with a space on either side. All
        other characters are unchanged.
    """
    return str(x).translate(_PUNCT_TABLE)

def clean_numbers(x):
    """Mask runs of ASCII digits in ``x`` with ``#`` placeholders.

    Runs of five or more digits collapse to ``#####``; after that, groups of
    four, three and two digits become ``####``, ``###`` and ``##``
    respectively. A lone digit is left as it is.

    Args:
        x: Input string.

    Returns:
        The masked string, or ``x`` itself when it contains no digit.
    """
    if re.search(r'\d', x) is None:
        # No digit anywhere: nothing to mask.
        return x
    masked = x
    # Order matters: longer runs must be masked before shorter ones.
    substitutions = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, placeholder in substitutions:
        masked = re.sub(pattern, placeholder, masked)
    return masked

# Literal replacements applied by replace_typical_misspell(). Keys are matched
# case-sensitively and without word boundaries; insertion order is the order
# of the alternatives in the compiled regex.
#
# Duplicate keys were removed: "i'd" appeared twice ("i would", then "i had")
# and "didn't" appeared twice with the same value. A dict literal keeps only
# the last value for a repeated key, so the effective mapping is unchanged.
# NOTE(review): "i'd" therefore still maps to "i had" — confirm that this, and
# not "i would", is the intended expansion.
mispell_dict = {
# Domain-specific misspellings / slang
"tamilans" : "tamilians",
"feku" : "liar",
"quorans" : "people who use quora",
"qoura": "quora",
"xiomi" : "phone",
"ipill" : "contraception",
# Contractions
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "i had",
"i'll" : "i will",
"i'm" : "i am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "i have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
# Generic suffix: the leading space separates the expansion from the stem.
"'re": " are",
"wasn't": "was not",
# NOTE(review): the replacement below starts with a space, unlike the other
# full-word entries — kept as-is to preserve output; confirm it is intended.
"we'll":" we will",
"tryin'":"trying"}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Build the lookup table and its regex once, when this cell is executed,
# instead of re-joining all keys on every call (this runs once per row).
# _MISPELLINGS is the mispell_dict object itself; keys added to mispell_dict
# after this line are not part of the compiled pattern until the cell is re-run.
_MISPELLINGS, _MISPELLINGS_RE = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Replace every occurrence of a ``mispell_dict`` key in ``text``.

    Matching is case-sensitive and is not restricted to word boundaries.

    Args:
        text: Input string.

    Returns:
        ``text`` with each matched key substituted by its dictionary value.
    """
    return _MISPELLINGS_RE.sub(lambda match: _MISPELLINGS[match.group(0)], text)

In [13]:
%%time

train_df = pd.read_csv(PATH+'train.csv', usecols=['question_text', 'target'])
test_df = pd.read_csv(PATH+'test.csv', usecols = ['question_text'])

# 3RD PARTY CLEAN
#train_df["question_text"] = train_df["question_text"].apply(lambda x: x.lower())
#test_df["question_text"] = test_df["question_text"].apply(lambda x: x.lower())

train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_text(x))
test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_text(x))

train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_numbers(x))
test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_numbers(x))

train_df["question_text"] = train_df["question_text"].apply(lambda x: replace_typical_misspell(x))
test_df["question_text"] = test_df["question_text"].apply(lambda x: replace_typical_misspell(x))

# FOR CREATING PROCESSED DATA AND LABELS
train_sentences = train_df['question_text']
train_labels = train_df['target']
test_sentences = test_df['question_text']

print(train_labels[20:30])

del train_df, test_df

20    0
21    0
22    1
23    0
24    0
25    0
26    0
27    0
28    0
29    0
Name: target, dtype: int64
CPU times: user 43.9 s, sys: 120 ms, total: 44 s
Wall time: 44 s


# With multiprocessing

In [6]:
import psutil
from multiprocessing import Pool

# psutil.cpu_count() can return None when the count cannot be determined;
# fall back to 1 so Pool() and np.array_split() always receive a valid integer.
num_cores = psutil.cpu_count() or 1  # number of cores on your machine
num_partitions = num_cores  # number of partitions to split dataframe

print('number of cores:', num_cores)
def df_parallelize_run(df, func):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel and re-join them.

    The frame is split into ``num_partitions`` consecutive row chunks, each
    chunk is handed to ``func`` in a pool of ``num_cores`` worker processes,
    and the results are concatenated in the original chunk order.

    Args:
        df: DataFrame to process.
        func: Callable taking a DataFrame chunk and returning a DataFrame.
            It is sent to worker processes, so it must be picklable.

    Returns:
        The concatenation of ``func(chunk)`` over all chunks.
    """
    # Split positional row indices instead of the frame itself: np.array_split
    # on a DataFrame goes through DataFrame.swapaxes, which recent pandas
    # versions deprecate.
    row_chunks = np.array_split(np.arange(len(df)), num_partitions)
    df_split = [df.iloc[rows] for rows in row_chunks]
    pool = Pool(num_cores)
    try:
        return pd.concat(pool.map(func, df_split))
    finally:
        # Always release the worker processes, even when func raises.
        pool.close()
        pool.join()

number of cores: 16


In [7]:
def preprocess(text):
    """
    Run the full cleaning pipeline on one question.

    Steps, in order:
      1. expand contractions / known misspellings (replace_typical_misspell)
      2. pad punctuation with spaces (clean_text)
      3. mask digit runs with '#' (clean_numbers)

    Contraction expansion has to come first: "'" is one of the characters
    clean_text pads with spaces, so after clean_text a token such as "don't"
    no longer exists and the contraction entries could never match.

    The input is coerced with str() up front (clean_text used to do this as
    the first step), so non-string values such as NaN are accepted.
    """
    text = replace_typical_misspell(str(text))
    text = clean_text(text)
    text = clean_numbers(text)
    return text

def text_clean_wrapper(df):
    """Return a copy of ``df`` whose ``question_text`` column is preprocessed.

    Uses ``DataFrame.assign`` instead of assigning into ``df``, so the frame
    passed in is left untouched (no in-place mutation of a chunk that may be a
    slice of a larger frame).

    Args:
        df: DataFrame with a ``question_text`` column.

    Returns:
        A new DataFrame with the same columns, where ``question_text`` holds
        ``preprocess(value)`` for every row.
    """
    return df.assign(question_text=df["question_text"].apply(preprocess))

In [8]:
%%time

train_df = pd.read_csv(PATH+'train.csv', usecols=['question_text', 'target'])
test_df = pd.read_csv(PATH+'test.csv', usecols = ['question_text'])

train_df = df_parallelize_run(train_df, text_clean_wrapper)
test_df = df_parallelize_run(test_df, text_clean_wrapper)

# FOR CREATING PROCESSED DATA AND LABELS
train_sentences = train_df['question_text']
train_labels = train_df['target']
test_sentences = test_df['question_text']

print(train_labels[20:30])

del train_df, test_df

20    0
21    0
22    1
23    0
24    0
25    0
26    0
27    0
28    0
29    0
Name: target, dtype: int64
CPU times: user 2.86 s, sys: 801 ms, total: 3.66 s
Wall time: 9.18 s


In [21]:
# value_counts() orders by frequency, so plain unpacking only works while
# label 0 happens to be the majority class. sort_index() orders the counts by
# label value instead, making n_0 the count of label 0 and n_1 of label 1.
n_0, n_1 = train_labels.value_counts().sort_index()

In [22]:
# Display the two class counts.
n_0, n_1 

(1225312, 80810)

In [49]:
# No earlier cell (in notebook order) defines `beta`, so this cell raised
# NameError on Restart & Run All. 0.9999 is the value that reproduces the
# outputs recorded for this exploratory section (w ≈ 1.0003e-4).
beta = 0.9999
# Weight (1 - beta) / (1 - beta**n) evaluated for the n_1 count.
w = (1-beta)/(1-pow(beta,n_1))

In [50]:
# Display the weight computed for the n_1 count.
w

0.00010003093321952716

In [51]:
# Sanity check: beta ** n_1, the term subtracted from 1 in the denominator of `w`.
pow(beta,n_1)

0.00030923653856437924

In [52]:
# Ratio of the two weight denominators, (1 - beta**n_1) / (1 - beta**n_0).
(1-pow(beta,n_1))/(1-pow(beta,n_0))

0.9996907634614356

In [64]:
# Define beta in this cell: in notebook order the (N - 1) / N definition only
# appears in a later cell, so running top to bottom either failed or used a
# stale value. (N - 1) / N reproduces the recorded w_ll output (1.2761e-05).
beta = (len(train_labels)-1)/len(train_labels)
# Weight (1 - beta) / (1 - beta**n) for the n_1 count.
w_1 = (1-beta)/(1-pow(beta,n_1))

In [57]:
# Difference between the two class counts as a fraction of n_0.
(n_0-n_1)/n_0

0.9340494502624638

In [68]:
# beta = (N - 1) / N, where N is the number of training labels.
beta = (len(train_labels)-1)/len(train_labels)

In [69]:
# (batch_size, 1) tensor with every entry equal to w_1.
# NOTE(review): the shape is tied to batch_size — confirm how a smaller final batch is handled.
w_ll = torch.ones([batch_size,1])*w_1

In [70]:
# Ratio (1 - beta**n_1) / (1 - beta**n_0). Since the (1 - beta) numerators
# cancel, this equals the n_0 weight divided by the n_1 weight.
p_w = (1-pow(beta,n_1))/(1-pow(beta,n_0))

In [71]:
# Display w_ll scaled by p_w.
p_w*w_ll

tensor([[1.2579e-06],
        [1.2579e-06],
        [1.2579e-06],
        ...,
        [1.2579e-06],
        [1.2579e-06],
        [1.2579e-06]])

In [72]:
# Display the unscaled weight tensor for comparison.
w_ll

tensor([[1.2761e-05],
        [1.2761e-05],
        [1.2761e-05],
        ...,
        [1.2761e-05],
        [1.2761e-05],
        [1.2761e-05]])

In [84]:
# beta = (N - 1) / N, where N is the number of training labels.
beta = (len(train_labels)-1)/len(train_labels)
# sort_index() orders the counts by label value (0, then 1) instead of by
# frequency, so the unpacking no longer depends on which class is larger.
# Assumes label 0 = sincere and label 1 = insincere — TODO confirm.
n_sincere, n_insincere = train_labels.value_counts().sort_index()
# Per-class weight of the form (1 - beta) / (1 - beta**n).
weight_insincere = (1-beta)/(1-pow(beta,n_insincere))
# (batch_size, 1) tensor filled with weight_insincere.
logit_weights = torch.ones([batch_size,1])*weight_insincere
# Equals weight_sincere / weight_insincere (the (1 - beta) factors cancel).
pos_weight = torch.tensor((1-pow(beta,n_insincere))/(1-pow(beta,n_sincere)))
# NOTE(review): logit_weights * pos_weight works out to the *sincere* weight.
# If these are passed as weight= / pos_weight= to a BCE-with-logits loss (usage
# not visible here), label-1 samples would receive the smaller sincere weight
# and label-0 samples the larger insincere weight — confirm this is not inverted.

In [85]:
# Display the weight computed from n_insincere.
weight_insincere

1.2761461192400608e-05

In [86]:
# Display the ratio stored in pos_weight.
pos_weight

tensor(0.0986)

In [76]:
# Display logit_weights scaled by pos_weight.
logit_weights*pos_weight

tensor([[1.2579e-06],
        [1.2579e-06],
        [1.2579e-06],
        ...,
        [1.2579e-06],
        [1.2579e-06],
        [1.2579e-06]])

In [77]:
# Display the unscaled logit_weights tensor for comparison.
logit_weights

tensor([[1.2761e-05],
        [1.2761e-05],
        [1.2761e-05],
        ...,
        [1.2761e-05],
        [1.2761e-05],
        [1.2761e-05]])

In [87]:
# beta = (N - 1) / N, where N is the number of training labels.
beta = (len(train_labels)-1)/len(train_labels)
# sort_index() orders the counts by label value (0, then 1) instead of by
# frequency, so the unpacking no longer depends on which class is larger.
# Assumes label 0 = sincere and label 1 = insincere — TODO confirm.
n_sincere, n_insincere = train_labels.value_counts().sort_index()
# Per-class weights of the form (1 - beta) / (1 - beta**n): the class with
# fewer samples gets the larger weight.
weight_insincere = (1-beta)/(1-pow(beta,n_insincere))
weight_sincere = (1-beta)/(1-pow(beta,n_sincere))

In [88]:
# Display both per-class weights side by side.
weight_sincere, weight_insincere

(1.2579257144344684e-06, 1.2761461192400608e-05)