In [None]:
import gc
import re
import pandas as pd
import random
import numpy as np
from unidecode import unidecode
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import string
import re
import math
import operator
import time
import os
os.environ['OMP_NUM_THREADS'] = '4'
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier

In [None]:
def set_seed(seed=42):
    """Seed the global Python and NumPy random number generators.

    Also exports ``PYTHONHASHSEED`` so that Python interpreters started
    afterwards (e.g. worker subprocesses) inherit the same hash seed; the
    variable is only read at interpreter start-up, so it does not change
    hashing in the already-running kernel.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and
            ``PYTHONHASHSEED``.
    """
    random.seed(seed)
    # Previously hard-coded to '0', which silently ignored the `seed` argument.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

In [None]:
# GENERAL HYPERPARAMS
num_folds = 5  # number of StratifiedKFold splits used in the CV loop
seed = 42      # shared seed for set_seed() and the models' random_state

# HYPERPARAMS FOR TEXT PROCESSING
# NOTE(review): max_features / maxlen are not referenced by any cell visible here.
max_features = 200000
maxlen = 100

# HYPERPARAMS FOR NN
# NOTE(review): batch_size / epochs / embed_size are not referenced by any cell visible here.
batch_size = 1024
epochs = 2
embed_size = 300

# Seed the global RNGs before any data is loaded or any model is built.
set_seed(seed)

# PATH TO DATA DIRECTORY
# train.csv, test.csv and sample_submission.csv are read from this directory.
PATH = "../input/"

In [None]:
# Characters that clean_text() isolates by surrounding them with spaces.
# Besides real punctuation, the set contains a few accented letters
# ('Â', 'à', 'â', 'é', 'è', 'Ã', 'ï', 'Ø').
puncts = {',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√'}

# Translation table built once at definition time instead of on every call.
# Each character in `puncts` maps to itself padded with one space on each side.
_PUNCT_TABLE = str.maketrans({punct: ' ' + punct + ' ' for punct in puncts})

def clean_text(x):
    """Surround every character listed in ``puncts`` with spaces.

    The previous implementation mapped every character to the literal text
    ``' {punct} '`` (the placeholder was never formatted), so punctuation was
    replaced by the word ``{punct}`` instead of being kept and padded.

    Args:
        x: Any object; it is converted with ``str()`` first.

    Returns:
        The string with each ``puncts`` character replaced by
        ``' <char> '``. Other characters are left untouched.
    """
    return str(x).translate(_PUNCT_TABLE)

def clean_numbers(x):
    """Mask runs of ASCII digits with ``#`` placeholders.

    Runs of five or more digits become ``#####``; remaining groups of four,
    three and two digits become ``####``, ``###`` and ``##`` respectively.
    Isolated single digits are left as they are.

    Args:
        x: Input string.

    Returns:
        The string with digit groups masked.
    """
    if re.search(r'\d', x) is None:
        return x
    # Order matters: the longest runs are masked first.
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, placeholder in masks:
        x = re.sub(pattern, placeholder, x)
    return x

# Lower-case contraction -> expansion table used by replace_typical_misspell().
#
# Changes from the previous literal:
#   * "we'll" expanded to " will", dropping the pronoun; it is now "we will".
#   * "i'd" appeared twice ("I would", then "I had"), so the second entry
#     silently overrode the first. A single entry is kept, using "would" to
#     match every other "'d" contraction in this table.
#   * "didn't" appeared twice with the same value; the duplicate is removed.
mispell_dict = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    # Generic suffix rule; listed after the specific "...'re" entries.
    "'re": " are",
    "wasn't": "was not",
    "we'll": "we will",
    "tryin'": "trying",
}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Build the lookup table and its matching regex once, at definition time.
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Expand every substring of ``text`` that matches a ``mispellings`` key.

    Matching is case-sensitive and is not anchored to word boundaries; each
    match is replaced by the value stored under the matched text.

    Args:
        text: Input string.

    Returns:
        The string with all matches substituted.
    """
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)

# Word Features

In [None]:
def legacy_round(number, points=0):
    """Round half away from zero to ``points`` decimal places.

    The previous formula, ``floor(number * p + copysign(0.5, number)) / p``,
    was only correct for non-negative input: for negative values the floor of
    ``x - 0.5`` moved every non-integer a full step away from zero
    (e.g. ``-2.1 -> -3.0``). Rounding is now done on the magnitude and the
    sign is restored afterwards.

    Args:
        number: Value to round.
        points: Number of decimal places to keep.

    Returns:
        The rounded value as a float.
    """
    p = 10 ** points
    magnitude = math.floor(abs(number) * p + 0.5)
    return math.copysign(magnitude, number) / p

def char_count(text, ignore_spaces=True):
    """Return the number of characters in ``text``.

    Args:
        text: Input string.
        ignore_spaces: When true, space characters (``" "`` only) are not
            counted.

    Returns:
        Character count as an int.
    """
    return len(text.replace(" ", "")) if ignore_spaces else len(text)

def lexicon_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())
    
def syllable_count(text):
    """Estimate the number of syllables in ``text`` with a vowel-group heuristic.

    For each word (after lower-casing and removing ASCII punctuation) a
    syllable is counted for a leading vowel and for every vowel that follows a
    non-vowel; a trailing ``e`` subtracts one and a trailing ``le`` adds one
    back. Every word contributes at least one syllable.

    The previous implementation applied the "at least one" floor to the
    running total instead of to each word, so words such as ``the`` counted as
    zero syllables whenever they were not the first word. For single-word
    input the result is unchanged.

    Args:
        text: Input string.

    Returns:
        Estimated syllable count as an int; 0 when the text contains no words.
    """
    vowels = set('aeiouy')
    # Built once rather than once per character.
    punctuation = set(string.punctuation)
    cleaned = ''.join(ch for ch in text.lower() if ch not in punctuation)

    total = 0
    for word in cleaned.split():
        count = 1 if word[0] in vowels else 0
        # A vowel starts a new syllable only when preceded by a non-vowel.
        for previous, current in zip(word, word[1:]):
            if current in vowels and previous not in vowels:
                count += 1
        if word.endswith('e'):
            count -= 1
        if word.endswith('le'):
            count += 1
        total += max(count, 1)
    return total

def sentence_count(text):
    """Count sentences in ``text``, ignoring fragments of two words or fewer.

    The text is split where ``.``, ``?`` or ``!`` (optionally followed by
    closing quotes/brackets) is followed by a space, ``|`` or newline and then
    an upper-case ASCII letter.

    Args:
        text: Input string.

    Returns:
        Number of fragments with more than two words, never less than 1.
    """
    fragments = re.split(r' *[\.\?!][\'"\)\]]*[ |\n](?=[A-Z])', text)
    too_short = sum(1 for fragment in fragments if len(fragment.split()) <= 2)
    return max(1, len(fragments) - too_short)
    
def avg_sentence_length(text):
    """Return the average number of words per sentence, rounded to 1 decimal.

    Returns 0.0 if the sentence count is zero.
    """
    words = lexicon_count(text)
    sentences = sentence_count(text)
    if sentences == 0:
        return 0.0
    return legacy_round(words / sentences, 1)

def avg_syllables_per_word(text):
    """Return the average syllables per word, rounded to 1 decimal.

    Returns 0.0 when the text contains no words.
    """
    syllables = syllable_count(text)
    word_total = lexicon_count(text)
    if word_total == 0:
        return 0.0
    return legacy_round(syllables / word_total, 1)

def avg_letter_per_word(text):
    """Return the average non-space characters per word, rounded to 2 decimals.

    Returns 0.0 when the text contains no words.
    """
    characters = char_count(text)
    word_total = lexicon_count(text)
    if word_total == 0:
        return 0.0
    return legacy_round(characters / word_total, 2)

def avg_sentence_per_word(text):
    """Return sentences divided by words, rounded to 2 decimals.

    Returns 0.0 when the text contains no words.
    """
    sentences = sentence_count(text)
    word_total = lexicon_count(text)
    if word_total == 0:
        return 0.0
    return legacy_round(sentences / word_total, 2)
        
def flesch_reading_ease(text):
    """Return the Flesch Reading Ease score of ``text``, rounded to 2 decimals.

    Computed as ``206.835 - 1.015 * ASL - 84.6 * ASW`` where ASL is the
    average sentence length and ASW the average syllables per word.
    """
    asl = avg_sentence_length(text)
    asw = avg_syllables_per_word(text)
    return legacy_round(206.835 - 1.015 * asl - 84.6 * asw, 2)

def flesch_kincaid_grade(text):
    """Return the Flesch-Kincaid grade level of ``text``, rounded to 1 decimal.

    Computed as ``0.39 * ASL + 11.8 * ASW - 15.59`` where ASL is the average
    sentence length and ASW the average syllables per word.
    """
    asl = avg_sentence_length(text)
    asw = avg_syllables_per_word(text)
    return legacy_round(0.39 * asl + 11.8 * asw - 15.59, 1)

def polysyllabcount(text):
    """Return how many whitespace-separated words have three or more syllables."""
    return sum(1 for word in text.split() if syllable_count(word) >= 3)

def smog_index(text):
    """Return the SMOG index of ``text``, rounded to 1 decimal.

    Texts with fewer than three counted sentences yield 0.0. Otherwise the
    score is ``1.043 * sqrt(30 * polysyllables / sentences) + 3.1291``.
    """
    sentences = sentence_count(text)
    if sentences < 3:
        return 0.0
    # `sentences` is at least 3 here, so the division cannot be by zero.
    poly_syllab = polysyllabcount(text)
    smog = (1.043 * (30 * (poly_syllab / sentences)) ** .5) + 3.1291
    return legacy_round(smog, 1)

def coleman_liau_index(text):
    """Return the Coleman-Liau index of ``text``, rounded to 2 decimals.

    Computed as ``0.058 * L - 0.296 * S - 15.8`` where L is letters per 100
    words and S is sentences per 100 words.
    """
    letters_per_100 = legacy_round(avg_letter_per_word(text)*100, 2)
    sentences_per_100 = legacy_round(avg_sentence_per_word(text)*100, 2)
    score = (0.058 * letters_per_100) - (0.296 * sentences_per_100) - 15.8
    return legacy_round(score, 2)

def automated_readability_index(text):
    """Return the Automated Readability Index of ``text``, rounded to 1 decimal.

    Computed as ``4.71 * (chars / words) + 0.5 * (words / sentences) - 21.43``
    with both ratios rounded to 2 decimals first. Returns 0.0 when either
    divisor is zero.
    """
    characters = char_count(text)
    word_total = lexicon_count(text)
    sentences = sentence_count(text)
    if word_total == 0 or sentences == 0:
        return 0.0
    chars_per_word = characters / word_total
    words_per_sentence = word_total / sentences
    score = (4.71 * legacy_round(chars_per_word, 2)) + (0.5 * legacy_round(words_per_sentence, 2)) - 21.43
    return legacy_round(score, 1)

def linsear_write_formula(text):
    """Return the Linsear Write score computed on the first 100 words.

    Words with fewer than three syllables score 1 point and all others score
    3; the total is divided by the sentence count of the 100-word sample.
    If that provisional value is at most 20, 2 is subtracted; the result is
    then halved.
    """
    sample_words = text.split()[:100]
    difficult = sum(1 for word in sample_words if syllable_count(word) >= 3)
    easy = len(sample_words) - difficult
    sample_text = ' '.join(sample_words)
    provisional = (easy * 1 + difficult * 3) / sentence_count(sample_text)
    if provisional <= 20:
        provisional -= 2
    return provisional / 2

# Metric

In [None]:
def f1_score(true,pred):
    """Return the F1 score of ``pred`` against ``true`` at a 0.5 threshold.

    Predictions are binarised with ``pred > 0.5``; precision and recall are
    built from sums that each have ``K.epsilon()`` added.

    NOTE(review): ``K`` is not imported in the cells visible here — presumably
    the Keras backend module; confirm it is in scope before calling this.
    """
    # Binarise the predictions (sigmoid output, threshold = 0.5).
    hard_pred = K.cast(K.greater(pred,0.5), K.floatx())

    actual_pos = K.sum(true) + K.epsilon()
    true_pos = K.sum(true * hard_pred) + K.epsilon()
    predicted_pos = K.sum(hard_pred) + K.epsilon()

    precision = true_pos / predicted_pos
    recall = true_pos / actual_pos

    return (2 * precision * recall) / (precision + recall)

# Misc Functions

In [None]:
def threshold_search(y_true, y_proba):
    """Find the decision threshold that maximises F1 on a precision-recall curve.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Predicted scores/probabilities for the positive class.

    Returns:
        dict with keys ``'threshold'`` (best threshold) and ``'f1'`` (F1 score
        at that threshold).
    """
    precision, recall, thresholds = metrics.precision_recall_curve(y_true, y_proba)
    # precision/recall carry one more point than thresholds; pad with a value
    # above any probability so the arrays can be indexed together.
    thresholds = np.append(thresholds, 1.001)
    # Points with zero precision or recall make the harmonic-mean formula
    # divide by zero (the result is the intended F1 of 0). Silence the
    # resulting RuntimeWarning locally; the computed values are unchanged.
    with np.errstate(divide='ignore'):
        F = 2/(1/precision + 1/recall)
    best_index = np.argmax(F)
    return {'threshold': thresholds[best_index], 'f1': F[best_index]}

In [None]:
def clean_text_for_features(x):
    """Transliterate ``x`` with ``unidecode`` and drop disallowed characters.

    After transliteration, only ASCII letters, spaces and the characters
    ``. - ? ! , # @ %`` are kept; everything else (digits included) is
    removed.

    Args:
        x: Input string.

    Returns:
        The cleaned string.
    """
    disallowed = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]',re.IGNORECASE)
    return disallowed.sub('', unidecode(x))

In [None]:
def add_features(df, function_list):
    """Add text-statistics columns derived from ``df['question_text']``.

    The frame is modified in place and also returned. ``question_text`` is
    first coerced to ``str``. One column is added per function in
    ``function_list`` (named after the function's ``__name__``), followed by
    ``total_length``, ``capitals``, ``caps_vs_length``, ``num_words``,
    ``num_unique_words`` and ``words_vs_unique``.

    Args:
        df: DataFrame with a ``question_text`` column.
        function_list: Callables taking one string and returning a value.

    Returns:
        The same DataFrame object with the new columns.
    """
    df['question_text'] = df['question_text'].apply(str)
    questions = df['question_text']

    for text_function in function_list:
        df[text_function.__name__] = questions.apply(text_function)

    df['total_length'] = questions.apply(len)
    df['capitals'] = questions.apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # Vectorised equivalent of the former row-wise df.apply(axis=1); the +1
    # keeps the ratio defined for empty strings.
    df['caps_vs_length'] = df['capitals'] / (df['total_length'] + 1)
    # Raw string: '\S' in a normal string literal is an invalid escape sequence.
    df['num_words'] = questions.str.count(r'\S+')
    df['num_unique_words'] = questions.apply(lambda comment: len(set(comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / (df['num_words']+1)
    gc.collect()
    return df

In [None]:
# Per-question text statistics applied by add_features(); each function
# becomes one feature column named after the function.
text_function_list = [char_count, lexicon_count, syllable_count, sentence_count, avg_letter_per_word, avg_sentence_length, avg_sentence_per_word, avg_syllables_per_word, flesch_kincaid_grade, flesch_reading_ease, polysyllabcount, smog_index, coleman_liau_index, automated_readability_index, linsear_write_formula]
#text_function_list = []

# Begin Main

In [None]:
# Load only the columns used below: the question text (train and test) and
# the target label (train only).
train_df = pd.read_csv(PATH+'train.csv', usecols=['question_text', 'target'])
test_df = pd.read_csv(PATH+'test.csv', usecols = ['question_text'])

In [None]:
# Normalise the raw question text in place before computing features.
# str(x) guards against non-string cells (e.g. missing values).
train_df['question_text'] = train_df['question_text'].apply(lambda x: clean_text_for_features(str(x)))
test_df['question_text'] = test_df['question_text'].apply(lambda x: clean_text_for_features(str(x)))

In [None]:
%%time
# CREATE TEXT FEATURES
# add_features() mutates the frame it receives and returns it, so the
# reassignments below keep the same objects with the extra columns.
train_df = add_features(train_df, text_function_list)
test_df = add_features(test_df, text_function_list)

In [None]:
# SAVE AND PROCESS FEATURES TO SEND TO NN
# Keep only the engineered columns as model inputs; the raw text and the
# target are dropped from the feature matrices.
train_features = train_df.drop(['question_text', 'target'], axis=1)
test_features = test_df.drop(['question_text'], axis=1)
train_labels = train_df['target']
# The source frames are no longer needed once features and labels are extracted.
del train_df, test_df

In [None]:
# Standardise the features. The scaler is fitted on train and test rows
# stacked together, then applied to each set separately.
# NOTE(review): fitting on the test rows lets test-set statistics influence
# the training features — confirm this is intended.
ss = StandardScaler()
ss.fit(np.vstack((train_features, test_features)))
train_features = ss.transform(train_features)
test_features = ss.transform(test_features)

# Release the fitted scaler; it is not used again in the visible cells.
del ss
gc.collect()

In [None]:
# Sanity check: display the (rows, columns) shape of both scaled feature matrices.
train_features.shape, test_features.shape

In [None]:
# TO SAVE FINAL PREDICTIONS
# All buffers are column vectors of shape (n_rows, 1).
# final_preds_*: test-set probabilities accumulated (summed) across folds.
# oof_preds_*: out-of-fold probabilities for the training rows.
final_preds_rf = np.zeros((test_features.shape[0], 1))
final_preds_lgbm = np.zeros((test_features.shape[0], 1))
oof_preds_rf = np.zeros((train_features.shape[0], 1))
oof_preds_lgbm = np.zeros((train_features.shape[0], 1))

In [None]:
%%time
# FOLDS FOR CV
# For every stratified fold: fit a random forest and a LightGBM classifier on
# the training part, store their out-of-fold probabilities for the validation
# part, and add their test-set probabilities to the running totals (divided by
# num_folds later, when the submission is built).
folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_features, train_labels)):
    print('FOLD NUMBER {}:'.format(n_fold+1))
    # The fold indices are positional, so the label Series is indexed with
    # .iloc rather than by label (the two only coincide for a default index).
    train_x, train_y = train_features[train_idx], train_labels.iloc[train_idx]
    valid_x, valid_y = train_features[valid_idx], train_labels.iloc[valid_idx]

    # RF
    rf = RandomForestClassifier(n_estimators=1400, max_depth=8, random_state=seed, n_jobs=-1, verbose=1)
    rf.fit(train_x, train_y)
    rf_preds = rf.predict_proba(valid_x)[:, 1]
    oof_preds_rf[valid_idx] = rf_preds.reshape(-1, 1)

    #LGBM
    # NOTE(review): `silent`, `nthread` and the `verbose` /
    # `early_stopping_rounds` keywords of fit() may not be accepted by newer
    # LightGBM releases — confirm against the installed version.
    lgbm = LGBMClassifier(
        nthread=4,
        max_depth=8,
        min_split_gain=0.0222415,
        min_child_weight=40,
        silent=False,
        verbose=-1,
        n_estimators=1400,
        num_leaves=77,
        learning_rate=0.007641070180129345,
        min_child_samples=460,
        subsample_for_bin=240000,
        reg_lambda=0.2040816326530612,
        reg_alpha=0.8775510204081632,
        subsample=0.9494949494949496,
        colsample_bytree=0.7333333333333333
    )

    lgbm.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
             eval_metric='auc', verbose=500, early_stopping_rounds=200)
    lgbm_preds = lgbm.predict_proba(valid_x, num_iteration=lgbm.best_iteration_)[:, 1]
    # Previously reshaped with rf_preds.shape, which only worked because both
    # prediction vectors happen to have the same length.
    oof_preds_lgbm[valid_idx] = lgbm_preds.reshape(-1, 1)

    print('>>\t PREDICTING!')
    temp_preds_rf = rf.predict_proba(test_features)[:, 1]
    final_preds_rf += temp_preds_rf.reshape(-1, 1)

    temp_preds_lgbm = lgbm.predict_proba(test_features, num_iteration=lgbm.best_iteration_)[:, 1]
    final_preds_lgbm += temp_preds_lgbm.reshape(-1, 1)
    print('>>\t PREDICTING DONE!\n')

    # Free per-fold objects before the next iteration.
    del train_y, valid_y, train_x, valid_x, rf, temp_preds_rf, rf_preds, lgbm, temp_preds_lgbm, lgbm_preds
    gc.collect()

In [None]:
# Best F1 threshold for each model separately, from its out-of-fold predictions.
optimal_threshold_rf = threshold_search(train_labels, oof_preds_rf)
optimal_threshold_lgbm = threshold_search(train_labels, oof_preds_lgbm)
print(optimal_threshold_rf,  '\n', optimal_threshold_lgbm)

In [None]:
# Best F1 threshold for the equal-weight average of both models' out-of-fold
# predictions; this is the threshold used for the submission.
optimal_threshold = threshold_search(train_labels, (oof_preds_rf+oof_preds_lgbm)/2)
print(optimal_threshold)

In [None]:
# Equal-weight blend of the two models' test predictions.
final_preds = (final_preds_lgbm + final_preds_rf)/2
# The per-model buffers hold sums over folds; divide to get the fold average.
final_preds = final_preds/num_folds
print('>>\t CREATING FINAL SUBMISSION FILE!')
# Binarise with the threshold tuned on the blended out-of-fold predictions.
final_preds = (final_preds > optimal_threshold['threshold']).astype(int)
sample = pd.read_csv(PATH+'sample_submission.csv')
# NOTE(review): final_preds has shape (n_rows, 1); this relies on pandas
# accepting a single-column 2-D array for a column assignment — confirm.
sample['prediction'] = final_preds
sample.to_csv('submission.csv', index=False)
print('>>\t CREATING FINAL SUBMISSION FILE \t DONE!')

In [None]:
# Run a final garbage-collection pass.
gc.collect()

In [None]:
# Display the submitted class distribution: one row per predicted label with its count.
np.array(np.unique(final_preds, return_counts=True)).T