In [1]:
import numpy as np
import pandas as pd

In [2]:
# The optional essays (3 and 4) are forced to `object` dtype when reading both CSVs.
essay_dtypes = {"project_essay_3": object, "project_essay_4": object}

train = pd.read_csv("../input/train.csv", dtype=essay_dtypes)
test = pd.read_csv("../input/test.csv", dtype=essay_dtypes)
resources = pd.read_csv("../input/resources.csv")

# Split the label off the training frame before any feature work happens.
target = train.pop('project_is_approved')

# Every missing cell is replaced by the placeholder string 'unk'.
train = train.fillna('unk')
test = test.fillna('unk')

In [3]:
from sklearn import preprocessing
from tqdm import tqdm
import gc
from sklearn.preprocessing import LabelEncoder

# Columns that are replaced by integer label codes.
features = [
    'teacher_id',
    'teacher_prefix',
    'school_state',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
]

# Train and test are stacked so that each encoder is fitted on the labels of both.
df_all = pd.concat([train, test], axis=0)

for column in tqdm(features):
    encoder = LabelEncoder().fit(df_all[column].astype(str))
    for frame in (train, test):
        frame[column] = encoder.transform(frame[column].astype(str))

100%|██████████| 6/6 [00:01<00:00,  4.46it/s]


In [4]:
# Feature engineering

def _expand_submission_time(frame):
    """Add date/time features derived from `project_submitted_datetime`.

    The columns `datetime_int`, `year`, `month`, `hour`, `month_Day` and
    `datetime_dow` are written onto `frame`; the returned frame is the same
    data without the raw timestamp column.
    """
    stamp = pd.to_datetime(frame['project_submitted_datetime'])

    # Date as int may contain some ordinal value (YYYYMMDD)
    frame['datetime_int'] = stamp.dt.strftime('%Y%m%d').astype(int)

    # Date parts (the weekday / day-of-year variants stay disabled, as before)
    frame["year"] = stamp.dt.year
    frame["month"] = stamp.dt.month
    frame["hour"] = stamp.dt.hour
    frame["month_Day"] = stamp.dt.day
    frame['datetime_dow'] = stamp.dt.dayofweek

    return frame.drop('project_submitted_datetime', axis=1)


train = _expand_submission_time(train)
test = _expand_submission_time(test)

# Essay length (character count of the first two essays).
# Each frame is measured on its OWN essays: the previous version filled the
# test columns from `train`, so test rows received the lengths of unrelated
# training essays that merely shared the same row index.
for frame in (train, test):
    frame['e1_length'] = frame['project_essay_1'].apply(len)
    frame['e2_length'] = frame['project_essay_2'].apply(len)

# Character-count features and the "more than two essays" flag, written onto both frames.
for frame in (train, test):
    # Title length
    frame['project_title_len'] = frame['project_title'].astype(str).str.len()

    # Project resource summary length
    frame['project_resource_summary_len'] = frame['project_resource_summary'].astype(str).str.len()

    # Has more than 2 essays? Essay 3 holds the 'unk' placeholder when it is absent.
    frame['has_gt2_essays'] = (frame['project_essay_3'] != 'unk').astype(int)

In [5]:
# Cost of one resource line = quantity * unit price.
resources['resources_total'] = resources['quantity'] * resources['price']

# One row per project id; each entry below is one aggregate feature table.
by_project = resources.groupby(['id'], as_index=False)
per_project_stats = [
    # total cost of the project
    by_project[['resources_total']].sum(),
    # mean cost per resource line
    by_project[['resources_total']].mean().rename(
        columns={'resources_total': 'resources_total_mean'}),
    # number of resource lines
    by_project[['quantity']].count().rename(
        columns={'quantity': 'resources_quantity_count'}),
    # number of requested items
    by_project[['quantity']].sum().rename(
        columns={'quantity': 'resources_quantity_sum'}),
]

# Left-join every aggregate onto both frames; gaps left by the join become -1.
for stats in per_project_stats:
    train = pd.merge(train, stats, how='left', on='id').fillna(-1)
    test = pd.merge(test, stats, how='left', on='id').fillna(-1)

# We're done with IDs for now
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

In [6]:
# Free-text columns, in the order in which they are concatenated.
text_columns = [
    'project_title',
    'project_essay_1',
    'project_essay_2',
    'project_essay_3',
    'project_essay_4',
    'project_resource_summary',
]


def _join_text(row):
    """Return the row's text columns as one space-separated string."""
    return ' '.join([str(row[name]) for name in text_columns])


train['project_essay'] = train.apply(_join_text, axis=1)
test['project_essay'] = test.apply(_join_text, axis=1)

# The individual text columns are no longer needed once merged.
train = train.drop(text_columns, axis=1)
test = test.drop(text_columns, axis=1)

In [7]:
from nltk.corpus import stopwords
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import punkt
import re

# Shared NLTK helpers: the lemmatizer applied to every token, and the
# whitespace tokenizer that produces those tokens.
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

# Apostrophe variants that may introduce a contraction suffix.
_APOSTROPHE = r"['’´ʻ]"

# (pattern, replacement) pairs, applied in this order. The suffix rules require a
# word character before the apostrophe and a word boundary after the suffix, so a
# quoted word such as 'the or 'students is not mistaken for a contraction.
_CONTRACTION_RULES = [
    (re.compile(r"\bwon%st\b" % _APOSTROPHE), "will not"),
    (re.compile(r"\bcan%st\b" % _APOSTROPHE), "can not"),
    (re.compile(r"n%st\b" % _APOSTROPHE), " not"),
    (re.compile(r"(?<=\w)%sre\b" % _APOSTROPHE), " are"),
    (re.compile(r"(?<=\w)%ss\b" % _APOSTROPHE), " is"),
    (re.compile(r"(?<=\w)%sd\b" % _APOSTROPHE), " would"),
    (re.compile(r"(?<=\w)%sll\b" % _APOSTROPHE), " will"),
    (re.compile(r"(?<=\w)%st\b" % _APOSTROPHE), " not"),
    (re.compile(r"(?<=\w)%sve\b" % _APOSTROPHE), " have"),
    (re.compile(r"(?<=\w)%sm\b" % _APOSTROPHE), " am"),
]

# A literal backslash followed by 'r' or 'n' (escape sequences stored as text).
_LITERAL_NEWLINES = re.compile(r"\\r|\\n")

# Any run of non-word characters.
_NON_WORD = re.compile(r"\W+")


def prep_text(text):
    """Normalise a raw text string and return its lemmatized tokens.

    Steps, in order:
      1. strip surrounding whitespace and lower-case;
      2. replace literal ``\\r`` / ``\\n`` character pairs with a space;
      3. expand English contractions (``won't`` -> ``will not``, ``'re`` -> `` are``, ...);
      4. collapse every run of non-word characters into a single space
         (this also removes double quotes);
      5. split on whitespace and lemmatize each token.

    The non-word clean-up used to run first, which removed the apostrophes and
    backslashes that steps 2 and 3 look for, so those steps never matched anything.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    text = text.strip().lower()
    text = _LITERAL_NEWLINES.sub(" ", text)
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    text = _NON_WORD.sub(" ", text)
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

# Replace each combined essay string by its list of cleaned tokens.
for frame in (train, test):
    frame['project_essay'] = frame['project_essay'].apply(prep_text)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level tf-idf over unigrams and bigrams, limited to 8000 features.
# min_df is 1 rather than 0: a term in the fitted vocabulary always occurs in at
# least one document, so both values select the same terms, while an integer 0 is
# rejected by recent scikit-learn parameter validation (NOTE(review): confirm
# against the installed version).
tfv = TfidfVectorizer(
    norm='l2',
    min_df=1,
    max_features=8000,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=False,
    sublinear_tf=True,
    stop_words='english',
)

In [9]:
# The essays are stored as token lists; join them back into plain strings.
train_text = train['project_essay'].map(' '.join)
test_text = test['project_essay'].map(' '.join)

# Fitting tfidf on train + test might be leaky
corpus = train_text.tolist() + test_text.tolist()
tfv.fit(corpus)

train_tfv, test_tfv = tfv.transform(train_text), tfv.transform(test_text)

In [10]:
from scipy.sparse import hstack, csr_matrix
# Every column except the token lists, converted to sparse matrices.
feat_train = csr_matrix(train.drop('project_essay', axis=1).values)
feat_test = csr_matrix(test.drop('project_essay', axis=1).values)

# Place the tf-idf columns to the right of the tabular features, row for row.
n_train_rows, n_test_rows = feat_train.shape[0], feat_test.shape[0]
X_train_stack = hstack([feat_train, train_tfv[:n_train_rows]])
X_test_stack = hstack([feat_test, test_tfv[:n_test_rows]])

print('Train shape: ', X_train_stack.shape, '\n\nTest Shape: ', X_test_stack.shape)

Train shape:  (182080, 8022) 

Test Shape:  (78035, 8022)


In [None]:
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.model_selection import train_test_split
import random
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

print("Building model using Light GBM and finding AUC(Area Under Curve)")

n_splits = 5
n_repeats = 1
kf = RepeatedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=28)

# The hyper-parameters are the same for every fold, so they are built once.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 7,
    'num_leaves': 32,
    'learning_rate': 0.02,
    'feature_fraction': 0.80,
    'bagging_fraction': 0.80,
    'bagging_freq': 5,
    'verbose': 0,
    'lambda_l2': 1,
}

# Row-indexable copies of the training data: CSR supports selecting rows by an
# index array, and the labels become a plain array so positional indexing is safe.
X_all = X_train_stack.tocsr()
y_all = np.asarray(target)

cnt = 0                                  # number of folds completed
auc_buf = []                             # validation AUC of each fold
p_buf = np.zeros(X_test_stack.shape[0])  # running sum of test predictions

# Use the indices produced by the splitter. The previous version ignored them and
# drew a fresh train_test_split with random_state=random.seed(28), which is None,
# so the "folds" were unseeded random 80/20 splits rather than a K-fold partition.
for train_index, valid_index in kf.split(X_all):
    X_train, X_valid = X_all[train_index], X_all[valid_index]
    y_train, y_valid = y_all[train_index], y_all[valid_index]
    print('Fold {}/{}'.format(cnt + 1, n_splits * n_repeats))

    model = lgb.train(
        params,
        lgb.Dataset(X_train, y_train),
        num_boost_round=10000,
        valid_sets=[lgb.Dataset(X_valid, y_valid)],
        early_stopping_rounds=50,
        verbose_eval=100
        )

    # Score the held-out fold at the best iteration found by early stopping.
    p = model.predict(X_valid, num_iteration=model.best_iteration)
    auc = roc_auc_score(y_valid, p)
    print('{} AUC: {}'.format(cnt, auc))
    auc_buf.append(auc)

    # Accumulate this fold's test predictions; they are averaged after the loop.
    p_buf += np.array(model.predict(X_test_stack, num_iteration=model.best_iteration))

    cnt += 1

auc_mean = np.mean(auc_buf)
auc_std = np.std(auc_buf)
print('AUC = {:.6f} +/- {:.6f}'.format(auc_mean, auc_std))

# Mean test prediction over all folds.
lgb_preds = p_buf/cnt

Building model using Light GBM and finding AUC(Area Under Curve)


In [None]:
# Averaged fold predictions as a one-column frame.
l_preds = pd.DataFrame(lgb_preds)
l_preds.columns = ['project_is_approved']

# `sub` was never defined and the `id` column has been dropped from `test`, so the
# ids are read again from the test file. The intervening steps only add columns and
# left-join, so the predictions are still in file order.
submid = pd.read_csv("../input/test.csv", usecols=['id'])['id']
assert len(submid) == len(l_preds), "prediction count does not match the number of test ids"

lsub = pd.concat([submid, l_preds], axis=1)
lsub.to_csv('lgbm_submission.csv', index=False)

# Preview of the file that was just written.
lsub.head()