In [1]:
# import
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split

# Use the fully-qualified option key: the bare 'max_columns' pattern can match
# more than one registered option on newer pandas versions and then raises.
pd.set_option('display.max_columns', 100)
import datetime

import os
os.chdir('/Users/shubhamjain/Downloads/AV/Student Identity/')

In [2]:
# Read the train, test and sample-submission CSVs from the working directory.
print ("loading data...")
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train_HK6lq50.csv', 'test_2nAIblo.csv', 'sample_submission_vaSxamm.csv')
)

loading data...


### Data Cleaning

In [4]:
## Fill missing ages with a fixed value chosen per education level.
## The same table is applied to train first, then to test.
age_fill_by_education = [
    ('Matriculation', 36),
    ('Bachelors', 39),
    ('High School Diploma', 35),
    ('Masters', 43),
    ('No Qualification', 34),
]

for frame in (train, test):
    for education_level, fill_age in age_fill_by_education:
        needs_fill = (frame['education'] == education_level) & (frame['age'].isnull())
        frame.loc[needs_fill, 'age'] = fill_age

In [5]:
# Missing engagement ratings are replaced with 2.
# The filled column is assigned back to the frame: calling fillna(inplace=True)
# on a column selection is chained assignment, which does not reliably update
# the parent frame (and never does under pandas Copy-on-Write).
for frame in (train, test):
    frame['trainee_engagement_rating'] = frame['trainee_engagement_rating'].fillna(2)

### Feature Engineering

In [None]:
def _second_id_piece(program_id):
    """Return the second '_'-separated piece of a program id (as a string)."""
    return str(program_id).split('_')[1]

# program_number: the piece of program_id that follows the first underscore.
train['program_number'] = train['program_id'].map(_second_id_piece)
test['program_number'] = test['program_id'].map(_second_id_piece)

In [6]:
## Mean encoding of program_id: mean of is_pass per program, computed on train
## and left-joined onto both train and test.
maper = (
    train.groupby('program_id')['is_pass']
    .mean()
    .rename('program_id_mean')
    .reset_index()
)

train = train.merge(maper, how='left', on=['program_id'])
test = test.merge(maper, how='left', on=['program_id'])

In [7]:
## Rounded mean age of the passing rows (is_pass == 1) of each program id,
## looked up by program_id for both train and test.
passed_rows = train[train['is_pass'] == 1]
mapper = passed_rows.groupby('program_id')['age'].mean().round().astype('int')

train['avg_passing_age'] = train['program_id'].map(mapper)
test['avg_passing_age'] = test['program_id'].map(mapper)

In [8]:
## Mean of is_pass for every (program_id, city_tier) pair seen in train.
mapper = (
    train.groupby(['program_id', 'city_tier'])['is_pass']
    .mean()
    .rename('program_city_pass')
    .reset_index()
)

train = train.merge(mapper, how='left', on=['program_id', 'city_tier'])
test = test.merge(mapper, how='left', on=['program_id', 'city_tier'])

In [9]:
### Mean and max number of failed rows (is_pass == 0) per trainee, for each program id.

# Count failed rows per (program_id, trainee_id). size() with an explicit column
# name does not depend on how value_counts() names its result, which differs
# between pandas versions.
failed_rows = train[train['is_pass'] == 0]
temp = failed_rows.groupby(['program_id', 'trainee_id']).size().reset_index(name='attempts')

attempts_per_program = temp.groupby('program_id')['attempts']
mean = attempts_per_program.mean().round(2).rename('mean_attempts').reset_index()
maxi = attempts_per_program.max().round(2).rename('max_attempts').reset_index()

train = train.merge(mean, on = ['program_id'], how='left')
test = test.merge(mean, on = ['program_id'], how='left')

train = train.merge(maxi, on = ['program_id'], how='left')
test = test.merge(maxi, on = ['program_id'], how='left')

In [10]:
# Order rows by trainee and then by row id (ascending) so that per-trainee
# head/tail selections in later cells follow this order.
train = train.sort_values(['trainee_id', 'id'])

In [11]:
## Per program: mean is_pass over the first row of every (program_id, trainee_id)
## group, taken in the current row order of train.
def _first_row(results):
    """Keep only the first row of a group."""
    return results.head(1)

per_trainee_results = train.groupby(['program_id', 'trainee_id'])['is_pass']
temp = per_trainee_results.apply(_first_row).reset_index()
temp = temp.groupby('program_id')['is_pass'].mean().rename('first_attempt').reset_index()

train = train.merge(temp, how='left', on=['program_id'])
test = test.merge(temp, how='left', on=['program_id'])

In [12]:
## Mean engagement rating of each trainee, computed over train and test stacked together.
combi = pd.concat([train, test], axis=0)
temp = (
    combi.groupby('trainee_id')['trainee_engagement_rating']
    .mean()
    .rename('avg_rating')
    .reset_index()
)

train = train.merge(temp, how='left', on=['trainee_id'])
test = test.merge(temp, how='left', on=['trainee_id'])

### Preparing data

In [13]:
# Work on copies so the frames built by the feature cells stay untouched.
training, testing = train.copy(), test.copy()

In [14]:
###### one hot encoding
feat = ['program_type', 'education', 'program_number']

train_dummies = pd.get_dummies(training[feat], prefix = feat)
training = pd.concat([training, train_dummies], axis=1)

# Encode test separately, then align it to the training dummy columns:
# a category missing from test gets an all-zero column, and a category seen
# only in test is dropped, so both frames expose the same dummy columns.
dummies = pd.get_dummies(testing[feat], prefix = feat)
dummies = dummies.reindex(columns=train_dummies.columns, fill_value=0)
testing = pd.concat([testing, dummies], axis=1)

In [15]:
## Replace test_id by the mean of is_pass for that test_id in training.
mapper = training.groupby('test_id')['is_pass'].mean()

for frame in (training, testing):
    frame['test_id'] = frame['test_id'].map(mapper)

In [16]:
## Label encoding: integer codes fitted on the training strings and reused for test.
from sklearn.preprocessing import LabelEncoder

feat = ['test_type', 'gender', 'is_handicapped']
lb = LabelEncoder()

for col in feat:
    # The encoder is refitted for every column; test is transformed right after
    # the fit so it uses that column's classes.
    training[col] = lb.fit_transform(training[col].astype('str'))
    testing[col] = lb.transform(testing[col].astype('str'))

for col in feat:
    # Store the codes with object dtype in both frames.
    training[col] = training[col].astype('object')
    testing[col] = testing[col].astype('object')

In [17]:
## Ordinal encoding of difficulty_level (1 = easy ... 4 = 'vary hard').
difficulty_order = ['easy', 'intermediate', 'hard', 'vary hard']
dic = {level: rank for rank, level in enumerate(difficulty_order, start=1)}

for frame in (training, testing):
    frame['difficulty_level'] = frame['difficulty_level'].map(dic)

In [18]:
## Trainee-level features derived from the training labels.
## After every merge, all remaining NaNs in testing are replaced with -1.

## trainee_pass_sum: sum of is_pass over all training rows of the trainee
temp = training.groupby('trainee_id')['is_pass'].sum().rename('trainee_pass_sum').reset_index()

training = training.merge(temp, how='left', on=['trainee_id'])
testing = testing.merge(temp, how='left', on=['trainee_id'])
testing = testing.fillna(-1)


## last_result: is_pass of the trainee's last training row (current row order)
def _last_row(results):
    """Keep only the last row of a group."""
    return results.tail(1)

temp = training.groupby(['trainee_id'])['is_pass'].apply(_last_row).rename('last_result').reset_index()
# reset_index also exposes the original row index as 'level_1'; it is not needed.
temp = temp.drop('level_1', axis=1)

training = training.merge(temp, how='left', on=['trainee_id'])
testing = testing.merge(temp, how='left', on=['trainee_id'])
testing = testing.fillna(-1)


## trainee_attempts: number of training rows per (trainee_id, program_id)
temp = (
    training.groupby(['trainee_id', 'program_id'])['is_pass']
    .count()
    .rename('trainee_attempts')
    .reset_index()
)

training = training.merge(temp, how='left', on=['trainee_id', 'program_id'])
testing = testing.merge(temp, how='left', on=['trainee_id', 'program_id'])
testing = testing.fillna(-1)

In [19]:
## Mean of is_pass for every (program_type, education) pair in training.
temp = (
    training.groupby(['program_type', 'education'])['is_pass']
    .mean()
    .rename('type_education')
    .reset_index()
)

training = training.merge(temp, how='left', on=['program_type', 'education'])
testing = testing.merge(temp, how='left', on=['program_type', 'education'])

In [20]:
## %_program_cleared: the trainee's pass count divided by the largest
## per-trainee pass count observed for the program, then min-max scaled
## separately within training and within testing.
def _min_max(values):
    """Scale a Series to [0, 1] using its own minimum and maximum."""
    return (values - values.min()) / (values.max() - values.min())

passes_per_trainee = training.groupby(['program_id', 'trainee_id'])['is_pass'].sum()
temp = passes_per_trainee.groupby('program_id').max().reset_index()
temp = temp.rename(columns={'is_pass': 'total_tests'})

training = training.merge(temp, how='left', on='program_id')
testing = testing.merge(temp, how='left', on='program_id')

training['%_program_cleared'] = training['trainee_pass_sum'] / training['total_tests']
testing['%_program_cleared'] = testing['trainee_pass_sum'] / testing['total_tests']

training['%_program_cleared'] = _min_max(training['%_program_cleared'])
testing['%_program_cleared'] = _min_max(testing['%_program_cleared'])

In [21]:
# Persist the engineered frames; later sections reload them from these CSVs.
for frame, csv_path in ((training, 'train_df.csv'), (testing, 'test_df.csv')):
    frame.to_csv(csv_path, index=False)

In [23]:
# Model inputs: every column except identifiers, raw categoricals and the target.
non_feature_columns = ['id', 'program_id', 'program_type', 'trainee_id', 'education', 'is_pass']
features = training.drop(non_feature_columns, axis='columns').columns.tolist()

### Modelling

#### XGB

In [24]:
# Cast every object-typed feature column to int, first in training, then in testing.
for frame in (training, testing):
    for col in frame[features].columns:
        if frame[col].dtype == 'object':
            frame[col] = frame[col].astype('int')

In [26]:
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.metrics import  roc_auc_score,roc_auc_score
def _write_feature_map(feature_names, path='xgb.fmap'):
    """Write an XGBoost feature-map file: one tab-separated `index, name, q` line per feature."""
    with open(path, 'w') as fmap_file:
        for idx, name in enumerate(feature_names):
            fmap_file.write('{0}\t{1}\tq\n'.format(idx, name))


def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0,max_depth=10):
    """Train an XGBoost binary:logistic model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y : training features and labels (passed to ``xgb.DMatrix``).
    test_X : features to predict on.
    test_y : optional labels for ``test_X``. When given, training runs for up to
        5000 rounds with early stopping (500 rounds) on the test set and the
        ROC AUC of the predictions is printed and returned.
        When omitted, a fixed 2000 rounds are trained.
    feature_names : optional list of feature names. When given, the model dump
        ('xgbmodel.txt'), the feature map ('xgb.fmap') and the normalised
        f-scores ('imp_feat.txt') are written to the working directory.
    seed_val : value of the ``seed`` parameter.
    max_depth : value of the ``max_depth`` parameter.

    Returns
    -------
    ``pred_test_y`` when ``test_y`` is None, otherwise ``(pred_test_y, auc)``.
    """
    import operator  # local import: only the optional importance export needs it

    params = {}
    params["objective"] = "binary:logistic"
    params['eval_metric'] = 'auc'
    params["eta"] = 0.01 #0.00334
    params["min_child_weight"] = 1
    params["subsample"] = 0.8
    params["colsample_bytree"] = 0.3
    params["silent"] = 1
    params["max_depth"] = max_depth
    params["seed"] = seed_val
    #params["max_delta_step"] = 2
    params["verbose"] = 100
    num_rounds = 5000 #2500

    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst , xgtrain, num_rounds, watchlist, early_stopping_rounds= 500)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, 2000)

    if feature_names:
        # Export the feature map first so the dump and the f-scores can use the names.
        _write_feature_map(feature_names)
        model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True)
        importance = model.get_fscore(fmap='xgb.fmap')
        importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
        imp_df = pd.DataFrame(importance, columns=['feature','fscore'])
        imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
        imp_df.to_csv("imp_feat.txt", index=False)

    pred_test_y = model.predict(xgtest)

    if test_y is not None:
        loss = roc_auc_score(test_y, pred_test_y)
        print (loss)
        return (pred_test_y, loss)
    else:
        return (pred_test_y)

In [None]:
# Training target; `y` was used below without ever being defined.
y = training['is_pass']

# Same model at three tree depths; the predictions are averaged in the next cell.
pred1 = runXGB(training[features], y, testing[features],max_depth=10)
pred2 = runXGB(training[features], y, testing[features],max_depth=6)
pred3 = runXGB(training[features], y, testing[features],max_depth=12)

In [None]:
# Unweighted mean of the three depth variants, written as a submission file.
pred = sum((pred1, pred2, pred3)) / 3
sample['is_pass'] = pred
sample.to_csv('sub/xgb.csv', index=False)

#### XGB Model 2

In [27]:
# Reload the engineered frames written by the earlier to_csv cell.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train_df.csv', 'test_df.csv'))

In [28]:
# Model inputs: every column except identifiers, raw categoricals and the target.
non_feature_columns = ['id', 'program_id', 'program_type', 'trainee_id', 'education', 'is_pass']
features = train_df.drop(non_feature_columns, axis='columns').columns.tolist()

In [29]:
# Plain numpy views of the target and of the train/test feature matrices.
y_train = train_df['is_pass'].values
x_train = train_df[features].values
x_test = test_df[features].values

In [30]:
# Cross-validation constants and row counts for the reloaded frames.
SEED = 0
NFOLDS = 5

ntrain = train_df.shape[0]
ntest = test_df.shape[0]

# sklearn.cross_validation no longer exists; model_selection.KFold takes the
# number of splits and receives the data later through kf.split(X).
# No random_state is passed: the folds are not shuffled, so it had no effect.
from sklearn.model_selection import KFold
kf = KFold(n_splits=NFOLDS)

In [31]:
import xgboost as xgb
def runXGB(train_X, train_y, test_X, weight = 1):
    """Train a fixed-configuration XGBoost model for 2500 rounds and predict on ``test_X``.

    ``weight`` is used as the ``min_child_weight`` parameter.
    Returns the output of ``model.predict`` on ``test_X``.
    """
    booster_params = {
        "objective": "binary:logistic",
        "eta": 0.01,
        "min_child_weight": weight,
        "subsample": 0.8,
        "colsample_bytree": 0.7,
        "silent": 1,
        "max_depth": 10,
        "seed": 0,
        "eval_metric": "auc",
    }
    n_boost_rounds = 2500

    dtrain = xgb.DMatrix(train_X, label=train_y)
    dtest = xgb.DMatrix(test_X)
    booster = xgb.train(list(booster_params.items()), dtrain, n_boost_rounds)
    return booster.predict(dtest)

In [32]:
# Float feature matrices plus the target Series for model 2.
train_X = train_df[features].values.astype('float')
test_X = test_df[features].values.astype('float')

train_y = train_df['is_pass']

In [None]:
# Model 2: default min_child_weight; predictions go straight into a submission file.
print ("Running model 2..")
pred_test_y = runXGB(train_X, train_y, test_X)

sample['is_pass'] = pred_test_y
sample.to_csv(path_or_buf="sub/sub2.csv", index=False) #0.7937

In [None]:
# Same model with min_child_weight raised to 10.
print ("Running model 3..")
pred_test_y = runXGB(train_X, train_y, test_X, weight=10)

sample['is_pass'] = pred_test_y
sample.to_csv(path_or_buf="sub/sub2_1.csv", index=False) #0.7910

#### Model 3

In [2]:
# Start again from the raw competition files for this model.
print ("loading data...")
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train_HK6lq50.csv', 'test_2nAIblo.csv', 'sample_submission_vaSxamm.csv')
)

loading data...


In [3]:
def getCountVar(compute_df, count_df, var_name):
    """Count encoding of one column.

    For every row of ``compute_df``, return how many rows of ``count_df`` have
    the same value in column ``var_name``. Values that never occur in
    ``count_df`` (including missing values) get 0.

    Parameters
    ----------
    compute_df : DataFrame whose rows receive a count.
    count_df : DataFrame the counts are taken from.
    var_name : name of the column present in both frames.

    Returns
    -------
    list of int, one entry per row of ``compute_df``, in row order.
    """
    # One vectorised lookup instead of iterating over the frame row by row.
    occurrences = count_df[var_name].value_counts()
    encoded = compute_df[var_name].map(occurrences).fillna(0).astype(int)
    return encoded.tolist()

In [4]:
def getMeanVar(compute_df, count_df, var_name):
    """Mean (target) encoding of one column.

    For every row of ``compute_df``, return the mean of ``is_pass`` over the
    rows of ``count_df`` that share the same value in column ``var_name``.
    Values that never occur in ``count_df`` (including missing values) get 0.

    Parameters
    ----------
    compute_df : DataFrame whose rows receive an encoding.
    count_df : DataFrame providing ``var_name`` and the ``is_pass`` target.
    var_name : name of the column present in both frames.

    Returns
    -------
    list of float, one entry per row of ``compute_df``, in row order.
    """
    # One vectorised lookup instead of iterating over the frame row by row.
    group_means = count_df.groupby(var_name)["is_pass"].mean()
    keys = compute_df[var_name]
    # Only unseen keys fall back to 0; means of seen keys are kept as computed.
    encoded = keys.map(group_means).where(keys.isin(group_means.index), 0)
    return encoded.tolist()

In [5]:
def _second_id_piece(program_id):
    """Return the second '_'-separated piece of a program id (as a string)."""
    return str(program_id).split('_')[1]

# program_number: the piece of program_id that follows the first underscore.
train['program_number'] = train['program_id'].map(_second_id_piece)
test['program_number'] = test['program_id'].map(_second_id_piece)

In [6]:
print ("filling missing values")
# Age: both frames are filled with the mean age of train.
# Columns are assigned back instead of calling fillna(inplace=True) on a column
# selection, which is chained assignment and does not reliably update the frame.
train_age_mean = train['age'].mean()
train['age'] = train['age'].fillna(train_age_mean)
test['age'] = test['age'].fillna(train_age_mean)

# Engagement rating: missing values are replaced with 2.
for frame in (train, test):
    frame['trainee_engagement_rating'] = frame['trainee_engagement_rating'].fillna(2)

filling missing values


In [7]:
print ("getting count features")

# Count encoding: each frame is counted against itself (train on train, test on test).
count_encoded_columns = ["program_id", "program_type", "test_id", "trainee_id", "program_number"]

for col in count_encoded_columns:
    train[col + '_count'] = getCountVar(train, train, col)
    test[col + '_count'] = getCountVar(test, test, col)

getting count features


In [8]:
## Mean encoders: pass rate per category value, always computed from train.
mean_encoded_columns = [
    "program_id",
    "program_type",
    "test_id",
    "difficulty_level",
    "education",
    "city_tier",
    "program_number",
]

for col in mean_encoded_columns:
    train[col + '_mean'] = getMeanVar(train, train, col)
    test[col + '_mean'] = getMeanVar(test, train, col)

In [9]:
# Work on copies so the frames built by the feature cells stay untouched.
training, testing = train.copy(), test.copy()

In [10]:
###### one hot encoding
feat = ['program_type', 'education', 'program_number']

train_dummies = pd.get_dummies(training[feat], prefix = feat)
training = pd.concat([training, train_dummies], axis=1)

# Encode test separately, then align it to the training dummy columns:
# a category missing from test gets an all-zero column, and a category seen
# only in test is dropped, so both frames expose the same dummy columns.
dummies = pd.get_dummies(testing[feat], prefix = feat)
dummies = dummies.reindex(columns=train_dummies.columns, fill_value=0)
testing = pd.concat([testing, dummies], axis=1)

In [11]:
## Label encoding: integer codes fitted on the training strings and reused for test.
from sklearn.preprocessing import LabelEncoder

feat = ['test_type', 'gender', 'is_handicapped']
lb = LabelEncoder()

for col in feat:
    # The encoder is refitted for every column; test is transformed right after
    # the fit so it uses that column's classes.
    training[col] = lb.fit_transform(training[col].astype('str'))
    testing[col] = lb.transform(testing[col].astype('str'))

for col in feat:
    # Store the codes with object dtype in both frames.
    training[col] = training[col].astype('object')
    testing[col] = testing[col].astype('object')

In [12]:
## Ordinal encoding of difficulty_level (1 = easy ... 4 = 'vary hard').
difficulty_order = ['easy', 'intermediate', 'hard', 'vary hard']
dic = {level: rank for rank, level in enumerate(difficulty_order, start=1)}

for frame in (training, testing):
    frame['difficulty_level'] = frame['difficulty_level'].map(dic)

In [13]:
# Model inputs: every column except identifiers, raw categoricals, test_id and the target.
non_feature_columns = ['id', 'program_id', 'program_type', 'trainee_id', 'education', 'is_pass', 'test_id']
features = training.drop(non_feature_columns, axis='columns').columns.tolist()

In [14]:
# Replace every remaining NaN in testing with -1.
testing = testing.fillna(-1)

In [15]:
# Float feature matrices for the model.
train_X = training[features].values.astype('float')
test_X = testing[features].values.astype('float')

In [16]:
# Target column of the training frame.
train_y = training.loc[:, 'is_pass']

In [17]:
import xgboost as xgb
def runXGB(train_X, train_y, test_X):
    """Train a fixed-configuration XGBoost model for 2500 rounds and predict on ``test_X``.

    Returns the output of ``model.predict`` on ``test_X``.
    """
    booster_params = {
        "objective": "binary:logistic",
        "eta": 0.01,
        "min_child_weight": 1,
        "subsample": 0.8,
        "colsample_bytree": 0.7,
        "silent": 1,
        "max_depth": 10,
        "seed": 0,
        "eval_metric": "auc",
    }
    n_boost_rounds = 2500

    dtrain = xgb.DMatrix(train_X, label=train_y)
    dtest = xgb.DMatrix(test_X)
    booster = xgb.train(list(booster_params.items()), dtrain, n_boost_rounds)
    return booster.predict(dtest)



In [None]:
# Fit on the count/mean-encoded features and write the predictions as a submission.
print ("Running model..")
pred_test_y = runXGB(train_X, train_y, test_X)

sample['is_pass'] = pred_test_y
sample.to_csv(path_or_buf="sub/sub1.csv", index=False) #0.7499

#### Model 4: Stacking

In [33]:
# Reload the engineered frames and a fresh submission template for stacking.
train_df, test_df, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train_df.csv', 'test_df.csv', 'sample_submission_vaSxamm.csv')
)

In [34]:
def runLR(train_X, train_y, test_X, test_y=None, test_X2=None):
    """Fit a logistic regression (C=0.3) and predict class-1 probabilities.

    Parameters
    ----------
    train_X, train_y : training features and labels.
    test_X : first set to predict on (the validation fold in the stacking loop).
    test_y : optional labels for ``test_X``; when given, train and test ROC AUC
        are printed and the test AUC is returned.
    test_X2 : optional second set to predict on.

    Returns
    -------
    ``(test_preds, test_loss, test_preds2)`` where ``test_loss`` is the ROC AUC
    on ``test_X`` (0 when ``test_y`` is None) and ``test_preds2`` is None when
    ``test_X2`` is not supplied.
    """
    model = linear_model.LogisticRegression(fit_intercept=True, C=0.3)
    model.fit(train_X, train_y)
    print (model.coef_, model.intercept_)

    test_preds = model.predict_proba(test_X)[:,1]
    # test_X2 defaults to None; predicting on it unconditionally would raise.
    test_preds2 = model.predict_proba(test_X2)[:,1] if test_X2 is not None else None

    test_loss = 0
    if test_y is not None:
        # Training-set predictions are only needed for the printed train score.
        train_preds = model.predict_proba(train_X)[:,1]
        train_loss = metrics.roc_auc_score(train_y, train_preds)
        test_loss = metrics.roc_auc_score(test_y, test_preds)
        print ("Train and Test loss : ", train_loss, test_loss)
    return test_preds, test_loss, test_preds2

In [35]:
def runET(train_X, train_y, test_X, test_y=None, test_X2=None, depth=10, leaf=5, feat=0.3):
    """Fit an ExtraTreesClassifier (300 trees) and predict class-1 probabilities.

    Parameters
    ----------
    train_X, train_y : training features and labels.
    test_X : first set to predict on (the validation fold in the stacking loop).
    test_y : optional labels for ``test_X``; when given, the hyper-parameters and
        the train/test ROC AUC are printed and the test AUC is returned.
    test_X2 : optional second set to predict on.
    depth, leaf, feat : passed as ``max_depth``, ``min_samples_leaf`` and
        ``max_features``.

    Returns
    -------
    ``(test_preds, test_loss, test_preds2)`` where ``test_loss`` is the ROC AUC
    on ``test_X`` (0 when ``test_y`` is None) and ``test_preds2`` is None when
    ``test_X2`` is not supplied.
    """
    model = ensemble.ExtraTreesClassifier(
        n_estimators = 300,
        max_depth = depth,
        min_samples_split = 10,
        min_samples_leaf = leaf,
        max_features = feat,
        n_jobs = 6,
        random_state = 0)
    model.fit(train_X, train_y)

    test_preds = model.predict_proba(test_X)[:,1]
    # test_X2 defaults to None; predicting on it unconditionally would raise.
    test_preds2 = model.predict_proba(test_X2)[:,1] if test_X2 is not None else None

    test_loss = 0
    if test_y is not None:
        # Training-set predictions are only needed for the printed train score.
        train_preds = model.predict_proba(train_X)[:,1]
        train_loss = metrics.roc_auc_score(train_y, train_preds)
        test_loss = metrics.roc_auc_score(test_y, test_preds)
        print ("Depth, leaf, feat : ", depth, leaf, feat)
        print ("Train and Test loss : ", train_loss, test_loss)
    return test_preds, test_loss, test_preds2

In [36]:
import lightgbm as lgb
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=3, eta=0.001):
    """Train a LightGBM binary model and predict on one or two held-out sets.

    Parameters
    ----------
    train_X, train_y : training features and labels.
    test_X : first set to predict on (the validation fold in the stacking loop).
    test_y : optional labels for ``test_X``; when given, ``test_X`` is used as
        the validation set with early stopping (100 rounds) and the ROC AUC of
        the predictions is printed and returned.
    test_X2 : optional second set to predict on.
    feature_names : accepted for signature compatibility; not used.
    seed_val : value of ``bagging_seed``.
    rounds, dep, eta : number of boosting rounds, ``max_depth`` and
        ``learning_rate``.

    Returns
    -------
    ``(pred_test_y, loss, pred_test_y2)`` where ``loss`` is the ROC AUC on
    ``test_X`` (0 when ``test_y`` is None) and ``pred_test_y2`` is None when
    ``test_X2`` is not supplied.
    """
    params = {}
    params["objective"] = "binary"
    params['metric'] = 'auc'
    params["max_depth"] = dep
    params["min_data_in_leaf"] = 100
    params["learning_rate"] = eta
    params["bagging_fraction"] = 0.7
    params["feature_fraction"] = 0.7
    params["bagging_freq"] = 5
    params["bagging_seed"] = seed_val
    params["verbosity"] = -1
    num_rounds = rounds

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=100, verbose_eval=20)
    else:
        # No validation set: train for the full number of rounds.
        # (The previous lgb.DMatrix call here referenced a name lightgbm does not
        # provide, and its result was never used.)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    # test_X2 defaults to None; predicting on it unconditionally would raise.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)
        print (loss)
    return pred_test_y, loss, pred_test_y2

In [37]:
def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.001):
    """Train an XGBoost binary:logistic model and predict on one or two held-out sets.

    Parameters
    ----------
    train_X, train_y : training features and labels.
    test_X : first set to predict on (the validation fold in the stacking loop).
    test_y : optional labels for ``test_X``; when given, ``test_X`` is watched
        with early stopping (100 rounds) and the log loss of the predictions is
        printed and returned.
    test_X2 : optional second set to predict on.
    feature_names : accepted for signature compatibility; not used.
    seed_val : value of the ``seed`` parameter.
    rounds, dep, eta : number of boosting rounds, ``max_depth`` and ``eta``.

    Returns
    -------
    ``(pred_test_y, loss, pred_test_y2)`` where ``loss`` is the log loss on
    ``test_X`` (0 when ``test_y`` is None) and ``pred_test_y2`` is None when
    ``test_X2`` is not supplied.
    """
    params = {}
    params["objective"] = "binary:logistic"
    params['eval_metric'] = 'auc'
    params["eta"] = eta
    params["subsample"] = 0.7
    params["min_child_weight"] = 10
    params["colsample_bytree"] = 0.7
    params["max_depth"] = dep
    params["silent"] = 1
    params["seed"] = seed_val
    #params["max_delta_step"] = 2
    #params["gamma"] = 0.5
    num_rounds = rounds

    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)
    # test_X2 defaults to None; building a DMatrix from it unconditionally would raise.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(xgb.DMatrix(test_X2), ntree_limit=model.best_ntree_limit)

    loss = 0
    if test_y is not None:
        loss = metrics.log_loss(test_y, pred_test_y)
        print (loss)
    return pred_test_y, loss, pred_test_y2


In [38]:
# Feature frames and target for the stacking models: every column except
# identifiers, raw categoricals and the target.
non_feature_columns = ['id', 'program_id', 'program_type', 'trainee_id', 'education', 'is_pass']
cols_to_use = train_df.drop(non_feature_columns, axis='columns').columns.tolist()

train_X, test_X = train_df[cols_to_use], test_df[cols_to_use]
train_y = train_df['is_pass']

In [39]:
# Distinct program types in train, in order of first appearance; the CV folds are built over these.
distinct_program_types = train_df["program_type"].unique()
train_unique_programs = np.array(distinct_program_types)

In [40]:
from sklearn import metrics, model_selection

# Shuffled 5-fold splitter with a fixed seed, shared by all stacking loops.
kf = model_selection.KFold(
    n_splits=5,
    shuffle=True,
    random_state=98765,
)

In [None]:
# Out-of-fold XGBoost run. Folds are built over distinct program types, so all
# rows of one program type fall on the same side of a split.
print ("Model building..")
model_name = "XGB"
xgb_seeds = (0, 2018, 9876)  # 0 is runXGB's default seed_val
cv_scores = []
pred_test_full = 0
pred_val_full = np.zeros(train_df.shape[0])
for dev_index, val_index in kf.split(train_unique_programs):
    dev_camp = train_unique_programs[dev_index].tolist()
    val_camp = train_unique_programs[val_index].tolist()
    # Row masks for this fold, computed once and reused below.
    dev_mask = train_df['program_type'].isin(dev_camp)
    val_mask = train_df['program_type'].isin(val_camp)
    dev_X, val_X = train_X[dev_mask], train_X[val_mask]
    dev_y, val_y = train_y[dev_mask], train_y[val_mask]
    print (dev_X.shape, val_X.shape)

    # Same configuration under three seeds; validation and test predictions are averaged.
    seed_runs = [
        runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=seed)
        for seed in xgb_seeds
    ]
    pred_val = sum(run[0] for run in seed_runs) / float(len(xgb_seeds))
    pred_test = sum(run[2] for run in seed_runs) / float(len(xgb_seeds))

    pred_test_full += pred_test
    pred_val_full[val_mask] = pred_val
    # The fold score is always ROC AUC on the validation rows.
    loss = metrics.roc_auc_score(val_y, pred_val)
    cv_scores.append(loss)
    print (cv_scores)


# Average the test predictions over however many folds kf produced.
pred_test_full /= float(kf.get_n_splits())
print (np.mean(cv_scores), metrics.roc_auc_score(train_y, pred_val_full))

sample["is_pass"] = pred_test_full
sample.to_csv("sub/stack_xgb.csv", index=False)

In [None]:
# Out-of-fold LightGBM run. Folds are built over distinct program types, so all
# rows of one program type fall on the same side of a split.
print ("Model building..")
model_name = "LGB"
lgb_seeds = (0, 2018, 9876)  # 0 is runLGB's default seed_val
cv_scores = []
pred_test_full = 0
pred_val_full = np.zeros(train_df.shape[0])
for dev_index, val_index in kf.split(train_unique_programs):
    dev_camp = train_unique_programs[dev_index].tolist()
    val_camp = train_unique_programs[val_index].tolist()
    # Row masks for this fold, computed once and reused below.
    dev_mask = train_df['program_type'].isin(dev_camp)
    val_mask = train_df['program_type'].isin(val_camp)
    dev_X, val_X = train_X[dev_mask], train_X[val_mask]
    dev_y, val_y = train_y[dev_mask], train_y[val_mask]
    print (dev_X.shape, val_X.shape)

    # Same configuration under three seeds; validation and test predictions are averaged.
    seed_runs = [
        runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=seed)
        for seed in lgb_seeds
    ]
    pred_val = sum(run[0] for run in seed_runs) / float(len(lgb_seeds))
    pred_test = sum(run[2] for run in seed_runs) / float(len(lgb_seeds))

    pred_test_full += pred_test
    pred_val_full[val_mask] = pred_val
    # The fold score is always ROC AUC on the validation rows.
    loss = metrics.roc_auc_score(val_y, pred_val)
    cv_scores.append(loss)
    print (cv_scores)


# Average the test predictions over however many folds kf produced.
pred_test_full /= float(kf.get_n_splits())
print (np.mean(cv_scores), metrics.roc_auc_score(train_y, pred_val_full))

sample["is_pass"] = pred_test_full
sample.to_csv("sub/stack_lgb.csv", index=False)

In [None]:
from sklearn import linear_model

# Out-of-fold logistic-regression run. Folds are built over distinct program
# types, so all rows of one program type fall on the same side of a split.
print ("Model building..")
model_name = "LR"
cv_scores = []
pred_test_full = 0
pred_val_full = np.zeros(train_df.shape[0])
for dev_index, val_index in kf.split(train_unique_programs):
    dev_camp = train_unique_programs[dev_index].tolist()
    val_camp = train_unique_programs[val_index].tolist()
    # Row masks for this fold, computed once and reused below.
    dev_mask = train_df['program_type'].isin(dev_camp)
    val_mask = train_df['program_type'].isin(val_camp)
    dev_X, val_X = train_X[dev_mask], train_X[val_mask]
    dev_y, val_y = train_y[dev_mask], train_y[val_mask]
    print (dev_X.shape, val_X.shape)

    pred_val, loss, pred_test = runLR(dev_X, dev_y, val_X, val_y, test_X)

    pred_test_full += pred_test
    pred_val_full[val_mask] = pred_val
    # The fold score is always ROC AUC on the validation rows.
    loss = metrics.roc_auc_score(val_y, pred_val)
    cv_scores.append(loss)
    print (cv_scores)


# Average the test predictions over however many folds kf produced.
pred_test_full /= float(kf.get_n_splits())
print (np.mean(cv_scores), metrics.roc_auc_score(train_y, pred_val_full))

sample["is_pass"] = pred_test_full
sample.to_csv("sub/stack_lr.csv", index=False)

In [None]:
from sklearn import ensemble

# Out-of-fold extra-trees run. Folds are built over distinct program types, so
# all rows of one program type fall on the same side of a split.
print ("Model building..")
model_name = "ET"
cv_scores = []
pred_test_full = 0
pred_val_full = np.zeros(train_df.shape[0])
for dev_index, val_index in kf.split(train_unique_programs):
    dev_camp = train_unique_programs[dev_index].tolist()
    val_camp = train_unique_programs[val_index].tolist()
    # Row masks for this fold, computed once and reused below.
    dev_mask = train_df['program_type'].isin(dev_camp)
    val_mask = train_df['program_type'].isin(val_camp)
    dev_X, val_X = train_X[dev_mask], train_X[val_mask]
    dev_y, val_y = train_y[dev_mask], train_y[val_mask]
    print (dev_X.shape, val_X.shape)

    pred_val, loss, pred_test = runET(dev_X, dev_y, val_X, val_y, test_X, depth=20, leaf=20, feat=0.3)

    pred_test_full += pred_test
    pred_val_full[val_mask] = pred_val
    # The fold score is always ROC AUC on the validation rows.
    loss = metrics.roc_auc_score(val_y, pred_val)
    cv_scores.append(loss)
    print (cv_scores)


# Average the test predictions over however many folds kf produced.
pred_test_full /= float(kf.get_n_splits())
print (np.mean(cv_scores), metrics.roc_auc_score(train_y, pred_val_full))

sample["is_pass"] = pred_test_full
sample.to_csv("sub/stack_et.csv", index=False)

In [None]:
def _load_submission(stem):
    """Read one previously written submission file from the 'sub' folder."""
    return pd.read_csv('sub/' + stem + '.csv')

# Submissions produced by the earlier model cells.
sub2 = _load_submission('sub2')
sub_stack = _load_submission('stack_xgb')
sub1 = _load_submission('sub1')
sub_lr = _load_submission('stack_lr')
sub_et = _load_submission('stack_et')
sub_lgb = _load_submission('stack_lgb')
sub2_1 = _load_submission('sub2_1')
sub_xgb = _load_submission('xgb')

In [None]:
# Final blend: unweighted mean of the eight submissions, followed by a
# histogram of the blended probabilities.
blend_parts = (sub2, sub_stack, sub1, sub_lgb, sub_lr, sub_et, sub2_1, sub_xgb)
sample['is_pass'] = sum(part['is_pass'] for part in blend_parts) / 8
sample['is_pass'].hist()

In [None]:
# Write the blended submission.
sample.to_csv(path_or_buf="sub/avg._stacking.csv", index=False) ## 0.8161