In [1]:
import pandas as pd
import numpy as np

#model building
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

import warnings
warnings.filterwarnings('ignore')

import gc
gc.collect()

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,id,program_id,program_type,program_duration,test_id,test_type,difficulty_level,trainee_id,gender,education,city_tier,age,total_programs_enrolled,is_handicapped,trainee_engagement_rating,is_pass
0,9389_150,Y_1,Y,136,150,offline,intermediate,9389,M,Matriculation,3,24.0,5,N,1.0,0
1,16523_44,T_1,T,131,44,offline,easy,16523,F,High School Diploma,4,26.0,2,N,3.0,1
2,13987_178,Z_2,Z,120,178,online,easy,13987,M,Matriculation,1,40.0,1,N,2.0,1
3,13158_32,T_2,T,117,32,offline,easy,13158,F,Matriculation,3,,4,N,1.0,1
4,10591_84,V_3,V,131,84,offline,intermediate,10591,F,High School Diploma,1,42.0,2,N,4.0,1


In [4]:
test.head()

Unnamed: 0,id,program_id,program_type,program_duration,test_id,test_type,difficulty_level,trainee_id,gender,education,city_tier,age,total_programs_enrolled,is_handicapped,trainee_engagement_rating
0,1626_45,T_1,T,131,45,offline,intermediate,1626,F,Matriculation,3,46.0,2,N,4.0
1,11020_130,Y_3,Y,135,130,online,easy,11020,M,Bachelors,3,,4,N,4.0
2,12652_146,Y_2,Y,120,146,online,easy,12652,M,Matriculation,3,,2,N,3.0
3,7038_72,V_4,V,122,72,offline,vary hard,7038,F,High School Diploma,1,,2,N,2.0
4,888_71,V_4,V,122,71,offline,intermediate,888,F,Matriculation,3,,2,N,2.0


In [5]:
k = pd.DataFrame()
k["train"] = round(100*train.isna().sum()/len(train),2)
k["test"] = round(100*test.isna().sum()/len(test),2)
k.T

Unnamed: 0,id,program_id,program_type,program_duration,test_id,test_type,difficulty_level,trainee_id,gender,education,city_tier,age,total_programs_enrolled,is_handicapped,trainee_engagement_rating,is_pass
train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.91,0.0,0.0,0.11,0.0
test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.61,0.0,0.0,0.1,


In [6]:
train.shape,test.shape

((73147, 16), (31349, 15))

In [7]:
train.is_pass.value_counts(normalize=True)

1    0.695408
0    0.304592
Name: is_pass, dtype: float64

In [8]:
#check the distribution of variables in test and train
cols=['program_id','program_type','test_type','difficulty_level','education','gender','city_tier','is_handicapped']
for col in cols:
    k=pd.DataFrame()
    k[str(col)+'_train']= round(100*train[col].value_counts(normalize=True),2)
    k[str(col)+'_test']= round(100*train[col].value_counts(normalize=True),2)
    k.index.name=col
    print(k,"\n\n\n")

            program_id_train  program_id_test
program_id                                   
Y_1                     9.14             9.14
Y_3                     9.05             9.05
T_3                     8.54             8.54
Y_4                     6.85             6.85
T_4                     6.55             6.55
U_1                     6.49             6.49
V_4                     5.88             5.88
T_2                     5.83             5.83
Y_2                     5.80             5.80
V_3                     4.67             4.67
V_1                     4.66             4.66
T_1                     4.31             4.31
U_2                     4.20             4.20
Z_3                     3.64             3.64
Z_2                     2.87             2.87
V_2                     2.59             2.59
Z_1                     2.56             2.56
X_1                     1.84             1.84
X_3                     1.65             1.65
X_2                     1.06      

In [31]:
train[train['trainee_id']==9389]

Unnamed: 0,age,city_tier,difficulty_level,education,gender,id,is_handicapped,is_pass,program_duration,program_id,program_type,test_id,test_type,total_programs_enrolled,trainee_engagement_rating,trainee_id,total_prog_duration,program_id_passratio,program_type_passratio,city_tier_passratio,education_passratio,difficulty_level_passratio,test_id_passratio,trainee_id_passratio
0,24.0,3,intermediate,Matriculation,M,9389_150,N,0.0,136,Y_1,Y,150,offline,5,1.0,9389,680,7.1951,23.566243,24.515018,23.203959,18.503835,0.689023,0.0
49948,24.0,3,easy,Matriculation,M,9389_154,N,0.0,136,Y_1,Y,154,online,5,1.0,9389,680,7.1951,23.566243,24.515018,23.203959,40.414508,0.751911,0.0
64716,24.0,3,hard,Matriculation,M,9389_152,N,0.0,136,Y_1,Y,152,offline,5,1.0,9389,680,7.1951,23.566243,24.515018,23.203959,9.397515,0.489425,0.0


In [33]:
train.head()

Unnamed: 0,age,city_tier,difficulty_level,education,gender,id,is_handicapped,is_pass,program_duration,program_id,program_type,test_id,test_type,total_programs_enrolled,trainee_engagement_rating,trainee_id,total_prog_duration,program_id_passratio,program_type_passratio,city_tier_passratio,education_passratio,difficulty_level_passratio,test_id_passratio,trainee_id_passratio,total_prog_ids
0,24.0,3,intermediate,Matriculation,M,9389_150,N,0.0,136,Y_1,Y,150,offline,5,1.0,9389,680,7.1951,23.566243,24.515018,23.203959,18.503835,0.689023,0.0,1
1,26.0,4,easy,High School Diploma,F,16523_44,N,1.0,131,T_1,T,44,offline,2,3.0,16523,262,2.177806,17.631619,6.316049,33.58716,40.414508,0.418336,0.002734,1
2,40.0,1,easy,Matriculation,M,13987_178,N,1.0,120,Z_2,Z,178,online,1,2.0,13987,600,2.069805,6.532052,23.177984,23.203959,40.414508,0.187294,0.004101,1
3,,3,easy,Matriculation,F,13158_32,N,1.0,117,T_2,T,32,offline,4,1.0,13158,585,4.318701,17.631619,24.515018,23.203959,40.414508,0.434741,0.002734,1
4,42.0,1,intermediate,High School Diploma,F,10591_84,N,1.0,131,V_3,V,84,offline,2,4.0,10591,393,2.702777,10.432417,23.177984,33.58716,18.503835,0.282992,0.001367,1


In [32]:
train['type']='train'
test['type']='test'
data = pd.concat([train,test],ignore_index=True)
#data['trainee_engagement_rating']= data.groupby(['trainee_id'])['trainee_engagement_rating'].apply(lambda x: x.fillna(x.mean()))#more accurcy
data['total_prog_duration'] = data.groupby(['trainee_id'])['program_duration'].transform('sum')

train=data[data['type']=='train']
test=data[data['type']=='test']

train.drop(['type'],axis=1,inplace=True)
test.drop(['type'],axis=1,inplace=True)

In [11]:
temp = train.groupby(['program_id'])['is_pass'].agg('sum').reset_index()
temp.columns=['program_id','program_id_passratio']
temp['program_id_passratio'] = 100*temp['program_id_passratio']/train.shape[0]
train = pd.merge(train,temp,on=['program_id'],how='left')
test = pd.merge(test,temp,on=['program_id'],how='left')

temp = train.groupby(['program_type'])['is_pass'].agg('sum').reset_index()
temp.columns=['program_type','program_type_passratio']
temp['program_type_passratio'] = 100*temp['program_type_passratio']/train.shape[0]
train = pd.merge(train,temp,on=['program_type'],how='left')
test = pd.merge(test,temp,on=['program_type'],how='left')

temp = train.groupby(['city_tier'])['is_pass'].agg('sum').reset_index()
temp.columns=['city_tier','city_tier_passratio']
temp['city_tier_passratio'] = 100*temp['city_tier_passratio']/train.shape[0]
train = pd.merge(train,temp,on=['city_tier'],how='left')
test = pd.merge(test,temp,on=['city_tier'],how='left')

temp = train.groupby(['education'])['is_pass'].agg('sum').reset_index()
temp.columns=['education','education_passratio']
temp['education_passratio'] = 100*temp['education_passratio']/train.shape[0]
train = pd.merge(train,temp,on=['education'],how='left')
test = pd.merge(test,temp,on=['education'],how='left')

temp = train.groupby(['difficulty_level'])['is_pass'].agg('sum').reset_index()
temp.columns=['difficulty_level','difficulty_level_passratio']
temp['difficulty_level_passratio'] = 100*temp['difficulty_level_passratio']/train.shape[0]
train = pd.merge(train,temp,on=['difficulty_level'],how='left')
test = pd.merge(test,temp,on=['difficulty_level'],how='left')

temp = train.groupby(['test_id'])['is_pass'].agg('sum').reset_index()
temp.columns=['test_id','test_id_passratio']
temp['test_id_passratio'] = 100*temp['test_id_passratio']/train.shape[0]
train = pd.merge(train,temp,on=['test_id'],how='left')
test = pd.merge(test,temp,on=['test_id'],how='left')

temp = train.groupby(['trainee_id'])['is_pass'].agg('sum').reset_index()
temp.columns=['trainee_id','trainee_id_passratio']
temp['trainee_id_passratio'] = 100*temp['trainee_id_passratio']/train.shape[0]
train = pd.merge(train,temp,on=['trainee_id'],how='left')
test = pd.merge(test,temp,on=['trainee_id'],how='left')

In [34]:
log_train = train.copy()
log_test = test.copy()

In [35]:
cols=['program_id','program_type','test_type','gender',
      'difficulty_level', 'education','is_handicapped',
       'city_tier']

log_train = pd.get_dummies(log_train,columns=cols)
log_test = pd.get_dummies(log_test,columns=cols)

In [36]:
not_cols=['id','is_pass','trainee_id','test_id']
clf = LogisticRegression()
X = log_train[[col for col in log_train.columns if col not in not_cols]]
y = log_train[["is_pass"]]

log_test=log_test[X.columns]

skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=101)
cv_score = []
pred_test =np.zeros((len(log_test),1))
for train_index,test_index in skf.split(X,y):
    x_train,x_val = X.iloc[train_index],X.iloc[test_index]
    y_train,y_val = y.iloc[train_index],y.iloc[test_index]
    clf = lgb.LGBMClassifier(n_estimators=1000)
    clf.fit(x_train,y_train)
    score = round(roc_auc_score(y_val,clf.predict_proba(x_val)[:,1]),4)
    cv_score.append(score)
    print(score,end=",")
    
#     #predictions
    pred_test += clf.predict_proba(log_test)[:,1].reshape(-1,1)

pred_test=pred_test/10
print("\n",np.mean(cv_score))

0.8951,0.9044,0.8979,0.9019,0.8998,0.8989,0.905,0.9014,0.8994,0.8964,
 0.9000199999999999


In [37]:
test['is_pass']=pred_test
temp=test[['id','is_pass']]
temp['is_pass'] = round(temp['is_pass'],4)
temp.index=temp.id
temp.drop(['id'],axis=1,inplace=True)
temp.to_csv('try9.csv')
temp.head()

Unnamed: 0_level_0,is_pass
id,Unnamed: 1_level_1
1626_45,0.0
11020_130,0.9868
12652_146,0.3628
7038_72,0.0
888_71,0.4543


In [17]:
train.head()

Unnamed: 0,age,city_tier,difficulty_level,education,gender,id,is_handicapped,is_pass,program_duration,program_id,program_type,test_id,test_type,total_programs_enrolled,trainee_engagement_rating,trainee_id,total_prog_duration,program_id_passratio,program_type_passratio,city_tier_passratio,education_passratio,difficulty_level_passratio,test_id_passratio,trainee_id_passratio
0,24.0,3,intermediate,Matriculation,M,9389_150,N,0.0,136,Y_1,Y,150,offline,5,1.0,9389,680,7.1951,23.566243,24.515018,23.203959,18.503835,0.689023,0.0
1,26.0,4,easy,High School Diploma,F,16523_44,N,1.0,131,T_1,T,44,offline,2,3.0,16523,262,2.177806,17.631619,6.316049,33.58716,40.414508,0.418336,0.002734
2,40.0,1,easy,Matriculation,M,13987_178,N,1.0,120,Z_2,Z,178,online,1,2.0,13987,600,2.069805,6.532052,23.177984,23.203959,40.414508,0.187294,0.004101
3,,3,easy,Matriculation,F,13158_32,N,1.0,117,T_2,T,32,offline,4,1.0,13158,585,4.318701,17.631619,24.515018,23.203959,40.414508,0.434741,0.002734
4,42.0,1,intermediate,High School Diploma,F,10591_84,N,1.0,131,V_3,V,84,offline,2,4.0,10591,393,2.702777,10.432417,23.177984,33.58716,18.503835,0.282992,0.001367
