In [115]:
import pandas as pd
import numpy as np
# Use fully-qualified option names: the bare 'max_columns' pattern also matches
# 'styler.render.max_columns' on newer pandas, which makes set_option raise
# OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_colwidth', 5000)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Default size for every matplotlib figure drawn in this notebook.
plt.rcParams.update({'figure.figsize': (12, 8)})


# Show every column when a DataFrame is displayed (None removes the limit;
# this supersedes any max_columns value set earlier in the cell).
pd.options.display.max_columns = None
# Seed NumPy's global random generator for reproducibility.
np.random.seed(0)

In [4]:
# Load the training data, then prune columns in a single chained step.
train = pd.read_csv('train_HK6lq50.csv', low_memory=False)
half_count = len(train) / 2
train = (
    train
    .dropna(axis=1, thresh=half_count)    # drop any column with more than 50% missing values
    .drop(columns=['id', 'program_id'])   # these columns are not useful for our purposes
)


In [5]:
# Load the test data and drop the identifier columns, which are not useful
# for our purposes. No missing-value column filter is applied to this frame.
test = pd.read_csv('test_wF0Ps6O.csv', low_memory=False).drop(columns=['id', 'program_id'])


In [6]:
# Show (rows, columns) of both frames; train carries one extra column, the target `is_pass`.
print(train.shape,'\n',test.shape)

(73147, 14) 
 (31349, 13)


In [7]:
# List the column names that remain in the training frame.
print(train.columns.to_list())

['program_type', 'program_duration', 'test_id', 'test_type', 'difficulty_level', 'trainee_id', 'gender', 'education', 'city_tier', 'age', 'total_programs_enrolled', 'is_handicapped', 'trainee_engagement_rating', 'is_pass']


In [8]:
# Report the missing values of a dataset, overall and per column.
def missing_val(data):
    """Print the overall share of missing cells and return per-column counts.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    tuple
        A 2-tuple: the Series of missing-value counts per column, followed by
        a newline string (kept so existing callers see the same return shape).
    """
    missing_per_column = data.isnull().sum()
    # DataFrame.size is rows * columns; it replaces np.product(data.shape),
    # which was removed in NumPy 2.0.
    total_cells = data.size
    total_missing_value = missing_per_column.sum()
    # Percent of cells that are missing; an empty frame reports 0 instead of 0/0.
    if total_cells:
        percentage_missing_values = (total_missing_value / total_cells) * 100
    else:
        percentage_missing_values = 0.0
    print('====================================')
    print("Percentage of missing value in the dataset is :",("%.2f" % round(percentage_missing_values, 2)),'%')
    print('\n=====Missing Values per coloumn=====')
    return (missing_per_column,'\n')
    

In [9]:
# Report missing values in the training frame before any imputation.
missing_val(train)

Percentage of missing value in the dataset is : 2.72 %

=====Missing Values per coloumn=====


(program_type                     0
 program_duration                 0
 test_id                          0
 test_type                        0
 difficulty_level                 0
 trainee_id                       0
 gender                           0
 education                        0
 city_tier                        0
 age                          27729
 total_programs_enrolled          0
 is_handicapped                   0
 trainee_engagement_rating       77
 is_pass                          0
 dtype: int64, '\n')

In [10]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns only.
train.describe(include=[np.number])

Unnamed: 0,program_duration,test_id,trainee_id,city_tier,age,total_programs_enrolled,trainee_engagement_rating,is_pass
count,73147.0,73147.0,73147.0,73147.0,45418.0,73147.0,73070.0,73147.0
mean,128.208676,91.079306,9843.07962,2.246845,36.494033,2.583934,2.395634,0.695408
std,6.889679,51.239048,5703.556093,1.011587,9.041119,1.240816,1.325288,0.460238
min,117.0,0.0,1.0,1.0,17.0,1.0,1.0,0.0
25%,121.0,45.0,5040.0,1.0,28.0,2.0,1.0,0.0
50%,131.0,90.0,9641.0,2.0,39.0,2.0,2.0,1.0
75%,134.0,135.0,14552.0,3.0,45.0,3.0,4.0,1.0
max,136.0,187.0,20098.0,4.0,63.0,14.0,5.0,1.0


In [11]:
# Impute the missing values.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on a column selection: that chained in-place form
# is deprecated and leaves the frame unchanged under pandas Copy-on-Write.
# Series.bfill() replaces the deprecated fillna(method='bfill').
# The trailing ffill() only acts on NaNs at the very end of the column, which
# a backward fill cannot reach; it is a no-op otherwise.
train['age'] = train['age'].bfill().ffill()
train['trainee_engagement_rating'] = train['trainee_engagement_rating'].fillna(1.0)

In [12]:
# One-hot encode each categorical column of `train`. For every column one level
# is removed so that the remaining indicators are measured against it.
dummy1 = pd.get_dummies(train['program_type']).drop(columns='S')

dummy2 = pd.get_dummies(train['test_type']).drop(columns='online')

dummy3 = pd.get_dummies(train['difficulty_level']).drop(columns='vary hard')

dummy4 = pd.get_dummies(train['gender']).drop(columns='M')

dummy5 = pd.get_dummies(train['education']).drop(columns='Masters')

dummy6 = pd.get_dummies(train['is_handicapped']).drop(columns='Y')

In [13]:
# Gather the six indicator blocks side by side ...
dummy = pd.concat((dummy1, dummy2, dummy3, dummy4, dummy5, dummy6), axis='columns')

# ... and append them to the right of the training frame.
train = pd.concat((train, dummy), axis='columns')

In [14]:
# The original categorical columns are now represented by their indicator columns; remove them.
train = train.drop(
    columns=['program_type', 'test_type', 'difficulty_level', 'gender', 'education', 'is_handicapped']
)

In [15]:
# Shape of the training frame after encoding (still includes the target column).
train.shape

(73147, 24)

## Apply the same preprocessing to the test dataset so that its columns match the training data

In [16]:
# Shape of the test frame before imputation and encoding.
test.shape

(31349, 13)

In [17]:
# Impute the test set with the same rules used for the training set.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on a column selection: that chained in-place form
# is deprecated and leaves the frame unchanged under pandas Copy-on-Write.
# Series.bfill() replaces the deprecated fillna(method='bfill'); the trailing
# ffill() only acts on NaNs at the very end of the column, which a backward
# fill cannot reach.
test['age'] = test['age'].bfill().ffill()
test['trainee_engagement_rating'] = test['trainee_engagement_rating'].fillna(1.0)

In [18]:
# Confirm that no missing values remain in the test frame after imputation.
missing_val(test)

Percentage of missing value in the dataset is : 0.00 %

=====Missing Values per coloumn=====


(program_type                 0
 program_duration             0
 test_id                      0
 test_type                    0
 difficulty_level             0
 trainee_id                   0
 gender                       0
 education                    0
 city_tier                    0
 age                          0
 total_programs_enrolled      0
 is_handicapped               0
 trainee_engagement_rating    0
 dtype: int64, '\n')

In [19]:
# One-hot encode each categorical column of `test`, removing the same level per
# column as was removed for the training frame.
dummy1 = pd.get_dummies(test['program_type']).drop(columns='S')

dummy2 = pd.get_dummies(test['test_type']).drop(columns='online')

dummy3 = pd.get_dummies(test['difficulty_level']).drop(columns='vary hard')

dummy4 = pd.get_dummies(test['gender']).drop(columns='M')

dummy5 = pd.get_dummies(test['education']).drop(columns='Masters')

dummy6 = pd.get_dummies(test['is_handicapped']).drop(columns='Y')

In [20]:
# Gather the six indicator blocks side by side ...
dummy = pd.concat((dummy1, dummy2, dummy3, dummy4, dummy5, dummy6), axis='columns')

# ... and append them to the right of the test frame.
test = pd.concat((test, dummy), axis='columns')

In [21]:
# The original categorical columns are now represented by their indicator columns; remove them.
test = test.drop(
    columns=['program_type', 'test_type', 'difficulty_level', 'gender', 'education', 'is_handicapped']
)

In [22]:
# Shape of the test frame after encoding; it has no target column.
test.shape

(31349, 23)

## Let us build the model 

In [64]:
# Pairwise correlations between the feature columns.
# NOTE(review): `X` is only created further down (`X = train.copy()`), so this
# cell raises NameError on a fresh top-to-bottom run; it has to be executed
# after that cell.
X.corr()

Unnamed: 0,program_duration,test_id,trainee_id,city_tier,age,total_programs_enrolled,trainee_engagement_rating,T,U,V,X,Y,Z,offline,easy,hard,intermediate,F,Bachelors,High School Diploma,Matriculation,No Qualification,N
program_duration,1.0,0.204855,0.148651,-0.022368,-0.011587,-0.010128,0.098739,-0.192647,0.054055,-0.075177,0.11066,0.112389,0.01481,0.071348,-0.054852,0.145995,-0.057659,-0.113637,0.013227,-0.006534,0.00205,-0.024284,0.009341
test_id,0.204855,1.0,0.148945,-0.001891,-0.018409,-0.134198,0.323929,-0.70727,-0.227115,-0.065521,0.065901,0.595847,0.502388,-0.189007,0.141249,0.156889,-0.261721,-0.284986,-0.060095,-0.032159,0.080843,0.001125,-0.027875
trainee_id,0.148651,0.148945,1.0,-0.01019,0.069583,-0.07463,0.031083,-0.057572,0.038668,-0.053997,0.0448,0.008027,0.088112,0.027435,0.005548,0.062512,-0.058477,-0.022971,0.036559,-0.030984,0.005436,0.004801,0.017874
city_tier,-0.022368,-0.001891,-0.01019,1.0,-0.044009,0.016746,-0.052688,0.063962,-0.040392,-0.026112,-0.031291,-0.009564,0.038399,-0.011905,0.025615,-0.023306,-0.003146,0.069017,-0.062149,-0.022944,0.070832,0.063279,-0.060323
age,-0.011587,-0.018409,0.069583,-0.044009,1.0,-0.044017,0.114241,0.028079,-0.003108,-0.032194,-0.041276,-0.006033,0.025113,-0.031583,0.021373,-0.018228,-0.007692,0.016918,0.097556,-0.0779,-7.2e-05,-0.015095,0.02003
total_programs_enrolled,-0.010128,-0.134198,-0.07463,0.016746,-0.044017,1.0,0.083631,0.063333,-0.027738,0.04266,-0.071499,0.173614,-0.357196,0.017638,-0.073066,0.02079,0.064855,-0.059886,0.015105,0.055932,-0.067094,0.001564,-0.029886
trainee_engagement_rating,0.098739,0.323929,0.031083,-0.052688,0.114241,0.083631,1.0,-0.340605,-0.029033,-0.13031,0.024562,0.586097,-0.241097,-0.11338,-0.047203,0.172714,-0.075525,-0.267887,0.062283,-0.015002,-0.032621,-0.013781,0.03408
T,-0.192647,-0.70727,-0.057572,0.063962,0.028079,0.063333,-0.340605,1.0,-0.201016,-0.270405,-0.126837,-0.387965,-0.183399,0.034775,0.001699,-0.132795,0.137407,0.488979,-0.037279,-0.006888,0.03759,0.01556,-1.3e-05
U,0.054055,-0.227115,0.038668,-0.040392,-0.003108,-0.027738,-0.029033,-0.201016,1.0,-0.16105,-0.075543,-0.231067,-0.10923,-0.072079,0.061902,-0.140052,-0.01728,-0.151336,0.091106,0.017285,-0.095991,-0.007922,0.0222
V,-0.075177,-0.065521,-0.053997,-0.026112,-0.032194,0.04266,-0.13031,-0.270405,-0.16105,1.0,-0.101619,-0.31083,-0.146935,0.223106,-0.103947,-0.051226,0.082443,-0.056645,0.032189,0.033701,-0.058583,-0.007925,-0.013261


In [65]:
# Remove three indicator columns from the feature matrix (in place).
# NOTE(review): depends on `X`, which is defined in a later cell, so this fails
# on a fresh top-to-bottom run. The same three columns are dropped from `test`
# further down; both drops are needed for the frames to keep matching columns.
X.drop(['High School Diploma','Matriculation','N'],axis=1,inplace=True)

In [23]:
# Separate the dependent variable (target) from the independent variables.
# pop() removes `is_pass` from `train` in place, so re-running this cell
# without rebuilding `train` raises KeyError.
y = train.pop('is_pass')

In [24]:
# Feature matrix: an independent copy of `train` (the target was already popped out).
X = train.copy()

In [25]:
# Check that features and target have the same number of rows.
print(X.shape,y.shape)

(73147, 23) (73147,)


In [26]:
# List the feature names that go into the models.
print(X.columns.tolist())


['program_duration', 'test_id', 'trainee_id', 'city_tier', 'age', 'total_programs_enrolled', 'trainee_engagement_rating', 'T', 'U', 'V', 'X', 'Y', 'Z', 'offline', 'easy', 'hard', 'intermediate', 'F', 'Bachelors', 'High School Diploma', 'Matriculation', 'No Qualification', 'N']


In [66]:
from imblearn.combine import SMOTETomek

In [67]:
# Rebalance the classes with SMOTETomek (fixed seed for repeatable resampling).
smk = SMOTETomek(random_state=42)
# fit_resample is the supported method name; the older fit_sample alias was
# deprecated and later removed from imbalanced-learn.
X_res, y_res = smk.fit_resample(X, y)

In [68]:
# Shapes of the resampled features and target.
X_res.shape,y_res.shape


((92880, 20), (92880,))

### Logistic Regression

In [41]:
from sklearn.linear_model import LogisticRegression
# L2-penalised logistic regression with C=10, a fixed seed, and a raised
# iteration cap (max_iter=1000).
logisticRegr = LogisticRegression(penalty = 'l2', C = 10,random_state = 0,max_iter=1000)

In [42]:
# Train on the resampled data.
logisticRegr.fit(X_res, y_res)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
# Predict `is_pass` for the test features with the fitted logistic regression.
# `test` must carry the same feature columns the model was trained on.
predictions  = logisticRegr.predict(test)

 ### Random Forest 

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# NOTE(review): bare dict literal in a cell that was never executed. It is not
# assigned to anything, so it has no effect on the models below. Presumably a
# record of hyperparameters from an earlier search — TODO confirm or remove.
{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 1,
 'min_samples_leaf': 3,
 'n_estimators': 300}

In [77]:
# Random forest with explicit hyperparameters: 1000 gini trees, depth capped at
# 10, 40% of the features per split, at least 200 samples per leaf, out-of-bag
# scoring enabled, all cores used, fixed seed.
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='gini',
    max_depth=10,
    max_features=0.4,
    min_samples_leaf=200,
    oob_score=True,
    n_jobs=-1,
    random_state=50,
)

In [57]:
# Plain 500-tree random forest with otherwise default settings.
# NOTE(review): this rebinds `model`. On a top-to-bottom run it replaces the
# configuration assigned in the preceding cell before anything is fitted.
model = RandomForestClassifier(criterion='gini',n_estimators=500)

In [78]:
%%timeit
model.fit(X_res, y_res)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features=0.4,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=200, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=True, random_state=50, verbose=0,
                       warm_start=False)

In [73]:
# Report how many trees the current `model` is configured with.
print('Number of Trees used : ', model.n_estimators)

Number of Trees used :  500


In [72]:
# Drop from `test` the same three indicator columns that are removed from `X`,
# so the test frame matches the features the model was fitted on.
# In-place and not idempotent: a second run raises KeyError.
test.drop(['High School Diploma','Matriculation','N'],axis=1,inplace=True)

In [79]:
%%time
# Predict `is_pass` for the test features with the fitted forest (timed once).
predictions = model.predict(test)

Wall time: 1.04 s


## Next Attempt

In [246]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [244]:
# Base estimator and the candidate hyperparameter values to search over.
model = RandomForestClassifier()
n_estimators, max_features = [10, 100, 1000], ['sqrt', 'log2']

In [None]:
# Exhaustive search over the candidate grid, scored by accuracy with
# 10-fold stratified cross-validation repeated 3 times.
grid = {'n_estimators': n_estimators, 'max_features': max_features}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,  # a failed fit is scored 0 instead of raising
)
grid_result = grid_search.fit(X, y)

## Applying other models

In [95]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with 100 estimators of depth 5; rebinds `model`.
model = GradientBoostingClassifier(n_estimators=100,max_depth=5)

In [101]:
# NOTE(review): fitted on the original X, y — not on the resampled X_res, y_res
# that the other models in this notebook are trained on. Confirm this is intended.
model.fit(X,y)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=5,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [102]:
# Predict `is_pass` for the test features with the fitted gradient-boosting model.
predictions = model.predict(test)

In [107]:
from xgboost import XGBClassifier
# XGBoost classifier with library-default hyperparameters; rebinds `model`.
model = XGBClassifier()


In [108]:
# Train on the resampled data.
model.fit(X_res,y_res)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [109]:
# Predict `is_pass` for the test features; these predictions feed the submission file below.
predictions = model.predict(test)

## Build the submission file

In [44]:
# Re-read the raw test file: the `id` column was dropped from `test` earlier
# and is needed again for the submission.
test_main  = pd.read_csv("test_wF0Ps6O.csv")
test_main.head()

Unnamed: 0,id,program_id,program_type,program_duration,test_id,test_type,difficulty_level,trainee_id,gender,education,city_tier,age,total_programs_enrolled,is_handicapped,trainee_engagement_rating
0,1626_45,T_1,T,131,45,offline,intermediate,1626,F,Matriculation,3,46.0,2,N,4.0
1,11020_130,Y_3,Y,135,130,online,easy,11020,M,Bachelors,3,,4,N,4.0
2,12652_146,Y_2,Y,120,146,online,easy,12652,M,Matriculation,3,,2,N,3.0
3,7038_72,V_4,V,122,72,offline,vary hard,7038,F,High School Diploma,1,,2,N,2.0
4,888_71,V_4,V,122,71,offline,intermediate,888,F,Matriculation,3,,2,N,2.0


In [45]:
# Pull the `id` column out of the raw test frame and keep it as a one-column DataFrame.
test_id = test_main.pop('id').to_frame()

In [110]:
# Wrap the predictions in a one-column DataFrame named after the target.
# `predictions` is rebound here from the model output to a DataFrame.
predictions  = pd.DataFrame(predictions,columns=['is_pass'])
predictions.head()

Unnamed: 0,is_pass
0,1
1,1
2,1
3,0
4,1


In [111]:
# Put ids and predictions side by side. concat aligns rows on the index, so
# both frames are assumed to share the same default 0..n-1 index — TODO confirm.
submission = pd.concat([test_id,predictions],axis=1)

In [112]:
# Write the submission without the index column.
submission.to_csv('submission_16.csv',index=False)