In [3]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
 
 
def change_obj_to_category(df):
    """Convert object- and bool-typed columns of ``df`` to ``category`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Every column whose dtype compares equal to
        ``'object'`` or ``'bool'`` is reassigned as ``category``; all other
        columns are left untouched.

    Returns
    -------
    None
        The frame is mutated; nothing is returned.
    """
    for col_name in df.columns:
        current_dtype = df.dtypes[col_name]
        if current_dtype == 'object' or current_dtype == 'bool':
            df[col_name] = df[col_name].astype('category')
        
def change_int_to_float(df):
    """Convert every integer-typed column of ``df`` to ``float64`` in place.

    Integer columns are detected with ``pd.api.types.is_integer_dtype``
    rather than by looking for the substring ``'int'`` in the dtype name.
    The substring test also matched ``interval[...]`` columns, which then
    failed inside ``astype('float64')``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Signed and unsigned integer columns (including
        pandas' nullable integer dtypes) are reassigned as ``float64``;
        all other columns are left untouched.

    Returns
    -------
    None
        The frame is mutated; nothing is returned.
    """
    for col_name in df.columns:
        if pd.api.types.is_integer_dtype(df.dtypes[col_name]):
            df[col_name] = df[col_name].astype('float64')
       
def generateXY(df,target_col,var_list):
    """Build model inputs and targets from selected columns of ``df``.

    Rows with a missing value in any of ``var_list`` or in ``target_col``
    are dropped first. Each column whose dtype is ``category`` is expanded
    with ``pd.get_dummies`` (using ``<column>_`` as prefix); every other
    column is passed through unchanged. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing ``target_col`` and every name in ``var_list``.
    target_col : str
        Name of the target column.
    var_list : sequence of str
        Feature column names, in the order they should appear in ``X``.

    Returns
    -------
    X : list of list
        One inner list of feature values per retained row.
    Y : list
        Target values for the same rows, in the same order.
    """
    var_list = list(var_list)

    # Selecting with a list yields a new frame, so dropping rows here never
    # touches the caller's data.
    mdata = df[var_list + [target_col]].dropna()
    Y = mdata[target_col].tolist()

    # Collect the per-column pieces and concatenate once at the end instead
    # of growing a DataFrame inside the loop.
    pieces = []
    for col in var_list:
        if str(mdata[col].dtype) == 'category':
            pieces.append(pd.get_dummies(mdata[col], prefix=col + '_'))
        else:
            pieces.append(mdata[col])

    # pd.concat rejects an empty list; fall back to an empty frame so an
    # empty var_list still produces X == [].
    Xvars = pd.concat(pieces, axis=1) if pieces else pd.DataFrame()
    X = Xvars.values.tolist()

    return X,Y
        
        
def roc_score_model(model,X,Y):
    """Return the ROC AUC of ``model`` on ``X`` against the true labels ``Y``.

    The score passed to ``roc_auc_score`` is column 1 of
    ``model.predict_proba(X)``.
    """
    from sklearn.metrics import roc_auc_score

    proba_table = pd.DataFrame(model.predict_proba(X))
    positive_scores = proba_table[1].tolist()
    return roc_auc_score(Y, positive_scores)
 
def feature_imp(colNames,imps):
    """Return a feature/importance table sorted by descending importance.

    The table is built in one step from the inputs. The earlier
    implementation filled an empty frame cell by cell through chained
    indexing (``df['Feature'][i] = ...``), which pandas does not guarantee
    to write through and which never updates the frame under Copy-on-Write.

    Parameters
    ----------
    colNames : iterable of str
        Feature names.
    imps : indexable
        Importance values; ``imps[i]`` belongs to the i-th name. Only the
        first ``len(colNames)`` entries are read.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Importance``. The index holds each
        feature's original position; rows are ordered from the largest
        importance to the smallest.
    """
    names = list(colNames)
    # Index explicitly (rather than zip) so a too-short ``imps`` still
    # raises IndexError instead of silently dropping features.
    importances = [imps[pos] for pos in range(len(names))]

    table = pd.DataFrame(
        {'Feature': names, 'Importance': importances},
        columns=['Feature', 'Importance'],
    )
    return table.sort_values(by='Importance', ascending=False)
 
def predict_th(model,tx,ty,threshold=0.5):
    """Turn class-1 probabilities into 0/1 labels using ``threshold``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict_proba``.
    tx : array-like
        Inputs handed to ``model.predict_proba``.
    ty : sized
        Only its length is used: one prediction is produced per element.
    threshold : float, default 0.5
        A row is labelled 1 when its class-1 probability is strictly
        greater than this value, otherwise 0.

    Returns
    -------
    list of int
        ``len(ty)`` labels, each 0 or 1.
    """
    positive_probs = model.predict_proba(tx)[:, 1].tolist()
    return [1 if positive_probs[row] > threshold else 0 for row in range(len(ty))]
 
 
# ---- Load the modelling dataset and prune unusable columns ----
print('Starting data read from csv file.')
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
match_data = pd.read_csv('/Users/ashokvardhan/Downloads/Divorce_dataset/modeling_dataset.csv',sep=',')

# Columns removed up front (ids, weights, dates, ...).
cols_to_delete = [
    'CASEID_NEW',
    'weight1', 'weight2', 'weight3', 'weight4', 'weight5', 'weight6', 'weight7',
    'weight_couples_coresident',
    'members_lt_2', 'members_13_to_17', 'members_gt_18', 'members_2_to_5', 'members_6_to_12',
    'region_4', 'region_9',
    'bg_survey_date', 'next_next_year', 'intro_summary_codes', 'target_survey_dt', 'survey_date',
]
# Second group of columns removed; the name says they are single-valued.
single_valued_columns = [
    'same_gender_partner',
    'partners_religion_at_16', 'religion_at_16',
    'has_domestic_partnership', 'has_civil_union', 'no_domestic_partnership_or_civil_union',
    'attended_same_college', 'how_met_online', 'partner_gender',
    'lived_together_before_married',
    'time_dating_until_married', 'time_met_until_married', 'time_met_until_dating',
    'marriage_count_combined',
    'zip_pct_white', 'zip_pct_black', 'zip_pct_hispanic',
    'zip_median_income', 'zip_pct_foreign_born', 'zip_rural',
]

print('Deleting columns containing huge missing values, Single valued items and id columns which are of no use.')
print('No data leakage is present as such.')
for unwanted_group in (cols_to_delete, single_valued_columns):
    match_data = match_data.drop(unwanted_group, axis=1)

# The target is stored as a categorical column.
match_data['broke_up'] = match_data['broke_up'].astype('category')
 
# ---- Impute missing values: mode for non-numeric columns, median for numeric ones ----
print('Changing the type of columns from object to category.')
change_obj_to_category(match_data)

numeric_cols = [
    'household_size',
    'age',
    'children_in_hh',
    'age_when_relationship_started',
    'time_since_met',
    'time_since_romantic',
    'time_since_cohab',
    'time_since_first_relationship',
    'income',
]

print('Separating numeric data from the data set to perform numeric imputation on missing values.')
# Pull the numeric columns out so the two groups can be imputed differently.
numeric_data = match_data[numeric_cols]
match_data = match_data.drop(numeric_cols, axis=1)


def _fill_with_most_frequent(column):
    """Replace missing entries of ``column`` with its most frequent value."""
    most_frequent = column.value_counts().index[0]
    return column.fillna(most_frequent)


# NOTE(review): both fill values (mode here, median below) are computed over
# every row, before the train/hold-out split performed later in this script.
match_data = match_data.apply(_fill_with_most_frequent)
print('Imputed categorical data with the mode of the respective column.')

numeric_medians = numeric_data.median()
numeric_data = numeric_data.fillna(numeric_medians)
print('Imputed numeric columns to its median value.')

# Re-attach the imputed numeric columns to the main data set.
match_data = match_data.join(numeric_data)
 
# ---- Split into train / hold-out, scale numeric inputs, build model matrices ----
print('Setting aside 20% of the data as hold_out.')
# train_size=0.8 keeps 80% of the rows for training; the rest become the
# hold-out (test) set. The fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
train_data, hold_out = train_test_split(match_data, train_size = 0.8,random_state=2135)

print('Using standard scaler of sklearn library to Standardize and scale the numeric inputs.')
# The scaler is fitted on the training rows only and then applied unchanged
# to the hold-out rows.
# NOTE(review): these assignments write into the frames returned by
# train_test_split; the chained-assignment warning is disabled at the top of
# this cell, so pandas will not report it if they are treated as copies.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_data[numeric_cols] = scaler.fit_transform(train_data[numeric_cols])
hold_out[numeric_cols] = scaler.transform(hold_out[numeric_cols])


print('\nFinding Optimal parameters for 4 different models using GridSearch.')
target_col = 'broke_up'
# Every remaining column except the target is used as a predictor.
varToUse = train_data.columns.tolist()
varToUse.remove('broke_up')
# generateXY is called separately on each split.
# NOTE(review): both calls must produce the same dummy columns in the same
# order; presumably this holds because the category dtypes were assigned
# before the split. Verify.
Train_X,Train_Y = generateXY(train_data,target_col,varToUse)
Test_X,Test_Y = generateXY(hold_out,target_col,varToUse)
 
 
# ---- Hyper-parameter search for four candidate models ----
print('Optimal Parameters for: ')
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score  # used by the cross-validation step further down
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


def _build_grid_search(model, param_grid):
    """Return an unfitted 3-fold, ROC-AUC-scored grid search for ``model``.

    ``model`` is placed behind a ``StandardScaler`` in a two-step pipeline
    whose steps are named ``'standardize'`` and ``'model'``, so the keys of
    ``param_grid`` must carry the ``model__`` prefix.
    """
    search_pipeline = Pipeline(steps=[('standardize', preprocessing.StandardScaler()),
                                      ('model', model)])
    return GridSearchCV(estimator=search_pipeline,
                        cv=3,
                        param_grid=param_grid,
                        scoring='roc_auc',
                        verbose=0)


# The pipeline standardizes every feature column (dummy columns included);
# the numeric columns were already scaled once earlier in this script.

#Regularized Logistic Regression
# NOTE(review): no `loss` is passed, so SGDClassifier uses its library
# default. scikit-learn documents that default as 'hinge' (a linear SVM),
# which does not match the "Logistic Regression" label. Confirm the intent.
sgd = SGDClassifier(random_state=2135)
optimized_sgd = _build_grid_search(sgd, dict(model__alpha = [0.0001,0.0002,0.0003,0.0004]))
sgdgc = optimized_sgd.fit(Train_X,Train_Y)
print('1. Regularized Logistic Regression: ' + str (sgdgc.best_params_))

#Decision Tree
dt = DecisionTreeClassifier(random_state=2135)
optimized_dt = _build_grid_search(dt, dict(model__max_depth =  [5,6,7], model__max_features=[20,25,30]))
dtgc = optimized_dt.fit(Train_X,Train_Y)
print('2. Decision Tree: ' + str (dtgc.best_params_))

#Random Forest
rf = RandomForestClassifier(random_state=2135)
optimized_rf = _build_grid_search(rf, dict(model__max_depth =  [5,6], model__max_features=[5,10],model__n_estimators=[40,50]))
rfgc = optimized_rf.fit(Train_X,Train_Y)
print('3. Random Forest: ' + str (rfgc.best_params_))


#Gradient Boosting Classifier
gbc = GradientBoostingClassifier(random_state=2135)
optimized_gbc = _build_grid_search(gbc, dict(model__max_depth =  [2,3], model__max_features=[20,25],model__n_estimators=[60,70]))
gbcgc = optimized_gbc.fit(Train_X,Train_Y)
print('4. Gradient Boosting Classifier: ' + str (gbcgc.best_params_))

# Keep the module-level name `pipeline` bound to the last pipeline built,
# as it was before the construction moved into the helper.
pipeline = optimized_gbc.estimator
 
 
 
# ---- 3-fold cross-validation of each model at its grid-search optimum ----
print('\nPerforming Cross validation for 4 different models using optimal parameters obtained from GridSearch.')
print('Cross Validation Scores for: ')
# NOTE(review): the estimators below are scored on their own, without the
# 'standardize' pipeline step that the grid search wrapped around them.

# 1. SGD classifier
best_alpha = float(sgdgc.best_params_['model__alpha'])
sgd_tuned = SGDClassifier(random_state=2135, alpha=best_alpha)
sgd_cvscore = cross_val_score(sgd_tuned, Train_X, Train_Y, cv=3, scoring='roc_auc')
print ('1. Regularized Logistic Regression: %f' %sgd_cvscore.mean())

# 2. Decision tree
dt_best = dtgc.best_params_
best_depth = int(dt_best['model__max_depth'])
best_features = int(dt_best['model__max_features'])
dt_tuned = DecisionTreeClassifier(random_state=2135,
                                  max_depth=best_depth,
                                  max_features=best_features)
dtgc_cvscore = cross_val_score(dt_tuned, Train_X, Train_Y, cv=3, scoring='roc_auc')
print ('2. Decision Tree: %f' %dtgc_cvscore.mean())

# 3. Random forest
rf_best = rfgc.best_params_
best_depthrf = int(rf_best['model__max_depth'])
best_featuresrf = int(rf_best['model__max_features'])
best_estimatorsrf = int(rf_best['model__n_estimators'])
rf_tuned = RandomForestClassifier(random_state=2135,
                                  n_estimators=best_estimatorsrf,
                                  max_depth=best_depthrf,
                                  max_features=best_featuresrf)
rfgc_cvscore = cross_val_score(rf_tuned, Train_X, Train_Y, cv=3, scoring='roc_auc')
print ('3. Random Forest: %f' %rfgc_cvscore.mean())

# 4. Gradient boosting
gbc_best = gbcgc.best_params_
best_depthgbc = int(gbc_best['model__max_depth'])
best_featuresgbc = int(gbc_best['model__max_features'])
best_estimatorsgbc = int(gbc_best['model__n_estimators'])
gbc_tuned = GradientBoostingClassifier(random_state=2135,
                                       n_estimators=best_estimatorsgbc,
                                       max_depth=best_depthgbc,
                                       max_features=best_featuresgbc)
gbcgc_cvscore = cross_val_score(gbc_tuned, Train_X, Train_Y, cv=3, scoring='roc_auc')
print ('4. Gradient Boosting Classifier: %f' %gbcgc_cvscore.mean())
 
 
# ---- Refit the chosen model on all training rows and score it on the hold-out set ----
# The winning model named in this message is hard-coded, not derived from
# the cross-validation scores.
print ('\nBased on the Cross validation ouptuts, the best model that can be selected is: \'Gradient Boosting Classifier\'')

print('\nFitting Gradient Boosting classifier for the dataset.')
print('Testing accuracy of this model on Test data(or hold_out data).')
final_params = dict(random_state=2135,
                    n_estimators=best_estimatorsgbc,
                    max_depth=best_depthgbc,
                    max_features=best_featuresgbc)
gbc = GradientBoostingClassifier(**final_params)
gbc = gbc.fit(Train_X, Train_Y)

holdout_auc = roc_score_model(gbc, Test_X, Test_Y)
print('\nHoldout ROC/AUC accuracy for Gradient Boosting classifier: '+ str(holdout_auc))

# TODO(review): disabled call kept for reference. It refers to `gb` and
# `Train_X.colnames`; no `gb` is defined in the visible part of this cell and
# Train_X is the plain list returned by generateXY.
#feature_imp(Train_X.colnames,gb.feature_importance_)

Starting data read from csv file.
Deleting columns containing huge missing values, Single valued items and id columns which are of no use.
No data leakage is present as such.
Changing the type of columns from object to category.
Separating numeric data from the data set to perform numeric imputation on missing values.
Imputed categorical data with the mode of the respective column.
Imputed numeric columns to its median value.
Setting aside 20% of the data as hold_out.
Using standard scaler of sklearn library to Standardize and scale the numeric inputs.

Finding Optimal parameters for 4 different models using GridSearch.
Optimal Parameters for: 
1. Regularized Logistic Regression: {'model__alpha': 0.0004}
2. Decision Tree: {'model__max_depth': 6, 'model__max_features': 30}
3. Random Forest: {'model__n_estimators': 50, 'model__max_depth': 5, 'model__max_features': 5}
4. Gradient Boosting Classifier: {'model__n_estimators': 70, 'model__max_depth': 2, 'model__max_features': 25}

Performing