## About this notebook

Cross-sectional district-level models are run using the features generated by Arpit. For every indicator we have around 10 feature files (bin 5-15).

We follow a 5-fold cross-validation approach and take the mean of the weighted F1-score.

Finally, all scores are reported for the best parameters.

Not running MLP for this run

In [1]:
import numpy as np                               
import pandas as pd                              
import csv
import time
import keras
import pprint
import copy
import random
import math
import itertools
from operator import itemgetter
from imblearn.over_sampling import SMOTE
import pickle
import os

# Suppress every Python warning for the whole session. This is mainly aimed
# at the stats models, TensorFlow and pandas version-related warnings;
# ideally these should not be ignored.
import warnings
warnings.filterwarnings("ignore")
# Raise the root logger threshold so that only CRITICAL records get through.
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL) 
# Seed the stdlib and NumPy global random generators with a fixed value.
random.seed(5)
np.random.seed(5)

import src
import specs

# all the specs
from specs.mlSpecs import *
from specs.mlDrillSpecs import *
from specs.ensembleSpecs import *
# from specs.statsSpecs import *

# all the classes and functions for dictionaries
from src.scores import *

from src.models.machineLearning.randomForest import randomForest
from src.models.machineLearning.xgBoost import xgBoost
from src.models.machineLearning.svc import svc
from src.models.machineLearning.adaBoost import adaBoost
from src.models.machineLearning.bagging import bagging
from src.models.machineLearning.gradientBoosting import gradientBoosting
from src.models.machineLearning.knn import knn
from src.models.machineLearning.mlp import mlp
from src.models.machineLearning.ensembleMLP import ensembleMLP

from src.randomizedSearchSpecial import *

#all other functions
from src.dataPreprocessingUtils import *
from src.selectorFunctions import *
from src.mlPrediction import *
from src.combining import *
from src.randomizedSearch import *

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# These 3 lookup tables (scores_dict, models_dict, special_specs_functions)
# need to be defined outside the notebook, in some file which is loaded.

# Metric name -> callable, for every metric that has a scoring function.
_score_functions = {
    "f1_weighted": f1_weighted,
    "f1_macro": f1_macro,
    "f1_micro": f1_micro,
    "accuracy_normal": accuracy_normal,
    "accuracy_weighted": accuracy_weighted,
    "accuracy_classwise": accuracy_classwise,
    "precision_weighted": precision_weighted,
    "precision_macro": precision_macro,
    "precision_micro": precision_micro,
    "recall_weighted": recall_weighted,
    "recall_macro": recall_macro,
    "recall_micro": recall_micro,
    # "roc_auc_weighted": roc_auc_weighted,
    # "roc_auc_macro": roc_auc_macro,
    # "roc_auc_micro": roc_auc_micro,
}

# Every metric above is registered with type "score".
scores_dict = {
    metric_name: {"function": metric_function, "type": "score"}
    for metric_name, metric_function in _score_functions.items()
}

# These entries carry only a type and no "function" key.
scores_dict.update({
    "categorical_crossentropy": {"type": "error"},
    "accuracy": {"type": "score"},
    "acc": {"type": "score"},
})
# Model name -> wrapper class and model family. Only the uncommented
# entries are part of this run; uncomment a line to enable that model.
models_dict = {}
# models_dict["randomForest"] = {"class": randomForest, "type": "machine_learning"}
models_dict["xgBoost"] = {"class": xgBoost, "type": "machine_learning"}
# models_dict["svc"] = {"class": svc, "type": "machine_learning"}
# models_dict["bagging"] = {"class": bagging, "type": "machine_learning"}
# models_dict["adaBoost"] = {"class": adaBoost, "type": "machine_learning"}
# models_dict["gradientBoosting"] = {"class": gradientBoosting, "type": "machine_learning"}
# models_dict["knn"] = {"class": knn, "type": "machine_learning"}
# models_dict["mlp"] = {"class": mlp, "type": "machine_learning"}
# Model name -> function producing its "special" specs, or None when the
# model has no such function registered.
special_specs_functions = dict(
    randomForest=multipleSpecialRFSpecs,
    mlp=None,
    xgBoost=None,
    svc=multipleSpecialRFSpecs,
    adaBoost=multipleSpecialRFSpecs,
    gradientBoosting=multipleSpecialRFSpecs,
    bagging=multipleSpecialRFSpecs,
    knn=None,
)

In [3]:
# Show which models are enabled for this run: a two-line banner followed by
# the pretty-printed registry.
print('Models that will be runnning...',
      '-------------------------------',
      sep='\n')
pprint.pprint(models_dict)

Models that will be runnning...
-------------------------------
{'xgBoost': {'class': <class 'src.models.machineLearning.xgBoost.xgBoost'>,
             'type': 'machine_learning'}}


In [4]:
# TODO: this should live in a separate function or file. To avoid bugs and
# unexpected behaviours which are hard to find later, no value should be
# replaced here; new values should be added.


def _spec(value_type, values):
    """Return one specification entry in the {'value_type', 'values'} layout."""
    return {'value_type': value_type, 'values': values}


def _common_specs():
    """Build the settings that are applied to every model's specifications.

    A new dict (with new nested lists) is created on every call, so no entry
    is shared between the per-model specification dicts.
    """
    return {
        'n_randomized_search': _spec('constant', [200]),
        'test_ratio': _spec('constant', [0.0]),
        'y_col': _spec('constant', []),
        'cols_to_drop': _spec('constant', [[]]),
        'ensemble': _spec('constant', [False]),
        'trail': _spec('categories', [0]),
        'n_cols_dropped': _spec('constant', [0]),
        'current_date_col': _spec('constant', []),
        'no_steps_ahead': _spec('constant', [0]),
        'top_n_models_drill': _spec('constant', [5]),
        'top_n_models_pred': _spec('constant', [5]),
        'apply_smote': _spec('constant', [True]),  # can be True or False
        'time_cols': _spec('constant', [[]]),
        'n_drill_search': _spec('constant', [[100]]),
        # can take values - KFold, LeaveOneOut, train_val_holdout, LeavePOut;
        # in case of KFold & LeavePOut, we should have the additional
        # parameter of k and p (the two *_splits entries below)
        'cross_validation': _spec('constant', [['KFold']]),
        'kFold_splits': _spec('categories', [5]),
        'pFold_splits': _spec('categories', [3, 4, 5, 10, 15, 20]),
        'feature_cols': _spec('constant', []),
    }


# Overlay the common settings onto each model's specification dict in place.
for key in specifications_master_dict:
    specifications_master_dict[key].update(_common_specs())
# pprint.pprint(specifications_master_dict)
print('Done Updating the Common Specs')

Done Updating the Common Specs


In [5]:
# pprint.pprint(specifications_master_dict)

In [6]:
# def apply_smote(data, feature_cols, target):
#     '''
#     Input:
#     data - the original dataframe
#     feature_cols - the feature columns (list of columns)
#     target - the target column (string value)
#     '''
#     sm = SMOTE(random_state=42)
#     features, targets = sm.fit_resample(data[feature_cols],data[target])
#     feature_df = pd.DataFrame(features, columns=feature_cols)
#     target_df = pd.DataFrame(targets, columns=[target])
#     output = pd.concat([feature_df, target_df], axis=1)
#     # Shuffling dataset
#     output = output.sample(frac=1).reset_index(drop=True)
#     return output

In [7]:
# District ground-truth table; the modelling loop selects the
# '<indicator>_2001' / '<indicator>_2011' columns from it by census_code.
ground_truth = pd.read_csv(
    "/Users/arpitjain/Downloads/SatPRo/District - Ground Truth - 2011&2001.csv"
)

# Keep only the district key and the 2011 LIT / FEMP columns from this file.
cols_2 = ['census_code', 'LIT_2011', 'FEMP_2011']
femp_lit = pd.read_csv(
    "/Users/arpitjain/Downloads/SatPRo/2001_L7_data/ChangeClassifier/FEMP&LIT.csv"
)[cols_2]

In [8]:
# Folder with one feature file per indicator. The indicator name is taken
# from the part of the file name before the first underscore
# (e.g. "CHH_CC.csv" -> "CHH").
folder = "/Users/arpitjain/Downloads/SatPRo/2001_L7_data/ChangeClassifier/input_data_2011/"

# os.listdir returns entries in arbitrary order and includes every directory
# entry. Sorting makes the run order (and therefore the order in which the
# seeded random generators are consumed) reproducible, and the suffix filter
# skips stray non-CSV files such as .DS_Store that would otherwise break
# the column lookup / CSV parsing below.
feature_files = sorted(
    name for name in os.listdir(folder) if name.lower().endswith('.csv')
)

for files in feature_files:
    indicator = files.split("_")[0]

    # Ground truth for this indicator: district key plus both census years.
    cols = ['census_code', indicator+'_2001', indicator+'_2011']
    gt = ground_truth[cols]

    df = pd.read_csv(os.path.join(folder, files))

    # Left-join the feature file onto the ground truth, then left-join that
    # onto the LIT/FEMP table; rows with any missing value are dropped.
    # NOTE(review): if an indicator is itself LIT or FEMP, its '_2011' column
    # would collide with femp_lit's in the second merge -- confirm that no
    # such feature file exists in this folder.
    data = gt.merge(df, on='census_code', how='left')
    data = femp_lit.merge(data, on='census_code', how='left')
    data = data.dropna()

    # The 2001 value is the target; the 2011 values and the prediction
    # columns (presumably provided by the feature file -- verify) are inputs.
    feature_cols = [indicator+'_2011', 'LIT_2011', 'FEMP_2011',
                    'predictions_2003', 'predictions_2005',
                    'predictions_2007', 'predictions_2009']
    target = indicator+'_2001'
    drop_cols = ['census_code']
    print('data_points:',data.shape[0],'  ',files,'------' ,target, drop_cols, feature_cols)

    # Point every model's specification at this indicator's columns.
    for key in specifications_master_dict.keys():
        specifications_master_dict[key].update({
            'y_col': {
                'value_type': 'constant',
                'values': [target]},
            'current_date_col': {
                'value_type': 'constant',
                'values': drop_cols},
            'feature_cols': {
                'value_type': 'constant',
                'values': [feature_cols]},
        })
    print('-------------------------------------')
    print('-------------------------------------')

    # Run the search for every registered model and keep the top specs.
    # The trailing positional False is passed through unchanged; its meaning
    # is defined by callModels in the project code.
    model_specs_vs_score = callModels(models_dict, scores_dict, specifications_master_dict, data, False)
    top_scores_specs = topSpecs(model_specs_vs_score, scores_dict,get_train_score=True)
    print('------------------------')
    # Only the xgBoost entry is reported; this key must exist in models_dict.
    print(top_scores_specs['xgBoost']['val_scores'])
    print(top_scores_specs['xgBoost']['train_scores'])

    # Use a context manager so the pickle file is flushed and closed even if
    # serialisation fails (the bare open() never closed its handle).
    with open(target + '_TT_FEMP&LIT_2011_base_XGB.pkl', 'wb') as pkl_file:
        pickle.dump(top_scores_specs, pkl_file)
    print('------------------------')

data_points: 580    CHH_CC.csv ------ CHH_2001 ['census_code'] ['CHH_2011', 'LIT_2011', 'FEMP_2011', 'predictions_2003', 'predictions_2005', 'predictions_2007', 'predictions_2009']
-------------------------------------
-------------------------------------
Running model  xgBoost
####### n_randomized_search [200]
    run 0   CV Type ['KFold'] 5
    run 1   CV Type ['KFold'] 5
    run 2   CV Type ['KFold'] 5
    run 3   CV Type ['KFold'] 5
    run 4   CV Type ['KFold'] 5
    run 5   CV Type ['KFold'] 5
    run 6   CV Type ['KFold'] 5
    run 7   CV Type ['KFold'] 5
    run 8   CV Type ['KFold'] 5
    run 9   CV Type ['KFold'] 5
    run 10   CV Type ['KFold'] 5
    run 11   CV Type ['KFold'] 5
    run 12   CV Type ['KFold'] 5
    run 13   CV Type ['KFold'] 5
    run 14   CV Type ['KFold'] 5
    run 15   CV Type ['KFold'] 5
    run 16   CV Type ['KFold'] 5
    run 17   CV Type ['KFold'] 5
    run 18   CV Type ['KFold'] 5
    run 19   CV Type ['KFold'] 5
    run 20   CV Type ['KFold'] 5
   

    run 21   CV Type ['KFold'] 5
    run 22   CV Type ['KFold'] 5
    run 23   CV Type ['KFold'] 5
    run 24   CV Type ['KFold'] 5
    run 25   CV Type ['KFold'] 5
    run 26   CV Type ['KFold'] 5
    run 27   CV Type ['KFold'] 5
    run 28   CV Type ['KFold'] 5
    run 29   CV Type ['KFold'] 5
    run 30   CV Type ['KFold'] 5
    run 31   CV Type ['KFold'] 5
    run 32   CV Type ['KFold'] 5
    run 33   CV Type ['KFold'] 5
    run 34   CV Type ['KFold'] 5
    run 35   CV Type ['KFold'] 5
    run 36   CV Type ['KFold'] 5
    run 37   CV Type ['KFold'] 5
    run 38   CV Type ['KFold'] 5
    run 39   CV Type ['KFold'] 5
    run 40   CV Type ['KFold'] 5
    run 41   CV Type ['KFold'] 5
    run 42   CV Type ['KFold'] 5
    run 43   CV Type ['KFold'] 5
    run 44   CV Type ['KFold'] 5
    run 45   CV Type ['KFold'] 5
    run 46   CV Type ['KFold'] 5
    run 47   CV Type ['KFold'] 5
    run 48   CV Type ['KFold'] 5
    run 49   CV Type ['KFold'] 5
    run 50   CV Type ['KFold'] 5
    run 51

    run 52   CV Type ['KFold'] 5
    run 53   CV Type ['KFold'] 5
    run 54   CV Type ['KFold'] 5
    run 55   CV Type ['KFold'] 5
    run 56   CV Type ['KFold'] 5
    run 57   CV Type ['KFold'] 5
    run 58   CV Type ['KFold'] 5
    run 59   CV Type ['KFold'] 5
    run 60   CV Type ['KFold'] 5
    run 61   CV Type ['KFold'] 5
    run 62   CV Type ['KFold'] 5
    run 63   CV Type ['KFold'] 5
    run 64   CV Type ['KFold'] 5
    run 65   CV Type ['KFold'] 5
    run 66   CV Type ['KFold'] 5
    run 67   CV Type ['KFold'] 5
    run 68   CV Type ['KFold'] 5
    run 69   CV Type ['KFold'] 5
    run 70   CV Type ['KFold'] 5
    run 71   CV Type ['KFold'] 5
    run 72   CV Type ['KFold'] 5
    run 73   CV Type ['KFold'] 5
    run 74   CV Type ['KFold'] 5
    run 75   CV Type ['KFold'] 5
    run 76   CV Type ['KFold'] 5
    run 77   CV Type ['KFold'] 5
    run 78   CV Type ['KFold'] 5
    run 79   CV Type ['KFold'] 5
    run 80   CV Type ['KFold'] 5
    run 81   CV Type ['KFold'] 5
    run 82

    run 82   CV Type ['KFold'] 5
    run 83   CV Type ['KFold'] 5
    run 84   CV Type ['KFold'] 5
    run 85   CV Type ['KFold'] 5
    run 86   CV Type ['KFold'] 5
    run 87   CV Type ['KFold'] 5
    run 88   CV Type ['KFold'] 5
    run 89   CV Type ['KFold'] 5
    run 90   CV Type ['KFold'] 5
    run 91   CV Type ['KFold'] 5
    run 92   CV Type ['KFold'] 5
    run 93   CV Type ['KFold'] 5
    run 94   CV Type ['KFold'] 5
    run 95   CV Type ['KFold'] 5
    run 96   CV Type ['KFold'] 5
    run 97   CV Type ['KFold'] 5
    run 98   CV Type ['KFold'] 5
    run 99   CV Type ['KFold'] 5
    run 100   CV Type ['KFold'] 5
    run 101   CV Type ['KFold'] 5
    run 102   CV Type ['KFold'] 5
    run 103   CV Type ['KFold'] 5
    run 104   CV Type ['KFold'] 5
    run 105   CV Type ['KFold'] 5
    run 106   CV Type ['KFold'] 5
    run 107   CV Type ['KFold'] 5
    run 108   CV Type ['KFold'] 5
    run 109   CV Type ['KFold'] 5
    run 110   CV Type ['KFold'] 5
    run 111   CV Type ['KFold'] 

    run 112   CV Type ['KFold'] 5
    run 113   CV Type ['KFold'] 5
    run 114   CV Type ['KFold'] 5
    run 115   CV Type ['KFold'] 5
    run 116   CV Type ['KFold'] 5
    run 117   CV Type ['KFold'] 5
    run 118   CV Type ['KFold'] 5
    run 119   CV Type ['KFold'] 5
    run 120   CV Type ['KFold'] 5
    run 121   CV Type ['KFold'] 5
    run 122   CV Type ['KFold'] 5
    run 123   CV Type ['KFold'] 5
    run 124   CV Type ['KFold'] 5
    run 125   CV Type ['KFold'] 5
    run 126   CV Type ['KFold'] 5
    run 127   CV Type ['KFold'] 5
    run 128   CV Type ['KFold'] 5
    run 129   CV Type ['KFold'] 5
    run 130   CV Type ['KFold'] 5
    run 131   CV Type ['KFold'] 5
    run 132   CV Type ['KFold'] 5
    run 133   CV Type ['KFold'] 5
    run 134   CV Type ['KFold'] 5
    run 135   CV Type ['KFold'] 5
    run 136   CV Type ['KFold'] 5
    run 137   CV Type ['KFold'] 5
    run 138   CV Type ['KFold'] 5
    run 139   CV Type ['KFold'] 5
    run 140   CV Type ['KFold'] 5
    run 141   

    run 141   CV Type ['KFold'] 5
    run 142   CV Type ['KFold'] 5
    run 143   CV Type ['KFold'] 5
    run 144   CV Type ['KFold'] 5
    run 145   CV Type ['KFold'] 5
    run 146   CV Type ['KFold'] 5
    run 147   CV Type ['KFold'] 5
    run 148   CV Type ['KFold'] 5
    run 149   CV Type ['KFold'] 5
    run 150   CV Type ['KFold'] 5
    run 151   CV Type ['KFold'] 5
    run 152   CV Type ['KFold'] 5
    run 153   CV Type ['KFold'] 5
    run 154   CV Type ['KFold'] 5
    run 155   CV Type ['KFold'] 5
    run 156   CV Type ['KFold'] 5
    run 157   CV Type ['KFold'] 5
    run 158   CV Type ['KFold'] 5
    run 159   CV Type ['KFold'] 5
    run 160   CV Type ['KFold'] 5
    run 161   CV Type ['KFold'] 5
    run 162   CV Type ['KFold'] 5
    run 163   CV Type ['KFold'] 5
    run 164   CV Type ['KFold'] 5
    run 165   CV Type ['KFold'] 5
    run 166   CV Type ['KFold'] 5
    run 167   CV Type ['KFold'] 5
    run 168   CV Type ['KFold'] 5
    run 169   CV Type ['KFold'] 5
    run 170   