## About this notebook

Cross-sectional district-level models are run using the features generated by Arpit. For every indicator we have around 10 feature files (bin 5-15).

We follow a 5-fold cross-validation approach and take the mean of the weighted F1-score.

Finally, all scores are reported for the best parameters.

MLP is not included in this run.

In [1]:
import numpy as np                               
import pandas as pd                              
import csv
import time
import keras
import pprint
import copy
import random
import math
import itertools
from operator import itemgetter
from imblearn.over_sampling import SMOTE
import pickle
import os

# Silence every Python warning. This is mainly aimed at statsmodels and
# tensorflow/pandas version-related warnings; ideally those would be fixed
# rather than ignored.
import warnings
warnings.filterwarnings("ignore")
# Set the root logger's level to CRITICAL.
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL) 
# Seed Python's and NumPy's global random number generators with a fixed value.
random.seed(5)
np.random.seed(5)

import src
import specs

# all the specs
from specs.mlSpecs import *
from specs.mlDrillSpecs import *
from specs.ensembleSpecs import *
# from specs.statsSpecs import *

# all the classes and functions for dictionaries
from src.scores import *

from src.models.machineLearning.randomForest import randomForest
from src.models.machineLearning.xgBoost import xgBoost
from src.models.machineLearning.svc import svc
from src.models.machineLearning.adaBoost import adaBoost
from src.models.machineLearning.bagging import bagging
from src.models.machineLearning.gradientBoosting import gradientBoosting
from src.models.machineLearning.knn import knn
from src.models.machineLearning.mlp import mlp
from src.models.machineLearning.ensembleMLP import ensembleMLP

from src.randomizedSearchSpecial import *

#all other functions
from src.dataPreprocessingUtils import *
from src.selectorFunctions import *
from src.mlPrediction import *
from src.combining import *
from src.randomizedSearch import *

Using TensorFlow backend.


In [2]:
# TODO: scores_dict, models_dict and special_specs_functions should live in a
# shared module that the notebook imports instead of being defined inline.

# Metrics that come with a callable; each one is registered with type "score".
scores_dict = {
    metric_name: {"function": metric_fn, "type": "score"}
    for metric_name, metric_fn in (
        ("f1_weighted", f1_weighted),
        ("f1_macro", f1_macro),
        ("f1_micro", f1_micro),
        ("accuracy_normal", accuracy_normal),
        ("accuracy_weighted", accuracy_weighted),
        ("accuracy_classwise", accuracy_classwise),
        ("precision_weighted", precision_weighted),
        ("precision_macro", precision_macro),
        ("precision_micro", precision_micro),
        ("recall_weighted", recall_weighted),
        ("recall_macro", recall_macro),
        ("recall_micro", recall_micro),
        # The ROC-AUC variants are currently switched off:
        # ("roc_auc_weighted", roc_auc_weighted),
        # ("roc_auc_macro", roc_auc_macro),
        # ("roc_auc_micro", roc_auc_micro),
    )
}
# Entries registered by name and type only (no "function" key).
scores_dict.update({
    "categorical_crossentropy": {"type": "error"},
    "accuracy": {"type": "score"},
    "acc": {"type": "score"},
})
# Registry of the model wrappers that this run trains. Only xgBoost is enabled;
# uncomment a line below to add another model to the run.
models_dict = {}
# models_dict["randomForest"] = {"class": randomForest, "type": "machine_learning"}
models_dict["xgBoost"] = {"class": xgBoost, "type": "machine_learning"}
# models_dict["svc"] = {"class": svc, "type": "machine_learning"}
# models_dict["bagging"] = {"class": bagging, "type": "machine_learning"}
# models_dict["adaBoost"] = {"class": adaBoost, "type": "machine_learning"}
# models_dict["gradientBoosting"] = {"class": gradientBoosting, "type": "machine_learning"}
# models_dict["knn"] = {"class": knn, "type": "machine_learning"}
# models_dict["mlp"] = {"class": mlp, "type": "machine_learning"}
# Per-model hook: either multipleSpecialRFSpecs or None.
# NOTE(review): presumably consumed by the randomized-search code to build
# model-specific specs; confirm against src.randomizedSearchSpecial.
special_specs_functions = dict(
    randomForest=multipleSpecialRFSpecs,
    mlp=None,
    xgBoost=None,
    svc=multipleSpecialRFSpecs,
    adaBoost=multipleSpecialRFSpecs,
    gradientBoosting=multipleSpecialRFSpecs,
    bagging=multipleSpecialRFSpecs,
    knn=None,
)

In [3]:
# Announce which model wrappers are enabled, then pretty-print the registry.
print('Models that will be runnning...', '-------------------------------', sep='\n')
pprint.pprint(models_dict)

Models that will be runnning...
-------------------------------
{'xgBoost': {'class': <class 'src.models.machineLearning.xgBoost.xgBoost'>,
             'type': 'machine_learning'}}


In [4]:
# TODO: move this into its own function or module. To avoid hard-to-trace bugs,
# settings should only ever be added here, never replaced (note that
# dict.update does overwrite any key that already exists).

def _constant_spec(values):
    """Return a fresh spec entry whose 'value_type' is 'constant'."""
    return {'value_type': 'constant', 'values': values}


def _categories_spec(values):
    """Return a fresh spec entry whose 'value_type' is 'categories'."""
    return {'value_type': 'categories', 'values': values}


# Apply the same common settings to every model's specification dict. The
# entries are rebuilt on each iteration so that no two models share objects.
for model_specs in specifications_master_dict.values():
    model_specs.update({
        'n_randomized_search': _constant_spec([300]),
        'test_ratio': _constant_spec([0.0]),
        'y_col': _constant_spec([]),
        'cols_to_drop': _constant_spec([[]]),
        'ensemble': _constant_spec([False]),
        'trail': _categories_spec([0]),
        'n_cols_dropped': _constant_spec([0]),
        'current_date_col': _constant_spec([]),
        'no_steps_ahead': _constant_spec([0]),
        'top_n_models_drill': _constant_spec([5]),
        'top_n_models_pred': _constant_spec([5]),
        # apply_smote is either True or False.
        'apply_smote': _constant_spec([True]),
        'time_cols': _constant_spec([[]]),
        'n_drill_search': _constant_spec([[100]]),
        # cross_validation can take the values KFold, LeaveOneOut,
        # train_val_holdout or LeavePOut; for KFold and LeavePOut an
        # additional parameter (k or p respectively) is expected.
        'cross_validation': _constant_spec([['KFold']]),
        'kFold_splits': _categories_spec([5]),
        'pFold_splits': _categories_spec([3, 4, 5, 10, 15, 20]),
        'feature_cols': _constant_spec([]),
    })
# pprint.pprint(specifications_master_dict)
print('Done Updating the Common Specs')

Done Updating the Common Specs


### Phaneesh's features

## All indicators - All bincount

In [13]:
# Data root for this analysis. Set the SATPRO_DATA_DIR environment variable to
# point the notebook at another copy of the data; when it is unset the original
# location is used, so existing runs are unaffected.
data_dir = os.environ.get("SATPRO_DATA_DIR", "/Users/arpitjain/Downloads/SatPRo/2001_L7_data")
# Feature file (quantile@10 variant) located under the data root.
filepath = data_dir + "/2011_Anuj_Method/Features_100m_quantile@10.csv"

In [14]:
# Load the feature table from the CSV file at `filepath`.
df = pd.read_csv(filepath_or_buffer=filepath)

In [15]:
# Show the column labels of the loaded feature table.
print(str(df.columns))

Index(['band_1_1', 'band_1_2', 'band_1_3', 'band_1_4', 'band_1_5', 'band_1_6',
       'band_1_7', 'band_1_8', 'band_1_9', 'band_1_10',
       ...
       'band_12_2', 'band_12_3', 'band_12_4', 'band_12_5', 'band_12_6',
       'band_12_7', 'band_12_8', 'band_12_9', 'band_12_10', 'census_code'],
      dtype='object', length=121)


In [16]:
# Data root for this analysis. Set the SATPRO_DATA_DIR environment variable to
# point the notebook at another copy of the data; when it is unset the original
# location is used.
data_dir = os.environ.get("SATPRO_DATA_DIR", "/Users/arpitjain/Downloads/SatPRo/2001_L7_data")
ground_truth = pd.read_csv(data_dir + "/District - Ground Truth - 2011.csv")

# Keep only the district key and the six 2011 indicator columns.
cols = ['census_code','MSW_2011','BF_2011','MSL_2011', 'FC_2011','CHH_2011','ASSET_2011']
ground_truth = ground_truth[cols]

# Left-join the features onto the ground truth, then drop every row that has a
# missing value in any column. dropna() returns a new frame, so no DataFrame is
# mutated in place.
data = ground_truth.merge(df, how='left', on='census_code').dropna()

In [17]:
# Show the merged frame's column labels followed by a separator line.
print(data.columns, '---------', sep='\n')

Index(['census_code', 'MSW_2011', 'BF_2011', 'MSL_2011', 'FC_2011', 'CHH_2011',
       'ASSET_2011', 'band_1_1', 'band_1_2', 'band_1_3',
       ...
       'band_12_1', 'band_12_2', 'band_12_3', 'band_12_4', 'band_12_5',
       'band_12_6', 'band_12_7', 'band_12_8', 'band_12_9', 'band_12_10'],
      dtype='object', length=127)
---------


In [18]:
# Show the columns from position 7 onwards, a separator line, and the frame's
# shape, one item per line.
print(data.columns[7:], '---------', data.shape, sep='\n')

Index(['band_1_1', 'band_1_2', 'band_1_3', 'band_1_4', 'band_1_5', 'band_1_6',
       'band_1_7', 'band_1_8', 'band_1_9', 'band_1_10',
       ...
       'band_12_1', 'band_12_2', 'band_12_3', 'band_12_4', 'band_12_5',
       'band_12_6', 'band_12_7', 'band_12_8', 'band_12_9', 'band_12_10'],
      dtype='object', length=120)
---------
(633, 127)


In [19]:
# Indicators modelled in this run. The remaining ground-truth indicators
# ('MSW_2011', 'BF_2011', 'MSL_2011', 'FC_2011') are currently switched off.
indicators = ['CHH_2011','ASSET_2011']
for target in indicators:
    print('#### Running for', target)

    # NOTE(review): drop_cols only contains the *other entries of `indicators`*
    # plus 'census_code'. Indicators removed from the list above are still
    # columns of `data` but are not listed here -- confirm this is intended.
    drop_cols = [name for name in indicators if name != target]
    drop_cols.append('census_code')

    info_dict = {}
    info_dict['target'] = target
    info_dict['drop_cols'] = drop_cols
    # NOTE(review): positional slice; assumes the first 7 columns of `data` are
    # 'census_code' plus the six indicator columns -- verify if the ground-truth
    # column list changes.
    info_dict['features'] = data.columns[7:].tolist()

#     output = apply_smote(data, info_dict['features'], target)
#     output['random'] = 1

    print(info_dict['target'], info_dict['drop_cols'], info_dict['features'])

    # Point every model's specs at the current target and feature columns.
    # NOTE(review): drop_cols is stored under 'current_date_col' and, unlike
    # the other two entries, is not wrapped in an extra list -- confirm against
    # how callModels reads this key.
    for key in specifications_master_dict.keys():
        specifications_master_dict[key].update({
            'y_col': {
                'value_type': 'constant',
                'values': [info_dict['target']]},
            'current_date_col': {
                'value_type': 'constant',
                'values': drop_cols},
            'feature_cols': {
                'value_type': 'constant',
                'values': [info_dict['features']]},
        })

    model_specs_vs_score = callModels(models_dict, scores_dict,specifications_master_dict, data, False)
    top_scores_specs = topSpecs(model_specs_vs_score, scores_dict,get_train_score=True)
    print('------------------------')
    print(top_scores_specs['xgBoost']['val_scores'])
    print(top_scores_specs['xgBoost']['train_scores'])

    # Persist the result for this target. The `with` block guarantees the file
    # is flushed and closed even if pickling fails; the previous bare open()
    # call never closed its handle.
    output_path = target + 'AnujMethod2011_XGB_top_score_specs@10.pkl'
    with open(output_path, 'wb') as pickle_file:
        pickle.dump(top_scores_specs, pickle_file)
    print('------------------------')

#### Running for MSW_2011
MSW_2011 ['BF_2011', 'MSL_2011', 'FC_2011', 'CHH_2011', 'ASSET_2011', 'census_code'] ['band_1_1', 'band_1_2', 'band_1_3', 'band_1_4', 'band_1_5', 'band_1_6', 'band_1_7', 'band_1_8', 'band_1_9', 'band_1_10', 'band_2_1', 'band_2_2', 'band_2_3', 'band_2_4', 'band_2_5', 'band_2_6', 'band_2_7', 'band_2_8', 'band_2_9', 'band_2_10', 'band_3_1', 'band_3_2', 'band_3_3', 'band_3_4', 'band_3_5', 'band_3_6', 'band_3_7', 'band_3_8', 'band_3_9', 'band_3_10', 'band_4_1', 'band_4_2', 'band_4_3', 'band_4_4', 'band_4_5', 'band_4_6', 'band_4_7', 'band_4_8', 'band_4_9', 'band_4_10', 'band_5_1', 'band_5_2', 'band_5_3', 'band_5_4', 'band_5_5', 'band_5_6', 'band_5_7', 'band_5_8', 'band_5_9', 'band_5_10', 'band_6_1', 'band_6_2', 'band_6_3', 'band_6_4', 'band_6_5', 'band_6_6', 'band_6_7', 'band_6_8', 'band_6_9', 'band_6_10', 'band_7_1', 'band_7_2', 'band_7_3', 'band_7_4', 'band_7_5', 'band_7_6', 'band_7_7', 'band_7_8', 'band_7_9', 'band_7_10', 'band_8_1', 'band_8_2', 'band_8_3', 'band

    run 196   CV Type ['KFold'] 5
    run 197   CV Type ['KFold'] 5
    run 198   CV Type ['KFold'] 5
    run 199   CV Type ['KFold'] 5
    run 200   CV Type ['KFold'] 5
    run 201   CV Type ['KFold'] 5
    run 202   CV Type ['KFold'] 5
    run 203   CV Type ['KFold'] 5
    run 204   CV Type ['KFold'] 5
    run 205   CV Type ['KFold'] 5
    run 206   CV Type ['KFold'] 5
    run 207   CV Type ['KFold'] 5
    run 208   CV Type ['KFold'] 5
    run 209   CV Type ['KFold'] 5
    run 210   CV Type ['KFold'] 5
    run 211   CV Type ['KFold'] 5
    run 212   CV Type ['KFold'] 5
    run 213   CV Type ['KFold'] 5
    run 214   CV Type ['KFold'] 5
    run 215   CV Type ['KFold'] 5
    run 216   CV Type ['KFold'] 5
    run 217   CV Type ['KFold'] 5
    run 218   CV Type ['KFold'] 5
    run 219   CV Type ['KFold'] 5
    run 220   CV Type ['KFold'] 5
    run 221   CV Type ['KFold'] 5
    run 222   CV Type ['KFold'] 5
    run 223   CV Type ['KFold'] 5
    run 224   CV Type ['KFold'] 5
    run 225   

    run 84   CV Type ['KFold'] 5
    run 85   CV Type ['KFold'] 5
    run 86   CV Type ['KFold'] 5
    run 87   CV Type ['KFold'] 5
    run 88   CV Type ['KFold'] 5
    run 89   CV Type ['KFold'] 5
    run 90   CV Type ['KFold'] 5
    run 91   CV Type ['KFold'] 5
    run 92   CV Type ['KFold'] 5
    run 93   CV Type ['KFold'] 5
    run 94   CV Type ['KFold'] 5
    run 95   CV Type ['KFold'] 5
    run 96   CV Type ['KFold'] 5
    run 97   CV Type ['KFold'] 5
    run 98   CV Type ['KFold'] 5
    run 99   CV Type ['KFold'] 5
    run 100   CV Type ['KFold'] 5
    run 101   CV Type ['KFold'] 5
    run 102   CV Type ['KFold'] 5
    run 103   CV Type ['KFold'] 5
    run 104   CV Type ['KFold'] 5
    run 105   CV Type ['KFold'] 5
    run 106   CV Type ['KFold'] 5
    run 107   CV Type ['KFold'] 5
    run 108   CV Type ['KFold'] 5
    run 109   CV Type ['KFold'] 5
    run 110   CV Type ['KFold'] 5
    run 111   CV Type ['KFold'] 5
    run 112   CV Type ['KFold'] 5
    run 113   CV Type ['KFold'

    run 1   CV Type ['KFold'] 5
    run 2   CV Type ['KFold'] 5
    run 3   CV Type ['KFold'] 5
    run 4   CV Type ['KFold'] 5
    run 5   CV Type ['KFold'] 5
    run 6   CV Type ['KFold'] 5
    run 7   CV Type ['KFold'] 5
    run 8   CV Type ['KFold'] 5
    run 9   CV Type ['KFold'] 5
    run 10   CV Type ['KFold'] 5
    run 11   CV Type ['KFold'] 5
    run 12   CV Type ['KFold'] 5
    run 13   CV Type ['KFold'] 5
    run 14   CV Type ['KFold'] 5
    run 15   CV Type ['KFold'] 5
    run 16   CV Type ['KFold'] 5
    run 17   CV Type ['KFold'] 5
    run 18   CV Type ['KFold'] 5
    run 19   CV Type ['KFold'] 5
    run 20   CV Type ['KFold'] 5
    run 21   CV Type ['KFold'] 5
    run 22   CV Type ['KFold'] 5
    run 23   CV Type ['KFold'] 5
    run 24   CV Type ['KFold'] 5
    run 25   CV Type ['KFold'] 5
    run 26   CV Type ['KFold'] 5
    run 27   CV Type ['KFold'] 5
    run 28   CV Type ['KFold'] 5
    run 29   CV Type ['KFold'] 5
    run 30   CV Type ['KFold'] 5
    run 31   CV Typ

    run 246   CV Type ['KFold'] 5
    run 247   CV Type ['KFold'] 5
    run 248   CV Type ['KFold'] 5
    run 249   CV Type ['KFold'] 5
    run 250   CV Type ['KFold'] 5
    run 251   CV Type ['KFold'] 5
    run 252   CV Type ['KFold'] 5
    run 253   CV Type ['KFold'] 5
    run 254   CV Type ['KFold'] 5
    run 255   CV Type ['KFold'] 5
    run 256   CV Type ['KFold'] 5
    run 257   CV Type ['KFold'] 5
    run 258   CV Type ['KFold'] 5
    run 259   CV Type ['KFold'] 5
    run 260   CV Type ['KFold'] 5
    run 261   CV Type ['KFold'] 5
    run 262   CV Type ['KFold'] 5
    run 263   CV Type ['KFold'] 5
    run 264   CV Type ['KFold'] 5
    run 265   CV Type ['KFold'] 5
    run 266   CV Type ['KFold'] 5
    run 267   CV Type ['KFold'] 5
    run 268   CV Type ['KFold'] 5
    run 269   CV Type ['KFold'] 5
    run 270   CV Type ['KFold'] 5
    run 271   CV Type ['KFold'] 5
    run 272   CV Type ['KFold'] 5
    run 273   CV Type ['KFold'] 5
    run 274   CV Type ['KFold'] 5
    run 275   

    run 135   CV Type ['KFold'] 5
    run 136   CV Type ['KFold'] 5
    run 137   CV Type ['KFold'] 5
    run 138   CV Type ['KFold'] 5
    run 139   CV Type ['KFold'] 5
    run 140   CV Type ['KFold'] 5
    run 141   CV Type ['KFold'] 5
    run 142   CV Type ['KFold'] 5
    run 143   CV Type ['KFold'] 5
    run 144   CV Type ['KFold'] 5
    run 145   CV Type ['KFold'] 5
    run 146   CV Type ['KFold'] 5
    run 147   CV Type ['KFold'] 5
    run 148   CV Type ['KFold'] 5
    run 149   CV Type ['KFold'] 5
    run 150   CV Type ['KFold'] 5
    run 151   CV Type ['KFold'] 5
    run 152   CV Type ['KFold'] 5
    run 153   CV Type ['KFold'] 5
    run 154   CV Type ['KFold'] 5
    run 155   CV Type ['KFold'] 5
    run 156   CV Type ['KFold'] 5
    run 157   CV Type ['KFold'] 5
    run 158   CV Type ['KFold'] 5
    run 159   CV Type ['KFold'] 5
    run 160   CV Type ['KFold'] 5
    run 161   CV Type ['KFold'] 5
    run 162   CV Type ['KFold'] 5
    run 163   CV Type ['KFold'] 5
    run 164   

    run 21   CV Type ['KFold'] 5
    run 22   CV Type ['KFold'] 5
    run 23   CV Type ['KFold'] 5
    run 24   CV Type ['KFold'] 5
    run 25   CV Type ['KFold'] 5
    run 26   CV Type ['KFold'] 5
    run 27   CV Type ['KFold'] 5
    run 28   CV Type ['KFold'] 5
    run 29   CV Type ['KFold'] 5
    run 30   CV Type ['KFold'] 5
    run 31   CV Type ['KFold'] 5
    run 32   CV Type ['KFold'] 5
    run 33   CV Type ['KFold'] 5
    run 34   CV Type ['KFold'] 5
    run 35   CV Type ['KFold'] 5
    run 36   CV Type ['KFold'] 5
    run 37   CV Type ['KFold'] 5
    run 38   CV Type ['KFold'] 5
    run 39   CV Type ['KFold'] 5
    run 40   CV Type ['KFold'] 5
    run 41   CV Type ['KFold'] 5
    run 42   CV Type ['KFold'] 5
    run 43   CV Type ['KFold'] 5
    run 44   CV Type ['KFold'] 5
    run 45   CV Type ['KFold'] 5
    run 46   CV Type ['KFold'] 5
    run 47   CV Type ['KFold'] 5
    run 48   CV Type ['KFold'] 5
    run 49   CV Type ['KFold'] 5
    run 50   CV Type ['KFold'] 5
    run 51

KeyboardInterrupt: 