## About this notebook

Cross-sectional, district-level models are run using the features generated by Arpit. For every indicator we have around 10 feature files (bin counts 5-15).

We follow a 5-fold cross-validation approach and take the mean of the weighted F1-score.

Finally, all scores are reported for the best parameters.

Not running MLP for this run

In [1]:
import numpy as np                               
import pandas as pd                              
import csv
import time
import keras
import pprint
import copy
import random
import math
import itertools
from operator import itemgetter
from imblearn.over_sampling import SMOTE
import pickle
import os

import logging
import warnings

# Blanket-suppress Python warnings (mainly statsmodels and tensorflow/pandas
# version notices). Ideally these would be reviewed rather than ignored.
warnings.filterwarnings("ignore")

# Set the root logger's level to CRITICAL.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

# Seed NumPy's and the stdlib's global random generators with the same value.
np.random.seed(5)
random.seed(5)

import src
import specs

# all the specs
from specs.mlSpecs import *
from specs.mlDrillSpecs import *
from specs.ensembleSpecs import *
# from specs.statsSpecs import *

# all the classes and functions for dictionaries
from src.scores import *

from src.models.machineLearning.randomForest import randomForest
from src.models.machineLearning.xgBoost import xgBoost
from src.models.machineLearning.svc import svc
from src.models.machineLearning.adaBoost import adaBoost
from src.models.machineLearning.bagging import bagging
from src.models.machineLearning.gradientBoosting import gradientBoosting
from src.models.machineLearning.knn import knn
from src.models.machineLearning.mlp import mlp
from src.models.machineLearning.ensembleMLP import ensembleMLP

from src.randomizedSearchSpecial import *

#all other functions
from src.dataPreprocessingUtils import *
from src.selectorFunctions import *
from src.mlPrediction import *
from src.combining import *
from src.randomizedSearch import *

Using TensorFlow backend.


In [2]:
# TODO: scores_dict, models_dict and special_specs_functions should live in a
# separate file that the notebook loads instead of being defined inline.
#
# scores_dict maps a metric name to its scoring callable ("function") and a
# "type" tag that is either "score" or "error".
scores_dict = {
    metric_name: {"function": metric_fn, "type": "score"}
    for metric_name, metric_fn in (
        ("f1_weighted", f1_weighted),
        ("f1_macro", f1_macro),
        ("f1_micro", f1_micro),
        ("accuracy_normal", accuracy_normal),
        ("accuracy_weighted", accuracy_weighted),
        ("accuracy_classwise", accuracy_classwise),
        ("precision_weighted", precision_weighted),
        ("precision_macro", precision_macro),
        ("precision_micro", precision_micro),
        ("recall_weighted", recall_weighted),
        ("recall_macro", recall_macro),
        ("recall_micro", recall_micro),
        # Disabled metrics:
        # ("roc_auc_weighted", roc_auc_weighted),
        # ("roc_auc_macro", roc_auc_macro),
        # ("roc_auc_micro", roc_auc_micro),
    )
}
# These entries carry only a "type" tag and no callable.
scores_dict.update({
    "categorical_crossentropy": {"type": "error"},
    "accuracy": {"type": "score"},
    "acc": {"type": "score"},
})
# Registry of the models to run: model name -> {"class", "type"}.
# Entries that are commented out are disabled for this run.
models_dict = dict([
    # ("randomForest", {"class": randomForest, "type": "machine_learning"}),
    ("xgBoost", {"class": xgBoost, "type": "machine_learning"}),
    # ("svc", {"class": svc, "type": "machine_learning"}),
    # ("bagging", {"class": bagging, "type": "machine_learning"}),
    # ("adaBoost", {"class": adaBoost, "type": "machine_learning"}),
    # ("gradientBoosting", {"class": gradientBoosting, "type": "machine_learning"}),
    # ("knn", {"class": knn, "type": "machine_learning"}),
    # ("mlp", {"class": mlp, "type": "machine_learning"}),
])
# Model name -> special-specs function. Every model starts at None (no special
# function); the listed models are then pointed at multipleSpecialRFSpecs.
special_specs_functions = dict.fromkeys(
    ('randomForest', 'mlp', 'xgBoost', 'svc',
     'adaBoost', 'gradientBoosting', 'bagging', 'knn')
)
special_specs_functions.update(dict.fromkeys(
    ('randomForest', 'svc', 'adaBoost', 'gradientBoosting', 'bagging'),
    multipleSpecialRFSpecs,
))

In [3]:
# Show which models are enabled in models_dict for this run.
print('Models that will be runnning...\n-------------------------------')
pprint.pprint(models_dict)

Models that will be runnning...
-------------------------------
{'xgBoost': {'class': <class 'src.models.machineLearning.xgBoost.xgBoost'>,
             'type': 'machine_learning'}}


In [4]:
# TODO: move this into a separate function or file.
# To avoid bugs that are hard to trace later, existing values should ideally not
# be replaced here, only new ones added. Note that dict.update() overwrites any
# of the keys below that a model's spec dict already contains.
#
# The rows are rebuilt on every iteration so that each model's spec dict gets
# its own fresh nested dicts and lists (nothing is shared between models).
for key in specifications_master_dict:
    _common_spec_rows = (
        # (spec name, value_type, values)
        ('n_randomized_search', 'constant', [300]),
        ('test_ratio', 'constant', [0.0]),
        ('y_col', 'constant', []),
        ('cols_to_drop', 'constant', [[]]),
        ('ensemble', 'constant', [False]),
        # NOTE(review): 'trail' may be a misspelling of 'trial'; kept as-is
        # because the key may be read under this name elsewhere.
        ('trail', 'categories', [0]),
        ('n_cols_dropped', 'constant', [0]),
        ('current_date_col', 'constant', []),
        ('no_steps_ahead', 'constant', [0]),
        ('top_n_models_drill', 'constant', [5]),
        ('top_n_models_pred', 'constant', [5]),
        ('apply_smote', 'constant', [True]),  # can be True or False
        ('time_cols', 'constant', [[]]),
        ('n_drill_search', 'constant', [[100]]),
        # cross_validation can take the values KFold, LeaveOneOut,
        # train_val_holdout, LeavePOut. For KFold and LeavePOut the additional
        # parameters k and p are needed (kFold_splits / pFold_splits below).
        ('cross_validation', 'constant', [['KFold']]),
        ('kFold_splits', 'categories', [5]),
        ('pFold_splits', 'categories', [3, 4, 5, 10, 15, 20]),
        ('feature_cols', 'constant', []),
    )
    specifications_master_dict[key].update(
        (spec_name, {'value_type': value_type, 'values': values})
        for spec_name, value_type, values in _common_spec_rows
    )
# pprint.pprint(specifications_master_dict)
print('Done Updating the Common Specs')

Done Updating the Common Specs


In [5]:
# pprint.pprint(specifications_master_dict)

In [6]:
# def apply_smote(data, feature_cols, target):
#     '''
#     Input:
#     data - the original dataframe
#     feature_cols - the feature columns (list of columns)
#     target - the target column (string value)
#     '''
#     sm = SMOTE(random_state=42)
#     features, targets = sm.fit_resample(data[feature_cols],data[target])
#     feature_df = pd.DataFrame(features, columns=feature_cols)
#     target_df = pd.DataFrame(targets, columns=[target])
#     output = pd.concat([feature_df, target_df], axis=1)
#     # Shuffling dataset
#     output = output.sample(frac=1).reset_index(drop=True)
#     return output

## All indicators - All bincount

In [7]:
# Path of the feature CSV. The default is the original author's local path;
# set the SATPRO_FEATURES_CSV environment variable to point at the file on
# another machine without editing the notebook.
filepath = os.environ.get(
    "SATPRO_FEATURES_CSV",
    "/Users/arpitjain/Downloads/SatPRo/2001_L7_data/BinsData_AnujMethod/2001_Anuj_Method/Features_100m_quantile@8.csv",
)

In [8]:
# Load the feature table from the CSV at `filepath`.
df = pd.read_csv(filepath)

In [9]:
# List the feature-table columns to confirm what was loaded.
print(df.columns)

Index(['band_1_1', 'band_1_2', 'band_1_3', 'band_1_4', 'band_1_5', 'band_1_6',
       'band_1_7', 'band_1_8', 'band_2_1', 'band_2_2', 'band_2_3', 'band_2_4',
       'band_2_5', 'band_2_6', 'band_2_7', 'band_2_8', 'band_3_1', 'band_3_2',
       'band_3_3', 'band_3_4', 'band_3_5', 'band_3_6', 'band_3_7', 'band_3_8',
       'band_4_1', 'band_4_2', 'band_4_3', 'band_4_4', 'band_4_5', 'band_4_6',
       'band_4_7', 'band_4_8', 'band_5_1', 'band_5_2', 'band_5_3', 'band_5_4',
       'band_5_5', 'band_5_6', 'band_5_7', 'band_5_8', 'band_6_1', 'band_6_2',
       'band_6_3', 'band_6_4', 'band_6_5', 'band_6_6', 'band_6_7', 'band_6_8',
       'band_7_1', 'band_7_2', 'band_7_3', 'band_7_4', 'band_7_5', 'band_7_6',
       'band_7_7', 'band_7_8', 'band_8_1', 'band_8_2', 'band_8_3', 'band_8_4',
       'band_8_5', 'band_8_6', 'band_8_7', 'band_8_8', 'band_9_1', 'band_9_2',
       'band_9_3', 'band_9_4', 'band_9_5', 'band_9_6', 'band_9_7', 'band_9_8',
       'band_10_1', 'band_10_2', 'band_10_3', 'band_

In [10]:
# Load the district ground-truth table. The default is the original author's
# local path; set SATPRO_GROUND_TRUTH_CSV to read the file from elsewhere.
ground_truth = pd.read_csv(
    os.environ.get(
        "SATPRO_GROUND_TRUTH_CSV",
        "/Users/arpitjain/Downloads/SatPRo/District - Ground Truth - 2011&2001.csv",
    )
)
# Keep only the district identifier and the six 2001 indicator columns.
cols = ['census_code','MSW_2001','BF_2001','MSL_2001', 'FC_2001','CHH_2001','ASSET_2001']
ground_truth = ground_truth[cols]

In [11]:
# Left-join the feature table onto the ground truth by district code, so every
# ground-truth row is kept even when it has no matching feature row.
data = pd.merge(left=ground_truth, right=df, how='left', on='census_code')

In [12]:
# Drop every row that contains a missing value. The name is rebound to the
# result instead of mutating the frame in place (avoids inplace=True).
data = data.dropna()

In [13]:
# Inspect the column layout of the merged frame.
print(data.columns)

Index(['census_code', 'MSW_2001', 'BF_2001', 'MSL_2001', 'FC_2001', 'CHH_2001',
       'ASSET_2001', 'band_1_1', 'band_1_2', 'band_1_3',
       ...
       'band_11_7', 'band_11_8', 'band_12_1', 'band_12_2', 'band_12_3',
       'band_12_4', 'band_12_5', 'band_12_6', 'band_12_7', 'band_12_8'],
      dtype='object', length=103)


In [14]:
# Columns from position 7 onward (after census_code and the six indicator
# columns), followed by the overall shape of the frame.
print(data.columns[7:], '---------', data.shape, sep='\n')

Index(['band_1_1', 'band_1_2', 'band_1_3', 'band_1_4', 'band_1_5', 'band_1_6',
       'band_1_7', 'band_1_8', 'band_2_1', 'band_2_2', 'band_2_3', 'band_2_4',
       'band_2_5', 'band_2_6', 'band_2_7', 'band_2_8', 'band_3_1', 'band_3_2',
       'band_3_3', 'band_3_4', 'band_3_5', 'band_3_6', 'band_3_7', 'band_3_8',
       'band_4_1', 'band_4_2', 'band_4_3', 'band_4_4', 'band_4_5', 'band_4_6',
       'band_4_7', 'band_4_8', 'band_5_1', 'band_5_2', 'band_5_3', 'band_5_4',
       'band_5_5', 'band_5_6', 'band_5_7', 'band_5_8', 'band_6_1', 'band_6_2',
       'band_6_3', 'band_6_4', 'band_6_5', 'band_6_6', 'band_6_7', 'band_6_8',
       'band_7_1', 'band_7_2', 'band_7_3', 'band_7_4', 'band_7_5', 'band_7_6',
       'band_7_7', 'band_7_8', 'band_8_1', 'band_8_2', 'band_8_3', 'band_8_4',
       'band_8_5', 'band_8_6', 'band_8_7', 'band_8_8', 'band_9_1', 'band_9_2',
       'band_9_3', 'band_9_4', 'band_9_5', 'band_9_6', 'band_9_7', 'band_9_8',
       'band_10_1', 'band_10_2', 'band_10_3', 'band_

In [15]:
# Display the column index of `data` as the cell's output.
data.columns

Index(['census_code', 'MSW_2001', 'BF_2001', 'MSL_2001', 'FC_2001', 'CHH_2001',
       'ASSET_2001', 'band_1_1', 'band_1_2', 'band_1_3',
       ...
       'band_11_7', 'band_11_8', 'band_12_1', 'band_12_2', 'band_12_3',
       'band_12_4', 'band_12_5', 'band_12_6', 'band_12_7', 'band_12_8'],
      dtype='object', length=103)

In [16]:
# For each indicator: point the shared model specs at that indicator as the
# target, run the configured models, print the top xgBoost scores and pickle
# the top specs to '<target>AnujMethod2001_XGB_top_score_specs@8.pkl'.
indicators = ['BF_2001','MSL_2001', 'FC_2001', 'CHH_2001', 'ASSET_2001', 'MSW_2001']
for target in indicators:
    print('#### Running for', target)
    # Every other indicator plus the district identifier.
    drop_cols = [i for i in indicators if i != target]
    drop_cols.append('census_code')

    info_dict = {
        'target': target,
        'drop_cols': drop_cols,
        # Columns from position 7 onward; the upper bound 175 is simply clipped
        # when the frame has fewer columns (103 in the output shown earlier).
        'features': data.columns[7:175].tolist(),
    }

    print(info_dict['target'], info_dict['drop_cols'], info_dict['features'])
    for key in specifications_master_dict.keys():
        specifications_master_dict[key].update({
            'y_col': {
                'value_type': 'constant',
                'values': [info_dict['target']]},
            # NOTE(review): the drop-column list is stored under
            # 'current_date_col' while 'cols_to_drop' is left untouched;
            # confirm against the project code that this is intended.
            'current_date_col': {
                'value_type': 'constant',
                'values': drop_cols},
            'feature_cols': {
                'value_type': 'constant',
                'values': [info_dict['features']]},
        })

    # NOTE(review): the meaning of the trailing False argument is not visible
    # in this notebook; check callModels' signature.
    model_specs_vs_score = callModels(models_dict, scores_dict,specifications_master_dict, data, False)
    top_scores_specs = topSpecs(model_specs_vs_score, scores_dict,get_train_score=True)
    print('------------------------')
    # Only xgBoost results are printed; other enabled models are not shown.
    print(top_scores_specs['xgBoost']['val_scores'])
    print(top_scores_specs['xgBoost']['train_scores'])
    # Use a context manager so the pickle file is flushed and closed even if
    # dumping fails (the previous bare open() never closed the handle).
    with open(target + 'AnujMethod2001_XGB_top_score_specs@8.pkl', 'wb') as pkl_file:
        pickle.dump(top_scores_specs, pkl_file)
    print('------------------------')

#### Running for BF_2001
<class 'list'>
BF_2001 ['MSL_2001', 'FC_2001', 'CHH_2001', 'ASSET_2001', 'MSW_2001', 'census_code'] ['band_1_1', 'band_1_2', 'band_1_3', 'band_1_4', 'band_1_5', 'band_1_6', 'band_1_7', 'band_1_8', 'band_2_1', 'band_2_2', 'band_2_3', 'band_2_4', 'band_2_5', 'band_2_6', 'band_2_7', 'band_2_8', 'band_3_1', 'band_3_2', 'band_3_3', 'band_3_4', 'band_3_5', 'band_3_6', 'band_3_7', 'band_3_8', 'band_4_1', 'band_4_2', 'band_4_3', 'band_4_4', 'band_4_5', 'band_4_6', 'band_4_7', 'band_4_8', 'band_5_1', 'band_5_2', 'band_5_3', 'band_5_4', 'band_5_5', 'band_5_6', 'band_5_7', 'band_5_8', 'band_6_1', 'band_6_2', 'band_6_3', 'band_6_4', 'band_6_5', 'band_6_6', 'band_6_7', 'band_6_8', 'band_7_1', 'band_7_2', 'band_7_3', 'band_7_4', 'band_7_5', 'band_7_6', 'band_7_7', 'band_7_8', 'band_8_1', 'band_8_2', 'band_8_3', 'band_8_4', 'band_8_5', 'band_8_6', 'band_8_7', 'band_8_8', 'band_9_1', 'band_9_2', 'band_9_3', 'band_9_4', 'band_9_5', 'band_9_6', 'band_9_7', 'band_9_8', 'band_10_1

KeyboardInterrupt: 