## Implementing quantile binning - Method 1 - separate bin edges for each district each band

In [234]:
import warnings
warnings.filterwarnings('ignore')

In [235]:
import libtiff
from sklearn.preprocessing import KBinsDiscretizer
input_folder = "/Users/arpitjain/Downloads/SatPRo/2001_L7_data/districtTiff100_1"

In [241]:
for files in os.listdir(input_folder):
    print(files.split("@"))

['Hisar', '6', '80.tiff']
['Ambedkar Nagar', '9', '178.tiff']
['Bankura', '19', '339.pkl']
['Banas Kantha', '24', '469.tiff']
['Sahibganj', '20', '352.tiff']
['Panchkula', '6', '69.tiff']
['Kurung Kumey', '12', '256.pkl']
['Alappuzha', '32', '598.pkl']
['Samba', '1', '22.tiff']
['Hassan', '29', '574.tiff']
['Ahmadabad', '24', '474.tiff']
['Mamit', '15', '281.tiff']
['Udupi', '29', '569.tiff']
['Nalanda', '10', '229.tiff']
['Nawada', '10', '237.tiff']
['Davanagere', '29', '567.tiff']
['Dima Hasao', '18', '315.tiff']
['Chikkaballapura', '29', '582.pkl']
['Sahibzada Ajit Singh Nagar', '3', '52.tiff']
['Kullu', '2', '26.pkl']
['Dindori', '23', '453.tiff']
['Moradabad', '9', '135.tiff']
['Pashchim Champaran', '10', '203.tiff']
['Dhenkanal', '21', '383.tiff']
['Ratnagiri', '27', '528.tiff']
['Serchhip', '15', '285.tiff']
['Bhiwani', '6', '81.tiff']
['Mumbai Suburban', '27', '518.tiff']
['Akola', '27', '501.pkl']
['Rajkot', '24', '476.tiff']
['Baramula', '1', '8.tiff']
['Guna', '23', '458.pkl

In [236]:
def get_cols(bincount):
    cols = [['band_'+str(j+1)+'_'+str(i+1) for i in range(bincount)] for j in range(12)]
    cols = np.array(cols).reshape(1,-1)[0]
    cols = np.append(cols, 'census_code')
    return cols

In [237]:
def quantile_binning(band, n_bins):
    band_no = band.copy()
    band_no = band_no.flatten()
    band_no = band_no[band_no!=0] # Removing zero
    band_no = band_no[~np.isnan(band_no)] # Removing Nan
    band_no = band_no.reshape(-1,1)
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
    est.fit(band_no)
    bins = est.bin_edges_
    feature = np.histogram(band_no, bins=bins[0])[0]
    return feature

In [276]:
bin_list = [6,7,8,9,10,11,12,13,14,15]

In [277]:
skipped_files = []
for bincount in bin_list:
    print('****** Running for bins - ', bincount)
    
    
    df_list = []
    for files in os.listdir(input_folder):

        if '.DS_Store' in files:
            pass
        else:
            filepath = os.path.join(input_folder, files)
            district_code = files.split("@")[2].split(".")[0]
        print("       Running for district code - ",district_code)

        if '.tiff' in files:
            try:
                tif = libtiff.TIFF.open(filepath, mode='r')
                image = np.array(tif.read_image())

                band1 = image[:,:,0]  # Not takingt the 10th, i.e. BQA band
                band2 = image[:,:,1]
                band3 = image[:,:,2]
                band4 = image[:,:,3]
                band5 = image[:,:,4]
                band6 = image[:,:,5]
                band7 = image[:,:,6]
                band8 = image[:,:,7]
                band9 = image[:,:,8]
                bandndvi = np.array((band4-band3)/(band3+band4))
                bandndvi = np.nan_to_num(bandndvi)
                bandndwi = np.array((band2-band5)/(band2+band5))
                bandndwi = np.nan_to_num(bandndwi)
                bandndbi = np.array((band5-band4)/(band4+band5))
                bandndbi = np.nan_to_num(bandndbi)

                feature1 = quantile_binning(band1, bincount)
                feature2 = quantile_binning(band1, bincount)
                feature3 = quantile_binning(band1, bincount)
                feature4 = quantile_binning(band1, bincount)
                feature5 = quantile_binning(band1, bincount)
                feature6 = quantile_binning(band1, bincount)
                feature7 = quantile_binning(band1, bincount)
                feature8 = quantile_binning(band1, bincount)
                feature9 = quantile_binning(band1, bincount)
                featurendvi = quantile_binning(band1, bincount)
                featurendwi = quantile_binning(band1, bincount)
                featurendbi = quantile_binning(band1, bincount)

                all_features = np.hstack((feature1, feature2, feature3, feature4, feature5, feature6, feature7, feature8, feature9, featurendvi, featurendwi, featurendbi, district_code))
            except Exception as e:
                skipped_files.append(district_code)
                skipped_files.append(bincount)
                print('Error#####',e, district_code)
        
        elif '.pkl' in files:
            try:
                image = pickle.load(open(filepath,"rb"))

                band1 = image[0,:,:]  # Not takingt the 10th, i.e. BQA band
                band2 = image[1,:,:]
                band3 = image[2,:,:]
                band4 = image[3,:,:]
                band5 = image[4,:,:]
                band6 = image[5,:,:]
                band7 = image[6,:,:]
                band8 = image[7,:,:]
                band9 = image[8,:,:]
                bandndvi = np.array((band4-band3)/(band3+band4))
                bandndvi = np.nan_to_num(bandndvi)
                bandndwi = np.array((band2-band5)/(band2+band5))
                bandndwi = np.nan_to_num(bandndwi)
                bandndbi = np.array((band5-band4)/(band4+band5))
                bandndbi = np.nan_to_num(bandndbi)

                feature1 = quantile_binning(band1, bincount)
                feature2 = quantile_binning(band1, bincount)
                feature3 = quantile_binning(band1, bincount)
                feature4 = quantile_binning(band1, bincount)
                feature5 = quantile_binning(band1, bincount)
                feature6 = quantile_binning(band1, bincount)
                feature7 = quantile_binning(band1, bincount)
                feature8 = quantile_binning(band1, bincount)
                feature9 = quantile_binning(band1, bincount)
                featurendvi = quantile_binning(band1, bincount)
                featurendwi = quantile_binning(band1, bincount)
                featurendbi = quantile_binning(band1, bincount)

                all_features = np.hstack((feature1, feature2, feature3, feature4, feature5, feature6, feature7, feature8, feature9, featurendvi, featurendwi, featurendbi, district_code))
            
            except Exception as e:
                skipped_files.append(district_code)
                skipped_files.append(bincount)
                print('Error#####',e, district_code)
                
        df = pd.DataFrame(all_features)
        df = df.transpose()
        df_list.append(df)
    
    final_df = pd.concat(df_list)
    cols = get_cols(bincount)
    # Note band10 is ndvi, 11 is ndwi, 12 is ndbi
    final_df.columns = cols
    final_df.to_csv('Features_100m_quantile@'+str(bincount)+'.csv', index=False)

****** Running for bins -  6
       Running for district code -  80
       Running for district code -  178
       Running for district code -  339
       Running for district code -  469
       Running for district code -  352
       Running for district code -  69
       Running for district code -  256
       Running for district code -  598
       Running for district code -  22
       Running for district code -  574
       Running for district code -  474
       Running for district code -  281
       Running for district code -  569
       Running for district code -  229
       Running for district code -  237
       Running for district code -  567
       Running for district code -  315
       Running for district code -  582
       Running for district code -  52
       Running for district code -  26
       Running for district code -  453
       Running for district code -  135
       Running for district code -  203
       Running for district code -  383
       Running f

       Running for district code -  589
       Running for district code -  606
       Running for district code -  573
       Running for district code -  326
       Running for district code -  612
       Running for district code -  131
       Running for district code -  619
       Running for district code -  195
       Running for district code -  124
       Running for district code -  475
       Running for district code -  536
       Running for district code -  21
       Running for district code -  559
       Running for district code -  427
       Running for district code -  471
       Running for district code -  101
       Running for district code -  297
       Running for district code -  299
       Running for district code -  478
       Running for district code -  190
       Running for district code -  369
       Running for district code -  507
       Running for district code -  432
       Running for district code -  231
       Running for district code -  74
  

       Running for district code -  584
       Running for district code -  25
       Running for district code -  562
       Running for district code -  61
       Running for district code -  10
       Running for district code -  240
       Running for district code -  402
       Running for district code -  13
       Running for district code -  84
       Running for district code -  42
       Running for district code -  149
       Running for district code -  500
       Running for district code -  78
       Running for district code -  255
       Running for district code -  184
       Running for district code -  558
Error##### Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required. 558
       Running for district code -  397
       Running for district code -  4
       Running for district code -  41
       Running for district code -  435
       Running for district code -  265
       Running for district code -  294
       Running for district code -  4

       Running for district code -  219
       Running for district code -  433
       Running for district code -  585
       Running for district code -  171
       Running for district code -  607
       Running for district code -  565
       Running for district code -  348
       Running for district code -  552
       Running for district code -  106
       Running for district code -  109
       Running for district code -  423
       Running for district code -  89
       Running for district code -  334
       Running for district code -  289
       Running for district code -  456
       Running for district code -  535
       Running for district code -  627
       Running for district code -  389
       Running for district code -  359
       Running for district code -  329
       Running for district code -  168
       Running for district code -  12
       Running for district code -  185
       Running for district code -  419
       Running for district code -  211
  

       Running for district code -  194
       Running for district code -  625
       Running for district code -  79
       Running for district code -  270
       Running for district code -  257
       Running for district code -  119
       Running for district code -  225
       Running for district code -  445
       Running for district code -  516
       Running for district code -  262
       Running for district code -  53
       Running for district code -  563
       Running for district code -  634
       Running for district code -  358
       Running for district code -  422
       Running for district code -  91
       Running for district code -  639
Error##### Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required. 639
       Running for district code -  515
       Running for district code -  603
       Running for district code -  46
       Running for district code -  6
       Running for district code -  576
       Running for district code 

       Running for district code -  592
       Running for district code -  577
       Running for district code -  436
       Running for district code -  549
       Running for district code -  40
       Running for district code -  167
       Running for district code -  243
       Running for district code -  32
       Running for district code -  226
       Running for district code -  489
       Running for district code -  380
       Running for district code -  417
       Running for district code -  126
       Running for district code -  95
       Running for district code -  330
       Running for district code -  77
       Running for district code -  333
       Running for district code -  151
       Running for district code -  18
       Running for district code -  382
       Running for district code -  595
       Running for district code -  193
       Running for district code -  520
       Running for district code -  174
       Running for district code -  323
     

       Running for district code -  396
       Running for district code -  239
       Running for district code -  93
       Running for district code -  370
       Running for district code -  118
       Running for district code -  343
       Running for district code -  604
       Running for district code -  83
       Running for district code -  496
       Running for district code -  617
       Running for district code -  212
       Running for district code -  347
       Running for district code -  236
       Running for district code -  216
       Running for district code -  201
       Running for district code -  373
       Running for district code -  121
       Running for district code -  430
       Running for district code -  597
       Running for district code -  9
       Running for district code -  58
       Running for district code -  280
       Running for district code -  580
       Running for district code -  210
       Running for district code -  129
     

       Running for district code -  425
       Running for district code -  331
       Running for district code -  421
       Running for district code -  426
       Running for district code -  546
       Running for district code -  202
       Running for district code -  581
       Running for district code -  596
       Running for district code -  450
       Running for district code -  616
       Running for district code -  189
       Running for district code -  525
       Running for district code -  457
       Running for district code -  261
       Running for district code -  590
       Running for district code -  47
       Running for district code -  85
       Running for district code -  485
       Running for district code -  15
       Running for district code -  127
       Running for district code -  139
       Running for district code -  164
       Running for district code -  214
       Running for district code -  494
       Running for district code -  583
   

       Running for district code -  385
       Running for district code -  377
       Running for district code -  483
       Running for district code -  561
       Running for district code -  411
       Running for district code -  391
       Running for district code -  250
       Running for district code -  353
       Running for district code -  303
       Running for district code -  192
       Running for district code -  538
       Running for district code -  88
       Running for district code -  327
       Running for district code -  318
       Running for district code -  234
       Running for district code -  466
       Running for district code -  366
       Running for district code -  340
       Running for district code -  205
       Running for district code -  386
       Running for district code -  268
       Running for district code -  308
       Running for district code -  230
       Running for district code -  1
       Running for district code -  155
   

       Running for district code -  415
       Running for district code -  38
       Running for district code -  157
       Running for district code -  431
       Running for district code -  434
       Running for district code -  108
       Running for district code -  631
       Running for district code -  437
       Running for district code -  452
       Running for district code -  560
       Running for district code -  505
       Running for district code -  448
Error##### Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required. 448
       Running for district code -  105
       Running for district code -  539
       Running for district code -  413
       Running for district code -  360
       Running for district code -  200
       Running for district code -  408
       Running for district code -  67
       Running for district code -  221
       Running for district code -  110
       Running for district code -  498
       Running for district c

       Running for district code -  586
       Running for district code -  165
       Running for district code -  482
       Running for district code -  55
       Running for district code -  418
       Running for district code -  335
       Running for district code -  17
       Running for district code -  269
       Running for district code -  3
       Running for district code -  537
       Running for district code -  444
       Running for district code -  251
       Running for district code -  128
       Running for district code -  244
       Running for district code -  24
       Running for district code -  632
       Running for district code -  116
       Running for district code -  49
       Running for district code -  446
       Running for district code -  602
       Running for district code -  514
       Running for district code -  183
       Running for district code -  638
       Running for district code -  56
       Running for district code -  400
       

       Running for district code -  410
       Running for district code -  284
       Running for district code -  341
       Running for district code -  384
       Running for district code -  198
       Running for district code -  48
       Running for district code -  338
       Running for district code -  132
       Running for district code -  544
       Running for district code -  526
       Running for district code -  593
       Running for district code -  355
       Running for district code -  264
       Running for district code -  572
       Running for district code -  449
       Running for district code -  317
       Running for district code -  31
       Running for district code -  232
       Running for district code -  207
       Running for district code -  290
       Running for district code -  374
       Running for district code -  545
       Running for district code -  608
       Running for district code -  179
       Running for district code -  376
  

       Running for district code -  306
       Running for district code -  548
       Running for district code -  509
       Running for district code -  90
       Running for district code -  379
       Running for district code -  575
       Running for district code -  601
       Running for district code -  529
       Running for district code -  45
       Running for district code -  633
       Running for district code -  622
       Running for district code -  29
       Running for district code -  635
       Running for district code -  181
       Running for district code -  209
       Running for district code -  316
       Running for district code -  72
       Running for district code -  579
       Running for district code -  477
       Running for district code -  36
       Running for district code -  356
       Running for district code -  208
       Running for district code -  260
       Running for district code -  523
       Running for district code -  113
     

       Running for district code -  490
       Running for district code -  233
       Running for district code -  233
       Running for district code -  215
       Running for district code -  153
       Running for district code -  307
       Running for district code -  278
       Running for district code -  438
       Running for district code -  547
       Running for district code -  451
       Running for district code -  145
       Running for district code -  481
       Running for district code -  253
       Running for district code -  534
       Running for district code -  27
       Running for district code -  484
       Running for district code -  293
       Running for district code -  319
       Running for district code -  103
       Running for district code -  159
       Running for district code -  263
       Running for district code -  491
       Running for district code -  493
       Running for district code -  440
       Running for district code -  51
  

       Running for district code -  43
       Running for district code -  37
       Running for district code -  134
       Running for district code -  86
       Running for district code -  228
       Running for district code -  291
       Running for district code -  398
       Running for district code -  283
       Running for district code -  530
       Running for district code -  554
       Running for district code -  344
       Running for district code -  412
       Running for district code -  543
       Running for district code -  342
       Running for district code -  44
       Running for district code -  258
       Running for district code -  320
       Running for district code -  34
       Running for district code -  142
       Running for district code -  459
       Running for district code -  488
       Running for district code -  310
       Running for district code -  428
       Running for district code -  114
       Running for district code -  378
     

       Running for district code -  372
       Running for district code -  60
       Running for district code -  611
       Running for district code -  62
       Running for district code -  532
       Running for district code -  238
       Running for district code -  186
       Running for district code -  454
       Running for district code -  568
       Running for district code -  302
       Running for district code -  361
       Running for district code -  111
       Running for district code -  375
       Running for district code -  259
       Running for district code -  274
       Running for district code -  626
       Running for district code -  156
       Running for district code -  197
       Running for district code -  206
       Running for district code -  97
       Running for district code -  50
       Running for district code -  246
       Running for district code -  399
       Running for district code -  531
       Running for district code -  324
    

       Running for district code -  416
       Running for district code -  247
       Running for district code -  618
       Running for district code -  191
       Running for district code -  460
       Running for district code -  497
       Running for district code -  407
       Running for district code -  187
       Running for district code -  371
       Running for district code -  120
       Running for district code -  163
       Running for district code -  166
       Running for district code -  35
       Running for district code -  295
       Running for district code -  506
       Running for district code -  19
       Running for district code -  100
       Running for district code -  470
       Running for district code -  555
       Running for district code -  556
       Running for district code -  311
       Running for district code -  441
       Running for district code -  381
       Running for district code -  503
       Running for district code -  130
  

       Running for district code -  249
       Running for district code -  112
       Running for district code -  312
       Running for district code -  20
       Running for district code -  199
       Running for district code -  309
       Running for district code -  405
       Running for district code -  0
       Running for district code -  33
       Running for district code -  467
       Running for district code -  479
       Running for district code -  266
       Running for district code -  613
       Running for district code -  267
       Running for district code -  144
       Running for district code -  288
       Running for district code -  123
       Running for district code -  325
       Running for district code -  107
       Running for district code -  104
       Running for district code -  301
       Running for district code -  136
       Running for district code -  468
       Running for district code -  87
       Running for district code -  557
     

       Running for district code -  623
       Running for district code -  275
       Running for district code -  387
       Running for district code -  519
       Running for district code -  492
       Running for district code -  137
       Running for district code -  614
       Running for district code -  30
       Running for district code -  196
       Running for district code -  414
       Running for district code -  512
       Running for district code -  495
       Running for district code -  511
       Running for district code -  609
       Running for district code -  409
       Running for district code -  63
       Running for district code -  510
       Running for district code -  357
       Running for district code -  64
       Running for district code -  182
       Running for district code -  70
       Running for district code -  122
       Running for district code -  462
       Running for district code -  65
       Running for district code -  403
     

       Running for district code -  588
       Running for district code -  300
       Running for district code -  362
       Running for district code -  76
       Running for district code -  461
       Running for district code -  248
       Running for district code -  600
       Running for district code -  624
       Running for district code -  390
       Running for district code -  513
       Running for district code -  176
       Running for district code -  277
       Running for district code -  133
       Running for district code -  71
       Running for district code -  286
       Running for district code -  486
       Running for district code -  66
       Running for district code -  578
       Running for district code -  227
       Running for district code -  313
       Running for district code -  23
       Running for district code -  217
       Running for district code -  305
       Running for district code -  296
       Running for district code -  5
      

       Running for district code -  141
       Running for district code -  621
       Running for district code -  241
       Running for district code -  276
       Running for district code -  273
       Running for district code -  420
       Running for district code -  223
       Running for district code -  368
       Running for district code -  524
       Running for district code -  395
       Running for district code -  502
       Running for district code -  594
       Running for district code -  392
       Running for district code -  599
       Running for district code -  170
       Running for district code -  7
       Running for district code -  550
       Running for district code -  150
       Running for district code -  16
       Running for district code -  610
       Running for district code -  521
       Running for district code -  508
       Running for district code -  75
       Running for district code -  68
       Running for district code -  442
     

       Running for district code -  443
       Running for district code -  551
       Running for district code -  298
       Running for district code -  564
       Running for district code -  146
       Running for district code -  351
       Running for district code -  98
       Running for district code -  138
       Running for district code -  605
       Running for district code -  364
       Running for district code -  322
       Running for district code -  439
       Running for district code -  117
       Running for district code -  175
       Running for district code -  160
       Running for district code -  2
       Running for district code -  332
       Running for district code -  517
       Running for district code -  504
       Running for district code -  279
       Running for district code -  345
       Running for district code -  59
       Running for district code -  465
       Running for district code -  404
       Running for district code -  282
    

       Running for district code -  474
       Running for district code -  281
       Running for district code -  569
       Running for district code -  229
       Running for district code -  237
       Running for district code -  567
       Running for district code -  315
       Running for district code -  582
       Running for district code -  52
       Running for district code -  26
       Running for district code -  453
       Running for district code -  135
       Running for district code -  203
       Running for district code -  383
       Running for district code -  528
       Running for district code -  285
       Running for district code -  81
       Running for district code -  518
       Running for district code -  501
       Running for district code -  476
       Running for district code -  8
       Running for district code -  458
       Running for district code -  337
       Running for district code -  487
       Running for district code -  640
Error

       Running for district code -  559
       Running for district code -  427
       Running for district code -  471
       Running for district code -  101
       Running for district code -  297
       Running for district code -  299
       Running for district code -  478
       Running for district code -  190
       Running for district code -  369
       Running for district code -  507
       Running for district code -  432
       Running for district code -  231
       Running for district code -  74
       Running for district code -  245
       Running for district code -  218
       Running for district code -  328
       Running for district code -  162
       Running for district code -  94
       Running for district code -  566
       Running for district code -  527
       Running for district code -  169
       Running for district code -  287
       Running for district code -  615
       Running for district code -  406
       Running for district code -  73
   

       Running for district code -  78
       Running for district code -  255
       Running for district code -  184
       Running for district code -  558
Error##### Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required. 558
       Running for district code -  397
       Running for district code -  4
       Running for district code -  41
       Running for district code -  435
       Running for district code -  265
       Running for district code -  294
       Running for district code -  401
       Running for district code -  473
       Running for district code -  533
       Running for district code -  125
       Running for district code -  96
       Running for district code -  28
       Running for district code -  14
       Running for district code -  271
       Running for district code -  587
Error##### Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required. 587
       Running for district code -  570
       Running for di

       Running for district code -  334
       Running for district code -  289
       Running for district code -  456
       Running for district code -  535
       Running for district code -  627
       Running for district code -  389
       Running for district code -  359
       Running for district code -  329
       Running for district code -  168
       Running for district code -  12
       Running for district code -  185
       Running for district code -  419
       Running for district code -  211
       Running for district code -  346
       Running for district code -  92
       Running for district code -  213
       Running for district code -  188
****** Running for bins -  14
       Running for district code -  80
       Running for district code -  178
       Running for district code -  339
       Running for district code -  469
       Running for district code -  352
       Running for district code -  69
       Running for district code -  256
       Running

       Running for district code -  634
       Running for district code -  358
       Running for district code -  422
       Running for district code -  91
       Running for district code -  639
Error##### Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required. 639
       Running for district code -  515
       Running for district code -  603
       Running for district code -  46
       Running for district code -  6
       Running for district code -  576
       Running for district code -  350
       Running for district code -  222
       Running for district code -  571
       Running for district code -  11
       Running for district code -  455
       Running for district code -  589
       Running for district code -  606
       Running for district code -  573
       Running for district code -  326
       Running for district code -  612
       Running for district code -  131
       Running for district code -  619
       Running for district code

       Running for district code -  126
       Running for district code -  95
       Running for district code -  330
       Running for district code -  77
       Running for district code -  333
       Running for district code -  151
       Running for district code -  18
       Running for district code -  382
       Running for district code -  595
       Running for district code -  193
       Running for district code -  520
       Running for district code -  174
       Running for district code -  323
       Running for district code -  173
       Running for district code -  204
       Running for district code -  354
       Running for district code -  140
       Running for district code -  584
       Running for district code -  25
       Running for district code -  562
       Running for district code -  61
       Running for district code -  10
       Running for district code -  240
       Running for district code -  402
       Running for district code -  13
       

       Running for district code -  236
       Running for district code -  216
       Running for district code -  201
       Running for district code -  373
       Running for district code -  121
       Running for district code -  430
       Running for district code -  597
       Running for district code -  9
       Running for district code -  58
       Running for district code -  280
       Running for district code -  580
       Running for district code -  210
       Running for district code -  129
       Running for district code -  82
       Running for district code -  499
       Running for district code -  102
       Running for district code -  429
       Running for district code -  219
       Running for district code -  433
       Running for district code -  585
       Running for district code -  171
       Running for district code -  607
       Running for district code -  565
       Running for district code -  348
       Running for district code -  552
    

       Running for district code -  457
       Running for district code -  261
       Running for district code -  590
       Running for district code -  47
       Running for district code -  85
       Running for district code -  485
       Running for district code -  15
       Running for district code -  127
       Running for district code -  139
       Running for district code -  164
       Running for district code -  214
       Running for district code -  494
       Running for district code -  583
       Running for district code -  367
       Running for district code -  147
       Running for district code -  252
       Running for district code -  628
       Running for district code -  194
       Running for district code -  625
       Running for district code -  79
       Running for district code -  270
       Running for district code -  257
       Running for district code -  119
       Running for district code -  225
       Running for district code -  445
    

       Running for district code -  88
       Running for district code -  327
       Running for district code -  318
       Running for district code -  234
       Running for district code -  466
       Running for district code -  366
       Running for district code -  340
       Running for district code -  205
       Running for district code -  386
       Running for district code -  268
       Running for district code -  308
       Running for district code -  230
       Running for district code -  1
       Running for district code -  155
       Running for district code -  447
       Running for district code -  143
       Running for district code -  161
       Running for district code -  592
       Running for district code -  577
       Running for district code -  436
       Running for district code -  549
       Running for district code -  40
       Running for district code -  167
       Running for district code -  243
       Running for district code -  32
     

       Running for district code -  448
Error##### Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required. 448
       Running for district code -  105
       Running for district code -  539
       Running for district code -  413
       Running for district code -  360
       Running for district code -  200
       Running for district code -  408
       Running for district code -  67
       Running for district code -  221
       Running for district code -  110
       Running for district code -  498
       Running for district code -  220
       Running for district code -  463
       Running for district code -  591
       Running for district code -  396
       Running for district code -  239
       Running for district code -  93
       Running for district code -  370
       Running for district code -  118
       Running for district code -  343
       Running for district code -  604
       Running for district code -  83
       Running for district co

In [287]:
print(skipped_files)

['640', 6, '639', 6, '558', 6, '587', 6, '448', 6, '640', 7, '639', 7, '558', 7, '587', 7, '448', 7, '640', 8, '639', 8, '558', 8, '587', 8, '448', 8, '640', 9, '639', 9, '558', 9, '587', 9, '448', 9, '640', 10, '639', 10, '558', 10, '587', 10, '448', 10, '640', 11, '639', 11, '558', 11, '587', 11, '448', 11, '640', 12, '639', 12, '558', 12, '587', 12, '448', 12, '640', 13, '639', 13, '558', 13, '587', 13, '448', 13, '640', 14, '639', 14, '558', 14, '587', 14, '448', 14, '640', 15, '639', 15, '558', 15, '587', 15, '448', 15]


In [280]:
# feature

# f1 =quantile_binning(band1, 12)

# f2 = quantile_binning(band2, 12)

# f1.shape

# f2.shape

# pd.DataFrame(np.hstack((f1,f2,3))col).transpose()

# f1

# f2

# file = "/Users/arpitjain/Downloads/SatPRo/2001_L7_data/districtTiff100_1/South Andaman@35@640.tiff"

# tif = libtiff.TIFF.open(file, mode='r')
# image = np.array(tif.read_image())

# image.shape

# band1 = image[:,:,0]  # Not takingt the 10th, i.e. BQA band
# band2 = image[:,:,1]
# band3 = image[:,:,2]
# band4 = image[:,:,3]
# band5 = image[:,:,4]
# band6 = image[:,:,5]
# band7 = image[:,:,6]
# band8 = image[:,:,7]
# band9 = image[:,:,8]
# bandndvi = np.array((band4-band3)/(band3+band4))
# bandndvi = np.nan_to_num(bandndvi)
# bandndwi = np.array((band2-band5)/(band2+band5))
# bandndwi = np.nan_to_num(bandndwi)
# bandndbi = np.array((band5-band4)/(band4+band5))
# bandndbi = np.nan_to_num(bandndbi)

# band1.shape

# # quantile_binning(band1, bincount)

## Implementing quantile binning - Method 2 - common bin edge for each district (same for. each band, anuj's approach)

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import libtiff
from sklearn.preprocessing import KBinsDiscretizer
input_folder = "/Users/arpitjain/Downloads/SatPRo/2001_L7_data/districtTiff100_1"

In [3]:
def get_cols(bincount):
    cols = [['band_'+str(j+1)+'_'+str(i+1) for i in range(bincount)] for j in range(12)]
    cols = np.array(cols).reshape(1,-1)[0]
    cols = np.append(cols, 'census_code')
    return cols

In [4]:
def quantile_binning(bandlist, n_bins):
    band_no = bandlist.copy()
    band_no = band_no.flatten()
    band_no = band_no[band_no!=0] # Removing zero
    band_no = band_no[~np.isnan(band_no)] # Removing Nan
    band_no = band_no.reshape(-1,1)
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
    est.fit(band_no)
    bin_edge = est.bin_edges_
#     feature = np.histogram(band_no, bins=bin_edge[0])[0]
    return bin_edge

In [5]:
def get_feature(band, bin_edge):
    band_no = band.copy()
    band_no = band_no.flatten()
    band_no = band_no[band_no!=0] # Removing zero
    band_no = band_no[~np.isnan(band_no)] # Removing Nan
    band_no = band_no.reshape(-1,1)
    feature = np.histogram(band_no, bins=bin_edge[0])[0]
    return feature

In [6]:
def append(band, array):
    band = band.flatten()
    band = band[band!=0]  # Remove zero
    if (array is None):
        array = band
    else:
        array = np.append(array, band)
    return array

In [7]:
def preprocess(band1):
    band1 = band1.flatten()
    band1 = band1[band1!=0]
    band1 = band1[~np.isnan(band1)]
    return band1

In [8]:
bin_list = [5,6,7,8,9,10,11,12,13,14,15]

## Collecting band vise data for all districts 

In [9]:
band1_list = np.array([])
band2_list = np.array([])
band3_list = np.array([])
band4_list = np.array([])
band5_list = np.array([])
band6_list = np.array([])
band7_list = np.array([])
band8_list = np.array([])
band9_list = np.array([])
bandndvi_list = np.array([])
bandndwi_list = np.array([])
bandndbi_list = np.array([])
counter=0
for files in os.listdir(input_folder):
    if '.DS_Store' in files:
            pass
    else:
        filepath = os.path.join(input_folder, files)
        district_code = files.split("@")[2].split(".")[0]
    print("       Running for district code - ",district_code,'  ', counter)
    counter=counter+1

    if '.tiff' in files:
        try:
            tif = libtiff.TIFF.open(filepath, mode='r')
            image = np.array(tif.read_image())

            # Not takingt the 10th, i.e. BQA band
            band1_list = np.append(band1_list, preprocess(image[:,:,0]))
            band2_list = np.append(band2_list, preprocess(image[:,:,1]))
            band3_list = np.append(band3_list, preprocess(image[:,:,2]))
            band4_list = np.append(band4_list, preprocess(image[:,:,3]))
            band5_list = np.append(band5_list, preprocess(image[:,:,4]))
            band6_list = np.append(band6_list, preprocess(image[:,:,5]))
            band7_list = np.append(band7_list, preprocess(image[:,:,6]))
            band8_list = np.append(band8_list, preprocess(image[:,:,7]))
            band9_list = np.append(band9_list, preprocess(image[:,:,8]))

            band2 = image[:,:,1]
            band3 = image[:,:,2]
            band4 = image[:,:,3]
            band5 = image[:,:,4]

            bandndvi = np.array((band4-band3)/(band3+band4))
            bandndvi = np.nan_to_num(bandndvi)
            bandndvi_list = np.append(bandndvi_list, preprocess(bandndvi))
            bandndwi = np.array((band2-band5)/(band2+band5))
            bandndwi = np.nan_to_num(bandndwi)
            bandndwi_list = np.append(bandndwi_list, preprocess(bandndwi))
            bandndbi = np.array((band5-band4)/(band4+band5))
            bandndbi = np.nan_to_num(bandndbi)
            bandndbi_list = np.append(bandndbi_list, preprocess(bandndbi))

        except Exception as e:
            skipped_files.append(district_code)
            skipped_files.append(bincount)
            print('Error#####',e, district_code)

    elif '.pkl' in files:
        try:
            image = pickle.load(open(filepath,"rb"))

            # Not takingt the 10th, i.e. BQA band
            band1_list = np.append(band1_list, preprocess(image[0,:,:]))
            band2_list = np.append(band2_list, preprocess(image[1,:,:]))
            band3_list = np.append(band3_list, preprocess(image[2,:,:]))
            band4_list = np.append(band4_list, preprocess(image[3,:,:]))
            band5_list = np.append(band5_list, preprocess(image[4,:,:]))
            band6_list = np.append(band6_list, preprocess(image[5,:,:]))
            band7_list = np.append(band7_list, preprocess(image[6,:,:]))
            band8_list = np.append(band8_list, preprocess(image[7,:,:]))
            band9_list = np.append(band9_list, preprocess(image[8,:,:]))

            band2 = image[1,:,:]
            band3 = image[2,:,:]
            band4 = image[3,:,:]
            band5 = image[4,:,:]

            bandndvi = np.array((band4-band3)/(band3+band4))
            bandndvi = np.nan_to_num(bandndvi)
            bandndvi_list = np.append(bandndvi_list, preprocess(bandndvi))
            bandndwi = np.array((band2-band5)/(band2+band5))
            bandndwi = np.nan_to_num(bandndwi)
            bandndwi_list = np.append(bandndwi_list, preprocess(bandndwi))
            bandndbi = np.array((band5-band4)/(band4+band5))
            bandndbi = np.nan_to_num(bandndbi)
            bandndbi_list = np.append(bandndbi_list, preprocess(bandndbi))

        except Exception as e:
            skipped_files.append(district_code)
            skipped_files.append(bincount)
            print('Error#####',e, district_code)

# Now we have data of all the bins in the respective lists, we calculate bin edges

       Running for district code -  80    0
       Running for district code -  178    1
       Running for district code -  339    2
       Running for district code -  469    3
       Running for district code -  352    4
       Running for district code -  69    5
       Running for district code -  256    6
       Running for district code -  598    7
       Running for district code -  22    8
       Running for district code -  574    9
       Running for district code -  474    10
       Running for district code -  281    11
       Running for district code -  569    12
       Running for district code -  229    13
       Running for district code -  237    14
       Running for district code -  567    15
       Running for district code -  315    16
       Running for district code -  582    17
       Running for district code -  52    18
       Running for district code -  26    19
       Running for district code -  453    20
       Running for district code -  135    21
   

       Running for district code -  257    178
       Running for district code -  119    179
       Running for district code -  225    180
       Running for district code -  445    181
       Running for district code -  516    182
       Running for district code -  262    183
       Running for district code -  53    184
       Running for district code -  563    185
       Running for district code -  634    186
       Running for district code -  358    187
       Running for district code -  422    188
       Running for district code -  91    189
       Running for district code -  639    190
       Running for district code -  515    191
       Running for district code -  603    192
       Running for district code -  46    193
       Running for district code -  6    194
       Running for district code -  576    195
       Running for district code -  350    196
       Running for district code -  222    197
       Running for district code -  571    198
       Running for

       Running for district code -  561    353
       Running for district code -  411    354
       Running for district code -  391    355
       Running for district code -  250    356
       Running for district code -  353    357
       Running for district code -  303    358
       Running for district code -  192    359
       Running for district code -  538    360
       Running for district code -  88    361
       Running for district code -  327    362
       Running for district code -  318    363
       Running for district code -  234    364
       Running for district code -  466    365
       Running for district code -  366    366
       Running for district code -  340    367
       Running for district code -  205    368
       Running for district code -  386    369
       Running for district code -  268    370
       Running for district code -  308    371
       Running for district code -  230    372
       Running for district code -  1    373
       Running f

KeyboardInterrupt: 

In [10]:
# skipped_files = []
# for bincount in bin_list:
#     print('****** Running for bins - ', bincount)
    
#     band1_list = np.array([])
#     band2_list = np.array([])
#     band3_list = np.array([])
#     band4_list = np.array([])
#     band5_list = np.array([])
#     band6_list = np.array([])
#     band7_list = np.array([])
#     band8_list = np.array([])
#     band9_list = np.array([])
#     bandndvi_list = np.array([])
#     bandndwi_list = np.array([])
#     bandndbi_list = np.array([])

#     for files in os.listdir(input_folder):
#         if '.DS_Store' in files:
#                 pass
#         else:
#             filepath = os.path.join(input_folder, files)
#             district_code = files.split("@")[2].split(".")[0]
#         print("       Running for district code 1 - ",district_code)

#         if '.tiff' in files:
#             try:
#                 tif = libtiff.TIFF.open(filepath, mode='r')
#                 image = np.array(tif.read_image())

#                 # Not takingt the 10th, i.e. BQA band
#                 band1_list = np.append(band1_list, preprocess(image[:,:,0]))
#                 band2_list = np.append(band2_list, preprocess(image[:,:,1]))
#                 band3_list = np.append(band3_list, preprocess(image[:,:,2]))
#                 band4_list = np.append(band4_list, preprocess(image[:,:,3]))
#                 band5_list = np.append(band5_list, preprocess(image[:,:,4]))
#                 band6_list = np.append(band6_list, preprocess(image[:,:,5]))
#                 band7_list = np.append(band7_list, preprocess(image[:,:,6]))
#                 band8_list = np.append(band8_list, preprocess(image[:,:,7]))
#                 band9_list = np.append(band9_list, preprocess(image[:,:,8]))
                
#                 band2 = image[:,:,1]
#                 band3 = image[:,:,2]
#                 band4 = image[:,:,3]
#                 band5 = image[:,:,4]

#                 bandndvi = np.array((band4-band3)/(band3+band4))
#                 bandndvi = np.nan_to_num(bandndvi)
#                 bandndvi_list = np.append(bandndvi_list, preprocess(bandndvi))
#                 bandndwi = np.array((band2-band5)/(band2+band5))
#                 bandndwi = np.nan_to_num(bandndwi)
#                 bandndwi_list = np.append(bandndwi_list, preprocess(bandndwi))
#                 bandndbi = np.array((band5-band4)/(band4+band5))
#                 bandndbi = np.nan_to_num(bandndbi)
#                 bandndbi_list = np.append(bandndbi_list, preprocess(bandndbi))

#             except Exception as e:
#                 skipped_files.append(district_code)
#                 skipped_files.append(bincount)
#                 print('Error#####',e, district_code)

#         elif '.pkl' in files:
#             try:
#                 image = pickle.load(open(filepath,"rb"))

#                 # Not takingt the 10th, i.e. BQA band
#                 band1_list = np.append(band1_list, preprocess(image[0,:,:]))
#                 band2_list = np.append(band2_list, preprocess(image[1,:,:]))
#                 band3_list = np.append(band3_list, preprocess(image[2,:,:]))
#                 band4_list = np.append(band4_list, preprocess(image[3,:,:]))
#                 band5_list = np.append(band5_list, preprocess(image[4,:,:]))
#                 band6_list = np.append(band6_list, preprocess(image[5,:,:]))
#                 band7_list = np.append(band7_list, preprocess(image[6,:,:]))
#                 band8_list = np.append(band8_list, preprocess(image[7,:,:]))
#                 band9_list = np.append(band9_list, preprocess(image[8,:,:]))
                
#                 band2 = image[1,:,:]
#                 band3 = image[2,:,:]
#                 band4 = image[3,:,:]
#                 band5 = image[4,:,:]

#                 bandndvi = np.array((band4-band3)/(band3+band4))
#                 bandndvi = np.nan_to_num(bandndvi)
#                 bandndvi_list = np.append(bandndvi_list, preprocess(bandndvi))
#                 bandndwi = np.array((band2-band5)/(band2+band5))
#                 bandndwi = np.nan_to_num(bandndwi)
#                 bandndwi_list = np.append(bandndwi_list, preprocess(bandndwi))
#                 bandndbi = np.array((band5-band4)/(band4+band5))
#                 bandndbi = np.nan_to_num(bandndbi)
#                 bandndbi_list = np.append(bandndbi_list, preprocess(bandndbi))

#             except Exception as e:
#                 skipped_files.append(district_code)
#                 skipped_files.append(bincount)
#                 print('Error#####',e, district_code)

#     # Now we have data of all the bins in the respective lists, we calculate bin edges
#     band1_edge = quantile_binning(band1_list, bincount)
#     band2_edge = quantile_binning(band2_list, bincount)
#     band3_edge = quantile_binning(band3_list, bincount)
#     band4_edge = quantile_binning(band4_list, bincount)
#     band5_edge = quantile_binning(band5_list, bincount)
#     band6_edge = quantile_binning(band6_list, bincount)
#     band7_edge = quantile_binning(band7_list, bincount)
#     band8_edge = quantile_binning(band8_list, bincount)
#     band9_edge = quantile_binning(band9_list, bincount)
#     bandndwi_edge = quantile_binning(bandndwi_list, bincount)
#     bandndvi_edge = quantile_binning(bandndvi_list, bincount)
#     bandndbi_edge = quantile_binning(bandndbi_list, bincount)
#     print('DONE CREATING THE EDGES')

#     # Now we'll use these bin edges to calculate bins
#     df_list = []
#     for files in os.listdir(input_folder):
#         if '.DS_Store' in files:
#                 pass
#         else:
#             filepath = os.path.join(input_folder, files)
#             district_code = files.split("@")[2].split(".")[0]
#         print("       Running for district code 2 - ",district_code)

#         if '.tiff' in files:
#             try:
#                 tif = libtiff.TIFF.open(filepath, mode='r')
#                 image = np.array(tif.read_image())

#                 # Not takingt the 10th, i.e. BQA band
#                 band1 = image[:,:,0]
#                 band2 = image[:,:,1]
#                 band3 = image[:,:,2]
#                 band4 = image[:,:,3]
#                 band5 = image[:,:,4]
#                 band6 = image[:,:,5]
#                 band7 = image[:,:,6]
#                 band8 = image[:,:,7]
#                 band9 = image[:,:,8]
#                 bandndvi = np.array((band4-band3)/(band3+band4))
#                 bandndvi = np.nan_to_num(bandndvi)
#                 bandndwi = np.array((band2-band5)/(band2+band5))
#                 bandndwi = np.nan_to_num(bandndwi)
#                 bandndbi = np.array((band5-band4)/(band4+band5))
#                 bandndbi = np.nan_to_num(bandndbi)

#                 feature1 = get_feature(band1, band1_edge)
#                 feature2 = get_feature(band2, band2_edge)
#                 feature3 = get_feature(band3, band3_edge)
#                 feature4 = get_feature(band4, band4_edge)
#                 feature5 = get_feature(band5, band5_edge)
#                 feature6 = get_feature(band6, band6_edge)
#                 feature7 = get_feature(band7, band7_edge)
#                 feature8 = get_feature(band8, band8_edge)
#                 feature9 = get_feature(band9, band9_edge)
#                 featurendvi = get_feature(bandndvi, bandndvi_edge)
#                 featurenndwi = get_feature(bandnndwi, bandnndwi_edge)
#                 featurenndbi = get_feature(bandnndbi, bandnndbi_edge)

#                 all_features = np.hstack((feature1, feature2, feature3, feature4, feature5, feature6, feature7, feature8, feature9, featurendvi, featurendwi, featurendbi, district_code))

#             except Exception as e:
#                 skipped_files.append(district_code)
#                 skipped_files.append(bincount)
#                 print('Error#####',e, district_code)

#         elif '.pkl' in files:
#             try:
#                 image = pickle.load(open(filepath,"rb"))

#                 # Not takingt the 10th, i.e. BQA band
#                 band1 = image[0,:,:]
#                 band2 = image[1,:,:]
#                 band3 = image[2,:,:]
#                 band4 = image[3,:,:]
#                 band5 = image[4,:,:]
#                 band6 = image[5,:,:]
#                 band7 = image[6,:,:]
#                 band8 = image[7,:,:]
#                 band9 = image[8,:,:]
#                 bandndvi = np.array((band4-band3)/(band3+band4))
#                 bandndvi = np.nan_to_num(bandndvi)
#                 bandndwi = np.array((band2-band5)/(band2+band5))
#                 bandndwi = np.nan_to_num(bandndwi)
#                 bandndbi = np.array((band5-band4)/(band4+band5))
#                 bandndbi = np.nan_to_num(bandndbi)

#                 feature1 = get_feature(band1, band1_edge)
#                 feature2 = get_feature(band2, band2_edge)
#                 feature3 = get_feature(band3, band3_edge)
#                 feature4 = get_feature(band4, band4_edge)
#                 feature5 = get_feature(band5, band5_edge)
#                 feature6 = get_feature(band6, band6_edge)
#                 feature7 = get_feature(band7, band7_edge)
#                 feature8 = get_feature(band8, band8_edge)
#                 feature9 = get_feature(band9, band9_edge)
#                 featurendvi = get_feature(bandndvi, bandndvi_edge)
#                 featurenndwi = get_feature(bandnndwi, bandnndwi_edge)
#                 featurenndbi = get_feature(bandnndbi, bandnndbi_edge)

#                 all_features = np.hstack((feature1, feature2, feature3, feature4, feature5, feature6, feature7, feature8, feature9, featurendvi, featurendwi, featurendbi, district_code))

#             except Exception as e:
#                 skipped_files.append(district_code)
#                 skipped_files.append(bincount)
#                 print('Error#####',e, district_code)

#         df = pd.DataFrame(all_features)
#         df = df.transpose()
#         df_list.append(df)

#     final_df = pd.concat(df_list)
#     cols = get_cols(bincount)
#     # Note band10 is ndvi, 11 is ndwi, 12 is ndbi
#     final_df.columns = cols
#     final_df.to_csv('Features_100m_quantile@'+str(bincount)+'.csv', index=False)

****** Running for bins -  5
       Running for district code 1 -  80
       Running for district code 1 -  178
       Running for district code 1 -  339
       Running for district code 1 -  469
       Running for district code 1 -  352
       Running for district code 1 -  69
       Running for district code 1 -  256
       Running for district code 1 -  598
       Running for district code 1 -  22
       Running for district code 1 -  574
       Running for district code 1 -  474
       Running for district code 1 -  281
       Running for district code 1 -  569
       Running for district code 1 -  229


KeyboardInterrupt: 