In [1]:
%pylab inline
from glob import glob
from numpy import load
import pandas as pd
import numpy as np
import copy
from scipy.stats import skew, kurtosis

from time import time
import pickle as pkl
import os
import cv2
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

Populating the interactive namespace from numpy and matplotlib


In [2]:
import os
path = os.getcwd()
_uname = path.split('/')[2]
poverty_dir=f'/home/{_uname}/public/cs255-sp22-a00-public/poverty/'
image_dir=poverty_dir+'anon_images/'
image_dir

'/home/ans037/public/cs255-sp22-a00-public/poverty/anon_images/'

In [3]:
train_table=f'/home/{_uname}/public/Datasets_public/Final_Project_Data/train.csv'
df=pd.read_csv(train_table,index_col=0)
df.index=df['filename']

In [4]:
def getImage(image):
    M = np.load(image)
    l = [cv2.resize(M['x'][i, :, :], (20, 20)) for i in range(8)]
    return np.array(l)

In [5]:
def extract_features(band):
    features = {}
    features['mean'] = np.mean(band)
    features['std'] = np.std(band)
    features['skewness'] = skew(band)
    features['kurtosis'] = kurtosis(band)
    features['percentile_25'] = np.percentile(band, 25)
    features['percentile_50'] = np.percentile(band, 50)
    features['percentile_75'] = np.percentile(band, 75)
    return features

def process_outliers(x):
    _min = np.percentile(x, 0.1)
    _max = np.percentile(x,99.9)
        
    x = (x - _min)/(_max - _min)
    x = np.where(x > 1, 1, x)
    x = np.where(x < 0, 0, x)
    return x
    

def calculate_indices(df, train=True):
    if train:
        df = df.drop(['filename', 'wealthpooled'], axis=1)
    else:
        df = df.drop(['filename'], axis=1)
    
    df = df.reset_index(drop=True)
    band_names = ['Red', 'Green', 'Blue', 'NIR', 'SWIR1', 'SWIR2', 'TEMP1', 'NL']
    
    df_copy = df.copy()
    
    for i in range(len(band_names)):
        df_copy[band_names[i]] = df_copy.apply(lambda x: x['Image'][i, :, :], axis=1)

    # Calculate the bare soil indices
    df_copy['MBI'] = (df_copy['SWIR1'] - df_copy['SWIR1'] - df_copy['NIR']) / (df_copy['SWIR1'] + df_copy['SWIR2'] + df_copy['NIR']) + 0.5
    df_copy['BSI'] = (df_copy['SWIR1'] + df_copy['Red'] - df_copy['NIR'] - df_copy['Blue']) / (df_copy['SWIR1'] + df_copy['Red'] + df_copy['NIR'] + df_copy['Blue'])
    df_copy['NDSI1'] = (df_copy['SWIR1'] - df_copy['NIR']) / (df_copy['SWIR1'] + df_copy['NIR'])
    df_copy['NDSI2'] = (df_copy['SWIR1'] - df_copy['Green']) / (df_copy['SWIR1'] + df_copy['Green'])
    df_copy['BI'] = df_copy['Red'] + df_copy['SWIR1'] - df_copy['NIR']
    df_copy['DBSI'] = (df_copy['SWIR1'] - df_copy['Green']) / (df_copy['SWIR1'] + df_copy['Green']) - (df_copy['NIR'] - df_copy['Red']) / (df_copy['NIR'] + df_copy['Red'])

    df_copy['BAEI'] = (df_copy['Red'] + 0.3) / (df_copy['Green'] + df_copy['SWIR1'])
    df_copy['BUI'] = (df_copy['SWIR1'] - df_copy['NIR']) / (df_copy['SWIR1'] + df_copy['NIR']) - (df_copy['NIR'] - df_copy['Red']) / (df_copy['NIR'] + df_copy['Red'])
    df_copy['NBI'] = (df_copy['Red'] - df_copy['SWIR1']) / df_copy['NIR']
    df_copy['BRBA'] = df_copy['Red'] / df_copy['SWIR1']
    df_copy['IBI'] = (2 * df_copy['SWIR1'] / (df_copy['SWIR1'] + df_copy['NIR'])) - ((df_copy['NIR'] / (df_copy['NIR'] - df_copy['Red']) + df_copy['Green'] / (df_copy['Green'] + df_copy['SWIR1'])) / 2) * (df_copy['SWIR1'] / (df_copy['SWIR1'] + df_copy['NIR'])) + (df_copy['NIR'] / (df_copy['NIR'] - df_copy['Red']) + df_copy['Green'] / (df_copy['Green'] + df_copy['SWIR1']))
    df_copy['NDCCI'] = (df_copy['NIR'] - df_copy['Green']) / (df_copy['NIR'] + df_copy['Green'])

    # Vegetation indices
    df_copy['NDVI'] = (df_copy['NIR'] - df_copy['Red']) / (df_copy['NIR'] + df_copy['Red'])
    df_copy['SAVI'] = (df_copy['NIR'] - df_copy['Red']) / (df_copy['NIR'] + df_copy['Red'] + 0.5) * (1 + 0.5)
    df_copy['NDMI'] = (df_copy['NIR'] - df_copy['SWIR1']) / (df_copy['NIR'] + df_copy['SWIR1'])

    # Built-up area indices
    df_copy['DBI'] = (df_copy['Blue'] - df_copy['TEMP1']) / (df_copy['Blue'] + df_copy['TEMP1']) - (df_copy['NIR'] - df_copy['Red']) / (df_copy['NIR'] + df_copy['Red'])
    df_copy['NBAI'] = ((df_copy['SWIR2'] - df_copy['SWIR1']) / df_copy['Green']) / ((df_copy['SWIR2'] + df_copy['SWIR1']) / df_copy['Green'])
    df_copy['NDBI'] = (df_copy['SWIR1'] - df_copy['NIR']) / (df_copy['SWIR1'] + df_copy['NIR'])

    indices = ['MBI', 'BSI', 'NDSI1', 'NDSI2', 'BI', 'DBSI', 'BAEI', 'BUI', 'NBI', 'BRBA', 'IBI', 'NDCCI', 'NDVI', 'SAVI', 'NDMI', 'DBI', 'NBAI', 'NDBI']

    for feature in list(indices + band_names):
        df[f'{feature}_mean'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['mean'])
        df[f'{feature}_std'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['std'])
        # df[f'{feature}_skewness'] = df[f'{feature}'].apply(lambda x: extract_features(x)['skewness'])
        # df[f'{feature}_kurtosis'] = df[f'{feature}'].apply(lambda x: extract_features(x)['kurtosis'])
        df[f'{feature}_percentile_25'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_25'])
        df[f'{feature}_percentile_50'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_50'])
        df[f'{feature}_percentile_75'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_75'])

    # df = df.drop(band_names + indices, axis=1)
    df = df.drop('Image', axis=1)
    
    return df

In [6]:
pickle_file = 'data/model.pkl'
D = pkl.load(open(pickle_file,'rb'))

for k in D:
    globals()[k] = D[k]
scaling_mean = mean
scaling_std = std

In [7]:
bst_list = [x['bst'] for x in styled_logs[0]['log']]

In [8]:
# ## Iterate over test sets
folds=[{'in':'country_test_reduct.csv','out':'results_country.csv'},
      {'in':'random_test_reduct.csv','out':'results.csv'}]

In [9]:
for fold_i in range(len(folds)):
    fold=folds[fold_i]

    #load table entries
    test_csv = f'/home/{_uname}/public/Datasets_public/Final_Project_Data/{fold["in"]}'
    test = pd.read_csv(test_csv,index_col=0)
    test.index = test['filename']
    test.shape
    
    out = pd.DataFrame()
    out['filename'] = test['filename']
    out['urban'] = test['urban']
    out.set_index('filename', inplace=True)
    
    test['Image'] = test.apply(lambda x: getImage(image_dir + '/'+ x['filename']), axis=1)
    
    test = calculate_indices(test, train=False)
    
    X_test = test
    y_test = np.zeros(test.shape[0])
    
    dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)
    
    Preds = np.zeros([X_test.shape[0],len(bst_list)])
    
    for i in range(len(bst_list)):
        # Preds[:,i]=bst_list[i].predict(dtest,output_margin=True)
        bst = bst_list[i]
        y_pred = bst.predict(dtest,output_margin=True, ntree_limit=bst.best_ntree_limit)

        #y_pred = np.round(y_pred/(np.max(y_pred) - np.min(y_pred)),2)
        Preds[:,i] = y_pred
        
    #Preds=(Preds-scaling_mean)/scaling_std # apply overall score scaling
    
    _mean=np.mean(Preds,axis=1)
    _std=np.std(Preds,axis=1)
    
    pred_wo_abstention = (2 * (_mean > 0)) - 1

    # Apply threshold-based approach
    threshold = 0.5  # Adjust the threshold value as per your requirement
    uncertain_indices = np.abs(_mean) < threshold * np.abs(_std)
    pred_with_abstention = pred_wo_abstention.copy()
    pred_with_abstention[uncertain_indices] = 0
    
    if fold_i == 0:
        out['pred_with_abstention'] = pred_with_abstention
    else:
        out['pred_wo_abstention'] = pred_wo_abstention
    
    
    outFile=f'data/{fold["out"]}'
    out.to_csv(outFile)
    print('\n\n'+'-'*60)
    print(outFile)

  df[f'{feature}_std'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['std'])
  df[f'{feature}_percentile_25'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_25'])
  df[f'{feature}_percentile_50'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_50'])
  df[f'{feature}_percentile_75'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_75'])
  df[f'{feature}_mean'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['mean'])
  df[f'{feature}_std'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['std'])
  df[f'{feature}_percentile_25'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_25'])
  df[f'{feature}_percentile_50'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_50'])
  df[f'{feature}_percentile_75'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_75'])
  df[f'{feature}_mean'] = df_copy[f'{feature}'].apply(lambd



------------------------------------------------------------
data/results_country.csv


  df[f'{feature}_std'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['std'])
  df[f'{feature}_percentile_25'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_25'])
  df[f'{feature}_percentile_50'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_50'])
  df[f'{feature}_percentile_75'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_75'])
  df[f'{feature}_mean'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['mean'])
  df[f'{feature}_std'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['std'])
  df[f'{feature}_percentile_25'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_25'])
  df[f'{feature}_percentile_50'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_50'])
  df[f'{feature}_percentile_75'] = df_copy[f'{feature}'].apply(lambda x: extract_features(x)['percentile_75'])
  df[f'{feature}_mean'] = df_copy[f'{feature}'].apply(lambd



------------------------------------------------------------
data/results.csv
