## Package Imports

In [28]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import seaborn as sns
from os import listdir
from os.path import isfile, join
from sklearn.metrics import f1_score
import random
from statistics import mean 

## Confusion Matrix Image Code

In [24]:
# Confusion matrix code
from sklearn.metrics import confusion_matrix
from itertools import product
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting normalize=True.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        #print("Normalized confusion matrix")
    else:
        print('')
        #print('Confusion matrix, without normalization')

    #print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

## Reading the ground truth labels and getting all the monthly bucket data

In [27]:
mypath='normal_csvFiles/normal_csv/'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# print(onlyfiles)
labels_2011=pd.read_csv('2011_label.csv')
array1=(labels_2011['EMP_2011'].values)
df_emp_to_number={'Unemployment':1,'Agricultural':2,'Non Agri':3}
array2=np.array([df_emp_to_number[t] for t in array1])
labels_2011['numEMP_2011']=array2
trainCols=['light_'+str(i) for i in range(0,101,1)]
predictCols=['MSW_2011','BF_2011','MSL_2011','FC_2011','CHH_2011','numEMP_2011']

## Using Balanced Weighted Random Forest and averaging out 5 times for accuracy calculation

In [29]:
acc_df = pd.DataFrame(columns=('Month','MSW_2011','BF_2011','MSL_2011','FC_2011','CHH_2011','numEMP_2011'))
loc_int=0
for csvFileName1 in onlyfiles:
    rowToAppend=[csvFileName1[:-4]]
    print('For Month: ', csvFileName1)
    #csvFileName='corrected_csvFiles/corrected_csv/corrected_viirs_India_2014-01-01_2014-01-31_500.csv'
    csvFileName=mypath+csvFileName1
    viirsBucketsData=pd.read_csv(csvFileName)
    viirsBucketsData.rename( columns={'Unnamed: 0':'dname'}, inplace=True)
    combinedDf = pd.merge(viirsBucketsData, labels_2011, left_on=['censuscode'], right_on = ['District'])
    for prediction_label in predictCols: 
        #print(prediction_label)
        scaler = StandardScaler()
        scaler.fit(train_features)
        
        list_acc=[]
        for i in range(5):
            train_features, test_features, train_labels, test_labels = train_test_split(combinedDf[trainCols], 
                                                                                        combinedDf[prediction_label],
                                                                                        test_size = 0.3,
                                                                                        random_state = int(random.random()))
            
            rf = RandomForestClassifier(n_estimators = 500, random_state = 64, class_weight='balanced')
            rf.fit(scaler.transform(train_features), train_labels)
            predictions = rf.predict(scaler.transform(test_features))
            score_curr=f1_score(test_labels, predictions, average='weighted')
            list_acc.append(score_curr)
            
        rowToAppend.append(mean(list_acc))    
        
    acc_df.loc[loc_int] = rowToAppend
    loc_int+=1

For Month:  normal_viirs_India_2013-02-01_2013-02-28_500.csv
For Month:  normal_viirs_India_2013-01-01_2013-01-31_500.csv


KeyboardInterrupt: 

## Saving the Accuracy Matrix

In [7]:
acc_df.to_csv('accuracy_normal_viirs.csv')

## Confusion Matrices

In [35]:
csvFileName='corrected_csvFiles/corrected_csv/corrected_viirs_India_2014-12-01_2014-12-31_500.csv'
viirsBucketsData=pd.read_csv(csvFileName)
viirsBucketsData.rename( columns={'Unnamed: 0':'dname'}, inplace=True)
combinedDf = pd.merge(viirsBucketsData, labels_2011, left_on=['censuscode'], right_on = ['District'])
for prediction_label in predictCols: 
    print(prediction_label)
    train_features, test_features, train_labels, test_labels = train_test_split(combinedDf[trainCols], 
                                                                                combinedDf[prediction_label],
                                                                                test_size = 0.3,
                                                                                random_state = 60)
    
    scaler = StandardScaler()
    scaler.fit(train_features)
    rf = RandomForestClassifier(n_estimators = 500, random_state = 64, class_weight='balanced')
    rf.fit(scaler.transform(train_features), train_labels)
    predictions = rf.predict(scaler.transform(test_features))
    print('accuracy_score',accuracy_score(test_labels, predictions))
    print('f1_score',f1_score(test_labels, predictions,average='weighted'))
    cnf_matrix = confusion_matrix(test_labels, predictions)
    class_names=['1. Under-Developed','2. Moderately-Developed','3. Developed']
    if(prediction_label=='numEMP_2011'):
        class_names=['1. Unemployment','2. Agricultural','3. Non Agricultural Employment']
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,title=prediction_label+'_Normalized')
    plt.savefig(prediction_label+'_Normalized.jpg')
    plt.clf()
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=False,title=prediction_label+'_Absolute')
    plt.savefig(prediction_label+'_Absolute.jpg')
    plt.clf()

MSW_2011
accuracy_score 0.6243386243386243
f1_score 0.6113506191673601

BF_2011
accuracy_score 0.671957671957672
f1_score 0.6490028190511791

MSL_2011
accuracy_score 0.6666666666666666
f1_score 0.6252304297414653

FC_2011
accuracy_score 0.708994708994709
f1_score 0.6960127058662144

CHH_2011
accuracy_score 0.5978835978835979
f1_score 0.5988872907010949

numEMP_2011
accuracy_score 0.6402116402116402
f1_score 0.6439286587316017



<Figure size 432x288 with 0 Axes>

### Grid Search

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

train_features, test_features, train_labels, test_labels = train_test_split(combinedDf[trainCols], 
                                                                                combinedDf[prediction_label],
                                                                                test_size = 0.3,
                                                                                random_state = 60)
scaler = StandardScaler()
scaler.fit(train_features)
clf = RandomForestClassifier() #Initialize with whatever parameters you want to

# 10-Fold Cross validation
print (np.mean(cross_val_score(clf, scaler.transform(train_features), train_labels, cv=10)))


param_grid = {
                 'n_estimators': [20,100,200,500],
                 'max_depth': [2, 5, 7, 9]
             }

from sklearn.model_selection import GridSearchCV

grid_clf = GridSearchCV(clf, param_grid, cv=10)
grid_clf.fit(scaler.transform(train_features), train_labels)

print('#########################################')
print('grid_clf. best_estimator_',grid_clf. best_estimator_)
print('#########################################')
print('grid_clf. best_params_',grid_clf. best_params_)
print('#########################################')
print('grid_clf.grid_scores_',grid_clf.grid_scores_)
print('#########################################')

0.4735752877613343
#########################################
grid_clf. best_estimator_ RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
#########################################
grid_clf. best_params_ {'max_depth': 7, 'n_estimators': 500}
#########################################
grid_clf.grid_scores_ [mean: 0.49887, std: 0.08990, params: {'max_depth': 2, 'n_estimators': 20}, mean: 0.50794, std: 0.07653, params: {'max_depth': 2, 'n_estimators': 100}, mean: 0.50567, std: 0.06693, params: {'max_depth': 2, 'n_estimators': 200}, mean: 0.50567, std: 0.06478, params: {'max_depth': 2, 'n_estimators': 500}, mean: 0.49433, std: 0.0

