In [1]:
import csv
import os

import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [136]:
# Location of the district-level master table. The default is the original
# author's local copy; set the DISTRICT_MASTER_CSV environment variable to
# point at the file on any other machine instead of editing this cell.
district_ground_truth = os.environ.get(
    "DISTRICT_MASTER_CSV",
    r"C:\Users\AJain7\OneDrive - Stryker\Personal\Projects\Satellite Project\5. District_level\District_Master.csv",
)
# Full ground-truth table, one row per district.
ground_truth = pd.read_csv(district_ground_truth)

In [137]:
# Inspect the column labels available in the ground-truth table.
ground_truth.columns

Index(['Unnamed: 0', 'district name', 'district code', '1. Under-Developed_BF',
       '2. Moderately-Developed_BF', '3. Developed_BF',
       '1. Under-Developed_FC', '2. Moderately-Developed_FC',
       '3. Developed_FC', '0',
       ...
       'LIT%', 'MSL', 'MSW', 'CHH', 'FC', 'BF', 'EMP_AG_NONAG', 'Asset',
       'EMP_FEMALE', 'LIT'],
      dtype='object', length=125)

In [4]:
# Only the join key and the Asset label are needed for the classifier below.
cols = ['district code', 'Asset']
# Take an explicit copy: the next cell assigns into this frame, and assigning
# into a plain column selection triggers pandas' SettingWithCopyWarning.
ground_truth = ground_truth[cols].copy()

In [5]:
# Replace each Asset label with the integer parsed from its first character.
ground_truth['Asset'] = ground_truth['Asset'].map(lambda label: int(label[:1]))

In [6]:
# Sanity check: (rows, columns) of the reduced ground-truth table.
ground_truth.shape

(640, 2)

In [7]:
# Image-feature table; the file has no header row, so pandas numbers the
# columns 0, 1, 2, ...
input_file = 'district_500m_feature_1.csv'
features = pd.read_csv(filepath_or_buffer=input_file, header=None)

In [8]:
# Column 0 is used as the join key: cast it to int and give it the same name
# as the key column in the ground-truth table.
features = (
    features
    .astype({0: int})
    .rename(columns={0: 'district code'})
)

In [9]:
# Attach the ground-truth columns to every feature row; a left join keeps
# feature rows that have no matching district (their label becomes NaN).
df = pd.merge(features, ground_truth, on='district code', how='left')

In [10]:
# Discard rows containing any missing value, then store the Asset label as int
# (the left merge may have introduced NaN, which forces a float column).
df = df.dropna().astype({'Asset': int})

In [11]:
# Inspect the merged layout: key column first, feature columns, label last.
df.columns

Index(['district code',               1,               2,               3,
                     4,               5,               6,               7,
                     8,               9,
       ...
                 12022,           12023,           12024,           12025,
                 12026,           12027,           12028,           12029,
                 12030,         'Asset'],
      dtype='object', length=12032)

In [55]:
# Features are every column between the leading key and the trailing label.
feature_cols = df.columns[1:-1]
# One-element Index holding the last column; selecting with it yields a
# single-column DataFrame rather than a Series.
target_col = df.columns[-1:]

In [62]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols],
    df[target_col],
    test_size=0.2,
    random_state=42,
)

In [65]:
# Report the train/test shapes for the features, then for the labels.
print(f"{X_train.shape} {X_test.shape}")
print(f"{y_train.shape} {y_test.shape}")

(512, 12030) (128, 12030)
(512, 1) (128, 1)


In [67]:
# Random forest with default hyper-parameters and a fixed seed.
clf = RandomForestClassifier(random_state=0)
# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning about a column-vector target.
clf.fit(X_train, np.ravel(y_train))

In [69]:
# Score the held-out rows and report the F1 score under both averaging modes.
predictions = clf.predict(X_test)
for f1_label, f1_average in (('Macro ', 'macro'), ('Weighted ', 'weighted')):
    print(f1_label, f1_score(y_test, predictions, average=f1_average))

  return umr_sum(a, axis, dtype, out, keepdims)


In [81]:
# Persist the merged feature + label table, without the row index.
df.to_csv(path_or_buf='asset_image_feature.csv', index=False)

## PCA Reduction then scaling

In [78]:
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [14]:
# Features are every column between the leading key and the trailing label.
feature_cols = df.columns[1:-1]
X = df[feature_cols]

# Number of principal components to keep.
n_components = 30
# For a matrix this large and this few components, scikit-learn's default
# 'auto' solver selects the randomized SVD, which is stochastic. Pin the seed
# so the projection (and everything exported from it) is reproducible.
pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(X)

In [68]:
# Sanity check: one row per district, one column per retained component.
X_pca.shape

(640, 30)

In [69]:
# Running total of the variance fraction explained by the first k components.
print(np.cumsum(pca.explained_variance_ratio_))

[0.86456793 0.90654853 0.94094082 0.95247211 0.96016277 0.96621627
 0.97127543 0.97500391 0.97869022 0.98149052 0.98388007 0.98600266
 0.98796308 0.98973302 0.9911569  0.99231073 0.99332801 0.99427927
 0.99508742 0.99575391 0.99626086 0.99675235 0.997213   0.99765516
 0.99801123 0.99830775 0.9985937  0.99883433 0.9990489  0.99921258]


## Keeping the first 30 principal components as the feature vector: they capture a reasonable share of the variance (about 99.9% cumulatively, see above), as also observed when PCA is run with n_components = 640

In [116]:
# Rescale each principal component to the [0, 1] range (MinMaxScaler default),
# then wrap the array in a DataFrame with default integer labels.
X_scaled = MinMaxScaler().fit(X_pca).transform(X_pca)
feature_df = pd.DataFrame(data=X_scaled)

In [126]:
# Keep only the district identifier, as a one-column frame.
district_df = df[['district code']]
print(district_df.shape)
# Renumber the rows 0..n-1 and discard the old index, so they line up with
# feature_df (which has a default index) when the two are concatenated.
district_df = district_df.reset_index(drop=True)

(640, 1)


In [131]:
# Place the scaled components and the district code side by side, aligned on
# the row index.
final_df = pd.concat(objs=[feature_df, district_df], axis='columns')

In [132]:
# Sanity check: the retained components plus the district-code column.
final_df.shape

(640, 31)

In [134]:
# Inspect the column labels: integer component labels, then 'district code'.
final_df.columns

Index([              0,               1,               2,               3,
                     4,               5,               6,               7,
                     8,               9,              10,              11,
                    12,              13,              14,              15,
                    16,              17,              18,              19,
                    20,              21,              22,              23,
                    24,              25,              26,              27,
                    28,              29, 'district code'],
      dtype='object')

In [139]:
# Key column plus the indicator columns to export alongside the PCA features.
cols = ['district code', 'MSL', 'MSW', 'CHH', 'FC', 'BF', 'Asset', 'EMP_FEMALE', 'LIT', 'EMP_AG_NONAG']
# Re-read the master table instead of slicing `ground_truth`: an earlier cell
# narrowed that frame to ['district code', 'Asset'] and converted Asset to int,
# so selecting these columns from it raises KeyError on a top-to-bottom run.
ground_truth = pd.read_csv(district_ground_truth, usecols=cols)[cols]

In [140]:
# Attach the indicator columns to each district's feature row; a left join
# keeps every row of final_df.
district_feature_file_2 = pd.merge(
    final_df,
    ground_truth,
    how='left',
    on='district code',
)

In [142]:
# Inspect the merged layout: components, district code, then the indicators.
district_feature_file_2.columns

Index([              0,               1,               2,               3,
                     4,               5,               6,               7,
                     8,               9,              10,              11,
                    12,              13,              14,              15,
                    16,              17,              18,              19,
                    20,              21,              22,              23,
                    24,              25,              26,              27,
                    28,              29, 'district code',           'MSL',
                 'MSW',           'CHH',            'FC',            'BF',
               'Asset',    'EMP_FEMALE',           'LIT',  'EMP_AG_NONAG'],
      dtype='object')

In [156]:
# Indicator columns whose labels start with the class digit; each label is
# replaced by that digit as an integer.
label_cols = ['MSL', 'MSW', 'CHH', 'FC', 'BF', 'Asset', 'EMP_FEMALE', 'LIT', 'EMP_AG_NONAG']
for label_col in label_cols:
    # Going through str first makes the cell safe to re-run: a column that is
    # already an integer class passes through unchanged instead of raising
    # "'int' object is not subscriptable". Missing values still raise.
    district_feature_file_2[label_col] = (
        district_feature_file_2[label_col].astype(str).str[:1].astype(int)
    )

In [168]:
# Check the class distribution of one encoded indicator.
district_feature_file_2['EMP_AG_NONAG'].value_counts()

1    259
2    218
3    163
Name: EMP_AG_NONAG, dtype: int64

In [169]:
# Persist the PCA features together with the encoded indicators, without the
# row index.
district_feature_file_2.to_csv(path_or_buf='district_feature_file_2.csv', index=False)