In [226]:
### Import libraries
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn import cross_validation
from sklearn import preprocessing
import matplotlib.pyplot as plt
%pylab
import statsmodels.discrete.discrete_model as sm

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [47]:
# Load the district-type lookup table (District, State, Type, DistCode) from Excel.
df_outliers = pd.read_excel('DistrictTypes.xlsx')

In [48]:
# Preview the first rows of the district-type table.
df_outliers.head()

Unnamed: 0,District,State,Type,DistCode
0,Agra,Uttar Pradesh,4. Large City,146
1,Ahmadabad,Gujarat,1. Metro,474
2,Aizawl,Mizoram,3. Capital,283
3,Ajmer,Rajasthan,4. Large City,119
4,Allahabad,Uttar Pradesh,4. Large City,175


In [49]:
# District codes of every district whose Type is '5. Snow Clad'.
is_snow_clad = df_outliers['Type'] == '5. Snow Clad'
snow_id = df_outliers.loc[is_snow_clad, 'DistCode'].values.tolist()

In [50]:
# District codes of every district whose Type is '1. Metro'.
is_metro = df_outliers['Type'] == '1. Metro'
metro = df_outliers.loc[is_metro, 'DistCode'].values.tolist()

In [51]:
### Import Data Files
# District-level satellite table read from CSV; the preview below shows DN_*
# columns followed by label columns such as CHH_*, FC_*, BF_* and EMP_*.
df_satData = pd.read_csv('distSatellite_withLabels.csv')

In [52]:
# Drop the snow-clad and metro districts, matched on the district-code column '101'.
# .copy() hands later cells an independent frame to add feature columns to,
# rather than a slice of the loaded table (which pandas flags with
# SettingWithCopyWarning when columns are assigned to it).
excluded_codes = snow_id + metro
df_satData = df_satData[~df_satData['101'].isin(excluded_codes)].copy()

In [53]:
### List of labels: six label prefixes, each with columns numbered 1 to 3
lis_labelsAll = [prefix + '_' + str(number)
                 for prefix in ('MSL', 'MSW', 'CHH', 'FC', 'BF', 'EMP')
                 for number in (1, 2, 3)]

### List of useful Labels
lis_labels = ['asset_%d' % number for number in (1, 2, 3)]

In [54]:
# Preview the filtered satellite table.
df_satData.head()

Unnamed: 0,DISTRICT,101,DN_00,DN_01,DN_02,DN_03,DN_04,DN_05,DN_06,DN_07,...,CHH_3,FC_1,FC_2,FC_3,BF_1,BF_2,BF_3,EMP_1,EMP_2,EMP_3
0,Adilabad,532,5237.12,0,0,0.76,1815.67,3332.38,2043.9,1049.05,...,0,1,0,0,1,0,0,0,1,0
1,Agra,146,401.66,0,0,0.0,48.54,451.36,620.77,441.57,...,0,0,0,1,0,0,1,1,0,0
3,Ahmadnagar,522,984.52,0,0,3.12,1756.19,3855.19,2683.53,1989.69,...,0,1,0,0,1,0,0,0,1,0
4,Aizawl,283,3412.59,0,0,0.0,49.31,102.02,42.99,27.32,...,0,0,0,1,0,0,1,0,0,1
5,Ajmer,119,3903.05,0,0,1.61,386.31,1286.76,648.55,490.42,...,0,0,0,1,0,0,1,0,0,1


In [55]:
# (rows, columns) remaining after the snow-clad/metro districts were removed.
df_satData.shape

(592, 103)

In [56]:
# Column names of the satellite table, in order, as a plain list.
cols = list(df_satData.columns)

In [57]:
# Columns from position 2 up to, but not including, the first label column 'MSL_1'.
first_label_pos = cols.index('MSL_1')
cols_new = cols[2:first_label_pos]

In [58]:
# Columns from 'Water' up to, but not including, the first label column 'MSL_1'.
modis_start, modis_stop = cols.index('Water'), cols.index('MSL_1')
pure_modis_features = cols[modis_start:modis_stop]

## Building features from night light data

In [59]:
# Names of the 64 night-light columns DN_00 ... DN_63.
var = ['DN_%02d' % dn for dn in range(64)]

# 'Area' is the row-wise total of the DN_* columns, accumulated one column at
# a time. Rows totalling exactly 0 are set to 1, presumably so that the later
# divisions by 'Area' never divide by zero.
dn_total = 0
for dn_col in var:
    dn_total = dn_total + df_satData[dn_col]
df_satData['Area'] = dn_total.replace(0, 1)


## Building features from MODIS data

In [60]:
# Shift the urban column up by one before deriving any ratio from it
# (presumably to keep it non-zero as a divisor, as in the disabled
# AvgUrbanNTL = sum / 'Urban and built-up' feature -- confirm).
# NOTE(review): the column is overwritten in place, so re-running this cell
# adds another 1 each time.
urban_plus_one = df_satData['Urban and built-up'] + 1
df_satData['Urban and built-up'] = urban_plus_one

# Cropland total: pure croplands plus the cropland/natural-vegetation mosaic.
cropland = df_satData['Croplands'] + df_satData['Cropland/Natural vegetation mosaic']

# Shares of the whole district area.
df_satData['CropRatio'] = cropland / df_satData['Area']
df_satData['UrbanRatio'] = urban_plus_one / df_satData['Area']

# 'Natural' is the running total of the land-cover columns listed here.
natural_classes = [
    'Water',
    'Evergreen Needleleaf forest',
    'Evergreen Broadleaf forest',
    'Deciduous Needleleaf forest',
    'Deciduous Broadleaf forest',
    'Mixed forest',
    'Closed shrublands',
    'Open shrublands',
    'Woody savannas',
    'Savannas',
    'Grasslands',
    'Permanent wetlands',
    'Snow and ice',
]
natural_total = 0
for land_class in natural_classes:
    natural_total = natural_total + df_satData[land_class]
df_satData['Natural'] = natural_total

# Shares of the area left once the 'Natural' total is taken out.
non_natural_area = df_satData['Area'] - df_satData['Natural']
df_satData['CropRemainRatio'] = cropland / non_natural_area
df_satData['UrbanRemainRatio'] = urban_plus_one / non_natural_area

# Grouped land-cover totals (row-wise sums over the listed columns).
forest = ['Evergreen Broadleaf forest', 'Deciduous Broadleaf forest', 'Mixed forest']
grass_shrubs = ['Closed shrublands', 'Open shrublands', 'Woody savannas', 'Savannas', 'Grasslands']
df_satData['forest'] = df_satData[forest].sum(axis=1)
df_satData['grass_shrubs'] = df_satData[grass_shrubs].sum(axis=1)


In [61]:
# Row-wise total over the pure_modis_features columns, as a plain Python list.
modis_totals = df_satData[pure_modis_features].sum(axis=1)
modi_area = modis_totals.values.tolist()

In [62]:
# Per-row 'Area' minus the matching modi_area entry.
# 'Area' is converted to a list once up front, not inside the comprehension,
# so the work stays linear in the number of rows.
area_values = df_satData['Area'].values.tolist()
left_area = [area - covered for area, covered in zip(area_values, modi_area)]

In [94]:
# Feature names fed to the models: the ratio features plus, for each listed
# land-cover column, its share of 'Area' (stored as 'mod_<column>') followed
# by the raw column itself.
# NOTE(review): 'mod_Urban and built-up' is computed with the same formula as
# 'UrbanRatio' (urban column / 'Area'), so the two look redundant -- confirm.
modi_var = ['CropRatio', 'UrbanRatio', 'CropRemainRatio', 'UrbanRemainRatio', 'Natural']
for land_col in ['Croplands',
                 'Urban and built-up',
                 'Cropland/Natural vegetation mosaic',
                 'forest',
                 'grass_shrubs']:
    share_col = 'mod_' + land_col
    df_satData[share_col] = df_satData[land_col] / df_satData['Area']
    modi_var.append(share_col)
    modi_var.append(land_col)
#modi_var.append('left_area')

In [95]:
## Take the base-10 logarithm of the 'sum' column

In [72]:
# Base-10 log of the 'sum' column. np.log10 is called explicitly so the cell
# does not depend on the bare `log10` name injected by %pylab.
# Rows where 'sum' is 0 come out as -inf (NumPy emits a RuntimeWarning).
df_satData.loc[:, 'logSum'] = np.log10(df_satData['sum'])

  """Entry point for launching an IPython kernel.


In [252]:
# Regression target: the 'sum' column as a NumPy array.
y = df_satData['sum'].values

In [253]:
# Feature frame: the columns named in modi_var.
X = df_satData[modi_var]

In [254]:
# Convert the feature frame to a NumPy array. np.asarray accepts either a
# DataFrame or an array, so re-running this cell after X is already an array
# no longer fails on the missing `.values` attribute.
X = np.asarray(X)

In [255]:
# Standardise the feature columns with scikit-learn's preprocessing.scale.
X = preprocessing.scale(X)

In [256]:
# Linear regression with default settings.
model = LinearRegression()

In [257]:
# Fit on every row of X; no hold-out split is made here.
fit = model.fit(X,y)

In [258]:
# Predict on the same X the model was fitted on (in-sample predictions).
yPredict = fit.predict(X)

In [216]:
# Report R^2, MSE and RMSE of the linear model on the rows in X and y.
# print(...) with a single argument prints the same text under Python 2 and
# Python 3; print('') emits the blank separator line in both.
scr = model.score(X, y)  # coefficient of determination R^2 of the prediction
mse = metrics.mean_squared_error(y, yPredict)  # computed once, reused for RMSE

print('coefficient of determination R^2 of the prediction is : ')
print(scr)
print('')

print('mean square error is:')
print(mse)
print('')

print('root mean square error is:')
# np.sqrt is used explicitly; the bare `sqrt` name exists only through %pylab.
print(np.sqrt(mse))

coefficient of determination R^2 of the prediction is : 
0.571218027575

mean square error is:
2.79905718194e+20

root mean square error is:
16730383085.7


In [263]:
# Elbow curve: K-means inertia for k = 1..9, clustering the 1-D values in yPredict.
# random_state is fixed so the curve is the same on every run.
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000, random_state=0).fit(yPredict.reshape(-1, 1))
    # inertia_: sum of squared distances of samples to their closest cluster centre
    sse[k] = kmeans.inertia_

# Plot in ascending k explicitly, not in whatever order the dict iterates.
ks = sorted(sse)
plt.figure()
plt.plot(ks, [sse[k] for k in ks])
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

In [266]:
# Split the values in y into 3 K-means clusters and keep the label of each row.
# K-means starts from random centres, so random_state is fixed to make
# repeated runs give the same assignment.
kmeans = KMeans(n_clusters=3, max_iter=1000, random_state=0).fit(y.reshape(-1, 1))
yClustered = kmeans.labels_

In [227]:
# Logistic-regression classifier with default settings.
model2 = LogisticRegression()

In [231]:
# Feature matrix for the classifier: the modi_var columns as a NumPy array.
feature_frame = df_satData[modi_var]
X = feature_frame.values

In [232]:
# Standardise the feature columns with scikit-learn's preprocessing.scale.
X = preprocessing.scale(X)

In [233]:
# Classification target: the 'clusters' column as a NumPy array.
# NOTE(review): no cell shown here creates a 'clusters' column; presumably it
# holds a per-district cluster label -- confirm it exists after a fresh
# Restart & Run All.
y = df_satData['clusters'].values

In [235]:
# Fit the classifier on every row of X; no hold-out split is made here.
fit = model2.fit(X,y)

In [236]:
# Class predictions of the classifier on the same X it was fitted on.
# NOTE(review): the evaluation cells further down read `yPredictClustered`,
# which no cell shown here assigns. It is bound to these predictions on the
# assumption that this is what was meant -- confirm. `yPredict` is kept too.
yPredictClustered = model2.predict(X)
yPredict = yPredictClustered

In [269]:
# Macro-averaged F1 score of yPredictClustered against yClustered.
metrics.f1_score(yClustered,yPredictClustered,average='macro')

0.38586601307189539

In [270]:
# Accuracy score of yPredictClustered against yClustered.
metrics.accuracy_score(yClustered,yPredictClustered)

0.26520270270270269