In [1]:
### Import libraries
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn import cross_validation
from sklearn import preprocessing
import matplotlib.pyplot as plt
%pylab
import statsmodels.discrete.discrete_model as sm

Using matplotlib backend: Qt4Agg
Populating the interactive namespace from numpy and matplotlib




In [2]:
# Read the district classification workbook; later cells filter it by 'Type' and read 'DistCode'.
df_outliers = pd.read_excel('DistrictTypes.xlsx')

In [3]:
# Preview the first rows of the district classification table.
df_outliers.head()

Unnamed: 0,District,State,Type,DistCode
0,Agra,Uttar Pradesh,4. Large City,146
1,Ahmadabad,Gujarat,1. Metro,474
2,Aizawl,Mizoram,3. Capital,283
3,Ajmer,Rajasthan,4. Large City,119
4,Allahabad,Uttar Pradesh,4. Large City,175


In [4]:
# District codes of every row whose 'Type' is '5. Snow Clad', as a plain Python list.
snow_id = df_outliers.loc[df_outliers['Type'] == '5. Snow Clad', 'DistCode'].values.tolist()

In [5]:
# District codes of every row whose 'Type' is '1. Metro', as a plain Python list.
metro = df_outliers.loc[df_outliers['Type'] == '1. Metro', 'DistCode'].values.tolist()

In [6]:
### Import Data Files
# District-level satellite features with label columns; column '101' is matched
# against district codes when rows are filtered in the next cell.
df_satData = pd.read_csv('distSatellite_withLabels.csv')

In [7]:
# Drop the snow-clad and metro districts (matched on column '101').
# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# many column assignments in later cells write to a real DataFrame rather than to a
# slice of another one (the situation pandas flags with SettingWithCopyWarning).
df_satData = df_satData[~(df_satData['101'].isin(snow_id+metro))].copy()

In [8]:
### All label columns: suffixes _1, _2, _3 for each of the six prefixes
lis_labelsAll = list(
    '{}_{}'.format(prefix, level)
    for prefix in ('MSL', 'MSW', 'CHH', 'FC', 'BF', 'EMP')
    for level in (1, 2, 3)
)

### Labels marked as useful
lis_labels = list('asset_{}'.format(level) for level in (1, 2, 3))

In [9]:
# Preview the satellite frame after the snow-clad / metro rows were removed.
df_satData.head()

Unnamed: 0,DISTRICT,101,DN_00,DN_01,DN_02,DN_03,DN_04,DN_05,DN_06,DN_07,...,CHH_3,FC_1,FC_2,FC_3,BF_1,BF_2,BF_3,EMP_1,EMP_2,EMP_3
0,Adilabad,532,5237.12,0,0,0.76,1815.67,3332.38,2043.9,1049.05,...,0,1,0,0,1,0,0,0,1,0
1,Agra,146,401.66,0,0,0.0,48.54,451.36,620.77,441.57,...,0,0,0,1,0,0,1,1,0,0
3,Ahmadnagar,522,984.52,0,0,3.12,1756.19,3855.19,2683.53,1989.69,...,0,1,0,0,1,0,0,0,1,0
4,Aizawl,283,3412.59,0,0,0.0,49.31,102.02,42.99,27.32,...,0,0,0,1,0,0,1,0,0,1
5,Ajmer,119,3903.05,0,0,1.61,386.31,1286.76,648.55,490.42,...,0,0,0,1,0,0,1,0,0,1


In [10]:
# (rows, columns) of the filtered satellite frame.
df_satData.shape

(592, 103)

In [11]:
# Column names in frame order; the next cells slice this list by position.
cols = df_satData.columns.tolist()

In [12]:
# Every column from position 2 up to (not including) 'MSL_1', the first entry of lis_labelsAll.
# NOTE(review): presumably this skips the district name/code columns — confirm against the CSV header.
cols_new = cols[2:cols.index('MSL_1')]

In [13]:
# Columns from 'Water' up to (not including) 'MSL_1'; a later cell sums them row-wise into modi_area.
# NOTE(review): assumes these are exactly the MODIS land-cover columns — confirm against the CSV header.
pure_modis_features = cols[cols.index('Water'):cols.index('MSL_1')]

## Building features from night light data

In [14]:
# Names of the 64 night-light columns: DN_00 ... DN_63.
var = list('DN_%02d' % level for level in range(64))

# 'Area' is the row-wise total of all DN_* columns, accumulated one column at a time.
area_total = 0
for dn_col in var:
    area_total = area_total + df_satData[dn_col]

# A total of exactly 0 is replaced by 1 (presumably so later divisions by 'Area' stay finite).
df_satData['Area'] = area_total.replace(0, 1)


## Building features out of MODIS data

In [15]:
# NOTE: 'Urban and built-up' is incremented in place, so this cell must run exactly once
# per data load; re-running it adds 1 again.
df_satData['Urban and built-up'] = df_satData['Urban and built-up'].add(1)

# Combined cropland value, shared by both crop ratios below.
crop_total = df_satData['Croplands'] + df_satData['Cropland/Natural vegetation mosaic']

# Shares of the total 'Area'.
df_satData['CropRatio'] = crop_total / df_satData['Area']
#df_satData['AvgUrbanNTL']=df_satData['sum']/df_satData['Urban and built-up']
df_satData['UrbanRatio'] = df_satData['Urban and built-up'] / df_satData['Area']

# 'Natural' is the row-wise total of the land-cover columns listed here, added in this order.
natural_cols = [
    'Water',
    'Evergreen Needleleaf forest',
    'Evergreen Broadleaf forest',
    'Deciduous Needleleaf forest',
    'Deciduous Broadleaf forest',
    'Mixed forest',
    'Closed shrublands',
    'Open shrublands',
    'Woody savannas',
    'Savannas',
    'Grasslands',
    'Permanent wetlands',
    'Snow and ice',
]
natural_total = 0
for cover in natural_cols:
    natural_total = natural_total + df_satData[cover]
df_satData['Natural'] = natural_total

# Same two ratios, but relative to the area left after removing 'Natural'.
non_natural_area = df_satData['Area'] - df_satData['Natural']
df_satData['CropRemainRatio'] = crop_total / non_natural_area
df_satData['UrbanRemainRatio'] = df_satData['Urban and built-up'] / non_natural_area
#df_satData['UrbanRatio']=df_satData['UrbanRatio'].fillna(0)


# Grouped totals used as features in the modelling cells.
forest = ['Evergreen Broadleaf forest', 'Deciduous Broadleaf forest', 'Mixed forest']
df_satData['forest'] = df_satData[forest].sum(axis=1)

grass_shrubs = ['Closed shrublands', 'Open shrublands', 'Woody savannas', 'Savannas', 'Grasslands']
df_satData['grass_shrubs'] = df_satData[grass_shrubs].sum(axis=1)


In [16]:
# Row-wise total of the pure_modis_features columns, as a plain Python list (one value per row).
modi_area = df_satData[pure_modis_features].sum(axis=1).values.tolist()

In [17]:
# Per-row difference between 'Area' and the MODIS total held in modi_area.
# The 'Area' column is converted to a list once, outside the comprehension; converting it
# inside the loop body would rebuild the whole list for every row (quadratic work).
area_values = df_satData['Area'].values.tolist()
left_area = [area - modis for area, modis in zip(area_values, modi_area)]

In [18]:
# Feature names used for modelling: the five ratio/total columns that already exist, plus,
# for each base column below, its share of 'Area' (stored as 'mod_<name>') and its raw value.
modi_var = ['CropRatio', 'UrbanRatio', 'CropRemainRatio', 'UrbanRemainRatio', 'Natural']
base_cols = (
    'Croplands',
    'Urban and built-up',
    'Cropland/Natural vegetation mosaic',
    'forest',
    'grass_shrubs',
)
for base in base_cols:
    share_name = 'mod_' + base
    df_satData[share_name] = df_satData[base] / df_satData['Area']
    modi_var.extend([share_name, base])
# NOTE(review): 'mod_Urban and built-up' is computed exactly like 'UrbanRatio'
# ('Urban and built-up' / 'Area'), so modi_var contains two identical feature columns.
#modi_var.append('left_area')

In [19]:
## making log of sum

In [20]:
# Base-10 log of the 'sum' column, stored as 'logSum'.
# np.log10 is called explicitly instead of the bare log10 that %pylab injects into the
# namespace, so this cell does not depend on that magic having been run.
# NOTE(review): rows with 'sum' <= 0 yield -inf/NaN here — confirm whether they need filtering.
df_satData.loc[:,'logSum'] = np.log10(df_satData['sum'])

  """Entry point for launching an IPython kernel.


In [21]:
# Regression target: the raw 'sum' column as a NumPy array.
# NOTE(review): 'logSum' is computed in the previous cell but is not used here — confirm
# that modelling on the raw scale is intended.
y = df_satData['sum'].values

In [22]:
# Feature frame: the columns named in modi_var, in that order.
X = df_satData[modi_var]

In [23]:
# Convert the feature frame to a plain NumPy array (column names are dropped from here on).
X = X.values

In [24]:
# Standardise each feature column with sklearn.preprocessing.scale (default settings).
X = preprocessing.scale(X)

In [25]:
# Ordinary least-squares linear regression with default settings.
model = LinearRegression()

In [26]:
# Fit on the full data set (no train/test split is made in this notebook section).
fit = model.fit(X,y)

In [27]:
# In-sample predictions: the same rows the model was fitted on.
yPredict = fit.predict(X)

In [28]:
# In-sample fit quality of the linear model.
# Every print is a single-argument call, which behaves the same under the Python 2 print
# statement and the Python 3 print function; print('') emits the blank separator line on both.
scr = model.score(X,y)  # coefficient of determination R^2 of the prediction
mse = metrics.mean_squared_error(y,yPredict)  # computed once, reused for the RMSE below

print('coefficient of determination R^2 of the prediction is : ')
print(scr)
print('')

print('mean square error is:')
print(mse)
print('')

print('root mean square error is:')
# np.sqrt instead of the bare sqrt that only exists when %pylab has been run.
print(np.sqrt(mse))

coefficient of determination R^2 of the prediction is : 
0.738995086255

mean square error is:
1.70382088176e+20

root mean square error is:
13053048999.2


In [29]:
from sklearn.feature_selection import RFE

In [30]:
# Recursive feature elimination down to a single feature, removing one feature per step,
# so that selector.ranking_ assigns a distinct rank to every feature (rank 1 = the one kept).
# n_features_to_select is passed by keyword: current scikit-learn releases no longer accept
# it as a positional argument, while the keyword form works on old and new versions alike.
estimator = LinearRegression()
selector = RFE(estimator, n_features_to_select=1, step=1)
selector = selector.fit(X, y)
selector.support_ 

array([False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False], dtype=bool)

In [32]:
# RFE rank of each feature, in the same order as the columns of X (i.e. the order of modi_var).
ranks = selector.ranking_

In [36]:
# One row per feature name, indexed by that feature's RFE rank.
df_modiVarRanks = pd.DataFrame(data=modi_var, index=ranks)

In [37]:
# Move the rank index into an ordinary column (it becomes the first column).
df_modiVarRanks = df_modiVarRanks.reset_index()

In [39]:
# Name the two columns: the former index (rank) first, then the feature name.
df_modiVarRanks.columns = ['ranks','modi_var']

In [46]:
# Write the features ordered by ascending rank to 'modiVarRanked.csv', without the index column.
df_modiVarRanks.sort_values(by='ranks').to_csv('modiVarRanked.csv',index=False)

#### Doing linear regression with statsmodels to get the p-values

In [47]:
import statsmodels.api as sm

  from pandas.core import datetools


In [49]:
# Add an intercept column to the scaled feature matrix before fitting with statsmodels.
X = sm.add_constant(X)

In [51]:
# statsmodels OLS on the same target and features; note this rebinds 'model', which
# previously held the scikit-learn LinearRegression.
model = sm.OLS(y,X)

In [53]:
# Fit the OLS model; 'results' exposes pvalues and summary(), used in the following cells.
results = model.fit()

In [64]:
# p-values labelled by term: 'constant' for the intercept, then the names in modi_var.
pval_labels = ['constant'] + modi_var
df_modiVarPvals = pd.DataFrame(data=results.pvalues, index=pval_labels)

In [66]:
# Show the terms ordered by p-value, smallest first (column 0 holds the p-values).
df_modiVarPvals.sort_values(by=0)

Unnamed: 0,0
constant,2.768034e-187
Urban and built-up,7.6111969999999995e-50
Croplands,1.505882e-40
Cropland/Natural vegetation mosaic,5.587839e-13
forest,0.07389629
Natural,0.1640147
mod_Croplands,0.1783188
mod_grass_shrubs,0.2241915
grass_shrubs,0.3441705
CropRatio,0.3477093


In [63]:
# Full regression report.
# NOTE(review): the summary reports 'Df Model: 13' for 15 features and a condition number
# of 8.19e+16, which indicates linearly dependent feature columns in modi_var.
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.739
Model:,OLS,Adj. R-squared:,0.733
Method:,Least Squares,F-statistic:,125.9
Date:,"Wed, 04 Jul 2018",Prob (F-statistic):,6.23e-159
Time:,12:26:01,Log-Likelihood:,-14629.0
No. Observations:,592,AIC:,29290.0
Df Residuals:,578,BIC:,29350.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.397e+10,5.43e+08,44.143,0.000,2.29e+10,2.5e+10
x1,-2.29e+09,2.44e+09,-0.940,0.348,-7.07e+09,2.5e+09
x2,-3.411e+08,6.9e+08,-0.494,0.621,-1.7e+09,1.01e+09
x3,4.317e+08,9.64e+08,0.448,0.654,-1.46e+09,2.32e+09
x4,-9.567e+08,1.59e+09,-0.601,0.548,-4.08e+09,2.17e+09
x5,6.313e+09,4.53e+09,1.393,0.164,-2.58e+09,1.52e+10
x6,-2.533e+09,1.88e+09,-1.348,0.178,-6.22e+09,1.16e+09
x7,1.142e+10,7.91e+08,14.433,0.000,9.87e+09,1.3e+10
x8,-3.411e+08,6.9e+08,-0.494,0.621,-1.7e+09,1.01e+09

0,1,2,3
Omnibus:,55.285,Durbin-Watson:,1.943
Prob(Omnibus):,0.0,Jarque-Bera (JB):,81.121
Skew:,0.673,Prob(JB):,2.43e-18
Kurtosis:,4.215,Cond. No.,8.19e+16


#### Trying to do it via logistic regression by first clustering the target

In [None]:
# Elbow plot: k-means inertia for k = 1..9 clusters of the target values.
# The fit uses y, the same values the following cell clusters, rather than the regression
# predictions, so the elbow describes the data that is actually clustered.
# random_state is fixed so the curve is reproducible across kernel restarts.
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000, random_state=0).fit(y.reshape(-1, 1))
    sse[k] = kmeans.inertia_ # Inertia: sum of squared distances of samples to their closest cluster centre
# Plot in ascending k regardless of dict iteration order.
cluster_counts = sorted(sse)
plt.figure()
plt.plot(cluster_counts, [sse[k] for k in cluster_counts])
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

In [None]:
# Cluster the target values into 3 groups; the labels are what the scoring cells compare
# the classifier's predictions against.
# random_state is fixed so the cluster assignment is reproducible across kernel restarts.
kmeans = KMeans(n_clusters=3, max_iter=1000, random_state=0).fit(y.reshape(-1, 1))
yClustered = kmeans.labels_

In [None]:
# Logistic-regression classifier with default settings.
model2 = LogisticRegression()

In [None]:
# Rebuild the raw feature matrix from modi_var; X was previously extended with a constant column.
X = df_satData[modi_var].values

In [None]:
# Standardise each feature column with sklearn.preprocessing.scale (default settings).
X = preprocessing.scale(X)

In [None]:
# Classification target: the k-means cluster labels.
# No cell in this notebook creates a 'clusters' column on df_satData (the labels only exist
# as yClustered), so looking up df_satData['clusters'] would raise KeyError.
y = yClustered

In [None]:
# Fit the classifier on the full data set; this rebinds 'fit', which previously held the linear model.
fit = model2.fit(X,y)

In [None]:
# Predicted cluster label for each row (in-sample).
# Stored as yPredictClustered, the name the scoring cells read; without it those cells fail
# with NameError. yPredict is still assigned so any code that expects this cell to set it
# keeps working.
yPredictClustered = model2.predict(X)
yPredict = yPredictClustered

In [None]:
# Macro-averaged F1 of the predicted labels against the k-means labels (in-sample).
metrics.f1_score(yClustered,yPredictClustered,average='macro')

In [None]:
# Fraction of rows whose predicted label equals the k-means label (in-sample).
metrics.accuracy_score(yClustered,yPredictClustered)