In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sbrn
import numpy as np
import re
import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

%matplotlib inline

In [3]:
dat = pd.read_csv('water_training.csv', header=0)

In [4]:
labels = pd.read_csv('water_training_labels.csv', header=0)
#join labels to dat on "id" (left outer)

dat=dat.merge(labels, how='left', left_on='id', right_on='id',copy=False)

In [5]:
dat['functional'] = [1 if x=='functional' else 0 for x in dat['status_group']]

In [7]:
df_drop = dat.loc[dat['construction_year']!=0]

In [8]:
X = df_drop[['construction_year']]

In [9]:
y = df_drop[['functional']]

In [10]:
logit = sm.Logit(y, X)

In [11]:
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.685481
         Iterations 3


# Construction Year (all 0's are dropped)

In [12]:
print result.summary()

                           Logit Regression Results                           
Dep. Variable:             functional   No. Observations:                38691
Model:                          Logit   Df Residuals:                    38690
Method:                           MLE   Df Model:                            0
Date:                Tue, 10 May 2016   Pseudo R-squ.:               0.0003155
Time:                        19:48:43   Log-Likelihood:                -26522.
converged:                       True   LL-Null:                       -26530.
                                        LLR p-value:                       nan
                        coef    std err          z      P>|z|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------
construction_year     0.0001   5.13e-06     24.263      0.000         0.000     0.000


In [13]:
dat.basin.isnull().sum().sum()

0

In [14]:
y = dat[['functional']]

In [15]:
basin_dummies = pd.get_dummies(dat['basin'])

In [16]:
X = basin_dummies

In [17]:
logit = sm.Logit(y, X)

In [18]:
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.676049
         Iterations 4


# Basin

In [19]:
print result.summary()

                           Logit Regression Results                           
Dep. Variable:             functional   No. Observations:                59400
Model:                          Logit   Df Residuals:                    59391
Method:                           MLE   Df Model:                            8
Date:                Tue, 10 May 2016   Pseudo R-squ.:                 0.01941
Time:                        19:49:05   Log-Likelihood:                -40157.
converged:                       True   LL-Null:                       -40952.
                                        LLR p-value:                     0.000
                              coef    std err          z      P>|z|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------------
Internal                    0.3052      0.023     13.311      0.000         0.260     0.350
Lake Nyasa                  0.6353      0.029     21.554      0.000         0.578     0.693


In [20]:
dat.quantity.isnull().sum().sum()
#no missing values!

0

In [21]:
quantity_dummies = pd.get_dummies(dat['quantity'])

In [22]:
X = quantity_dummies

In [23]:
logit = sm.Logit(y, X)

In [24]:
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.603778
         Iterations 8


# Quantity

In [25]:
print result.summary()

                           Logit Regression Results                           
Dep. Variable:             functional   No. Observations:                59400
Model:                          Logit   Df Residuals:                    59395
Method:                           MLE   Df Model:                            4
Date:                Tue, 10 May 2016   Pseudo R-squ.:                  0.1242
Time:                        19:49:11   Log-Likelihood:                -35864.
converged:                       True   LL-Null:                       -40952.
                                        LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
dry             -3.6580      0.081    -45.255      0.000        -3.816    -3.500
enough           0.6293      0.012     54.592      0.000         0.607     0.652
insufficient     0.0930      0.016      5.71

In [26]:
df_drop = dat.loc[dat['gps_height']!=0]

In [27]:
X = df_drop[['gps_height']]

In [28]:
y = df_drop[['functional']]

In [29]:
logit = sm.Logit(y, X)

In [30]:
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.677891
         Iterations 3


# GPS Height (all 0's are dropped)

In [31]:
print result.summary()

                           Logit Regression Results                           
Dep. Variable:             functional   No. Observations:                38962
Model:                          Logit   Df Residuals:                    38961
Method:                           MLE   Df Model:                            0
Date:                Tue, 10 May 2016   Pseudo R-squ.:                 0.01197
Time:                        19:49:17   Log-Likelihood:                -26412.
converged:                       True   LL-Null:                       -26732.
                                        LLR p-value:                       nan
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
gps_height     0.0003   8.76e-06     34.018      0.000         0.000     0.000


In [32]:
df_drop = dat.loc[dat['population']!=0]

In [33]:
X = df_drop[['population']]

In [34]:
y = df_drop[['functional']]

In [35]:
logit = sm.Logit(y, X)

In [36]:
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.691282
         Iterations 4


# Population (all 0's are dropped)

In [37]:
print result.summary()

                           Logit Regression Results                           
Dep. Variable:             functional   No. Observations:                38019
Model:                          Logit   Df Residuals:                    38018
Method:                           MLE   Df Model:                            0
Date:                Tue, 10 May 2016   Pseudo R-squ.:               -0.005154
Time:                        19:49:24   Log-Likelihood:                -26282.
converged:                       True   LL-Null:                       -26147.
                                        LLR p-value:                       nan
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
population     0.0002   1.87e-05     11.267      0.000         0.000     0.000


In [38]:
dat.district_code.isnull().sum().sum()

0

In [39]:
df_drop = dat.loc[dat['district_code']!=0]

In [40]:
district_dummies = pd.get_dummies(df_drop['district_code'])

In [41]:
X = district_dummies

In [42]:
y = df_drop[['functional']]

In [43]:
logit = sm.Logit(y, X)

In [44]:
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.679303
         Iterations 5


# District (all 0's are dropped)

In [45]:
print result.summary()

                           Logit Regression Results                           
Dep. Variable:             functional   No. Observations:                59377
Model:                          Logit   Df Residuals:                    59358
Method:                           MLE   Df Model:                           18
Date:                Tue, 10 May 2016   Pseudo R-squ.:                 0.01465
Time:                        19:49:33   Log-Likelihood:                -40335.
converged:                       True   LL-Null:                       -40935.
                                        LLR p-value:                1.295e-243
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
1              0.1499      0.018      8.257      0.000         0.114     0.186
2              0.2216      0.019     11.641      0.000         0.184     0.259
3             -0.0180      0.020     -0.900      0.3

In [46]:
dat.scheme_management.isnull().sum().sum()

3877

In [47]:
df_drop = df_drop[pd.notnull(dat['scheme_management'])]

  if __name__ == '__main__':


In [48]:
df_drop.scheme_management.isnull().sum().sum()

0

In [49]:
scheme_dummies = pd.get_dummies(df_drop['scheme_management'])

In [50]:
X = scheme_dummies

In [51]:
y = df_drop[['functional']]

In [52]:
logit = sm.Logit(y, X)

In [53]:
result = logit.fit()

         Current function value: 0.679081
         Iterations: 35




# Scheme (missing data dropped)

In [54]:
print result.summary()

                           Logit Regression Results                           
Dep. Variable:             functional   No. Observations:                55500
Model:                          Logit   Df Residuals:                    55488
Method:                           MLE   Df Model:                           11
Date:                Tue, 10 May 2016   Pseudo R-squ.:                 0.01388
Time:                        19:49:47   Log-Likelihood:                -37689.
converged:                      False   LL-Null:                       -38220.
                                        LLR p-value:                1.246e-220
                       coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------
Company              0.0132      0.061      0.215      0.830        -0.107     0.134
None                26.2830    5.1e+05   5.16e-05      1.000     -9.99e+05  9.99e+05
Other                0.3968 

In [55]:
#dat.scheme_management.isnull().sum().sum()
dat.extraction_type_class.isnull().sum().sum()

0

In [56]:
extraction_dummies = pd.get_dummies(dat['extraction_type_class'])

In [57]:
X = extraction_dummies

In [58]:
y = dat[['functional']]

In [59]:
logit = sm.Logit(y, X)

In [60]:
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.645088
         Iterations 6


# Extraction

In [61]:
print result.summary()

                           Logit Regression Results                           
Dep. Variable:             functional   No. Observations:                59400
Model:                          Logit   Df Residuals:                    59393
Method:                           MLE   Df Model:                            6
Date:                Tue, 10 May 2016   Pseudo R-squ.:                 0.06432
Time:                        19:49:57   Log-Likelihood:                -38318.
converged:                       True   LL-Null:                       -40952.
                                        LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
gravity          0.4024      0.012     32.267      0.000         0.378     0.427
handpump         0.5342      0.016     33.079      0.000         0.503     0.566
motorpump       -0.4896      0.038    -12.98

In [62]:
dat.water_quality.isnull().sum().sum()

0

In [63]:
water_quality_dummies = pd.get_dummies(dat['water_quality'])

In [64]:
X = water_quality_dummies

In [65]:
logit = sm.Logit(y, X)

In [66]:
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.675797
         Iterations 6


# Water Quality

In [67]:
print result.summary()

                           Logit Regression Results                           
Dep. Variable:             functional   No. Observations:                59400
Model:                          Logit   Df Residuals:                    59392
Method:                           MLE   Df Model:                            7
Date:                Tue, 10 May 2016   Pseudo R-squ.:                 0.01978
Time:                        19:50:02   Log-Likelihood:                -40142.
converged:                       True   LL-Null:                       -40952.
                                        LLR p-value:                     0.000
                         coef    std err          z      P>|z|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------
coloured               0.0082      0.090      0.090      0.928        -0.169     0.185
fluoride               1.1255      0.164      6.845      0.000         0.803     1.448
fluoride abandoned  

In [68]:
dat.quantity.isnull().sum().sum()

0

In [69]:
quantity_dummies = pd.get_dummies(dat['quantity'])

In [70]:
X = quantity_dummies

In [71]:
logit = sm.Logit(y, X)

In [72]:
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.603778
         Iterations 8


# Quantity

In [73]:
print result.summary()

                           Logit Regression Results                           
Dep. Variable:             functional   No. Observations:                59400
Model:                          Logit   Df Residuals:                    59395
Method:                           MLE   Df Model:                            4
Date:                Tue, 10 May 2016   Pseudo R-squ.:                  0.1242
Time:                        19:50:06   Log-Likelihood:                -35864.
converged:                       True   LL-Null:                       -40952.
                                        LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
dry             -3.6580      0.081    -45.255      0.000        -3.816    -3.500
enough           0.6293      0.012     54.592      0.000         0.607     0.652
insufficient     0.0930      0.016      5.71

In [74]:
dat.source.isnull().sum().sum()

0

In [75]:
source_dummies = pd.get_dummies(dat['source'])

In [76]:
X = source_dummies

In [77]:
logit = sm.Logit(y, X)

In [78]:
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.679294
         Iterations 5


# Source

In [79]:
print result.summary()

                           Logit Regression Results                           
Dep. Variable:             functional   No. Observations:                59400
Model:                          Logit   Df Residuals:                    59390
Method:                           MLE   Df Model:                            9
Date:                Tue, 10 May 2016   Pseudo R-squ.:                 0.01470
Time:                        19:50:10   Log-Likelihood:                -40350.
converged:                       True   LL-Null:                       -40952.
                                        LLR p-value:                1.449e-253
                           coef    std err          z      P>|z|      [95.0% Conf. Int.]
----------------------------------------------------------------------------------------
dam                     -0.4655      0.080     -5.804      0.000        -0.623    -0.308
hand dtw                 0.2763      0.068      4.046      0.000         0.142     0.410
lake        

In [80]:
dat.waterpoint_type.isnull().sum().sum()

0

In [81]:
waterpoint_dummies = pd.get_dummies(dat['waterpoint_type'])

In [82]:
X = waterpoint_dummies

In [83]:
logit = sm.Logit(y, X)

In [84]:
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.632706
         Iterations 6


# Waterpoint

In [85]:
print result.summary()

                           Logit Regression Results                           
Dep. Variable:             functional   No. Observations:                59400
Model:                          Logit   Df Residuals:                    59393
Method:                           MLE   Df Model:                            6
Date:                Tue, 10 May 2016   Pseudo R-squ.:                 0.08228
Time:                        19:50:47   Log-Likelihood:                -37583.
converged:                       True   LL-Null:                       -40952.
                                        LLR p-value:                     0.000
                                  coef    std err          z      P>|z|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------------
cattle trough                   0.9651      0.208      4.646      0.000         0.558     1.372
communal standpipe              0.4959      0.012     40.617      0.000         