In [112]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

### Train Label EDA

In [96]:
y = pd.read_csv('../data/training_set_labels.csv')

In [97]:
y.head(10)

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional
5,9944,functional
6,19816,non functional
7,54551,non functional
8,53934,non functional
9,46144,functional


In [98]:
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            59400 non-null  int64 
 1   status_group  59400 non-null  object
dtypes: int64(1), object(1)
memory usage: 928.2+ KB


In [99]:
y.describe(include='all')

Unnamed: 0,id,status_group
count,59400.0,59400
unique,,3
top,,functional
freq,,32259
mean,37115.131768,
std,21453.128371,
min,0.0,
25%,18519.75,
50%,37061.5,
75%,55656.5,


In [100]:
y['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

### Train values EDA

In [119]:
X = pd.read_csv('../data/training_set_values.csv')

In [102]:
X.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [103]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [104]:
X.isnull().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

### Numeric column EDA

In [105]:
num_cols = X.select_dtypes('number').columns
display(num_cols)
print(f'Object column count: {len(num_cols)}')

Index(['id', 'amount_tsh', 'gps_height', 'longitude', 'latitude',
       'num_private', 'region_code', 'district_code', 'population',
       'construction_year'],
      dtype='object')

Object column count: 10


### Object column EDA

In [106]:
obj_cols = X.select_dtypes('object').columns
display(obj_cols)
print(f'Object column count: {len(obj_cols)}')

Index(['date_recorded', 'funder', 'installer', 'wpt_name', 'basin',
       'subvillage', 'region', 'lga', 'ward', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_quality',
       'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
       'source_class', 'waterpoint_type', 'waterpoint_type_group'],
      dtype='object')

Object column count: 30


In [107]:
print(len(X['basin'].unique()))
X['basin'].value_counts()

9


Lake Victoria              10248
Pangani                     8940
Rufiji                      7976
Internal                    7785
Lake Tanganyika             6432
Wami / Ruvu                 5987
Lake Nyasa                  5085
Ruvuma / Southern Coast     4493
Lake Rukwa                  2454
Name: basin, dtype: int64

In [108]:
for i in obj_cols:
    print('{:<25} {}'.format(i, len(X[i].unique())))

date_recorded             356
funder                    1898
installer                 2146
wpt_name                  37400
basin                     9
subvillage                19288
region                    21
lga                       125
ward                      2092
public_meeting            3
recorded_by               1
scheme_management         13
scheme_name               2697
permit                    3
extraction_type           18
extraction_type_group     13
extraction_type_class     7
management                12
management_group          5
payment                   7
payment_type              7
water_quality             8
quality_group             6
quantity                  5
quantity_group            5
source                    10
source_type               7
source_class              3
waterpoint_type           7
waterpoint_type_group     6


In [109]:
print(len(X['funder'].unique()))
X['funder'].value_counts().sum()

1898


55765

In [110]:
X['funder'].isnull().sum()

3635

In [111]:
X['funder'].str.lower()
# def clean_text(columns):
    

0                                 roman
1                               grumeti
2                          lottery club
3                                unicef
4                           action in a
5                   mkinga distric coun
6                                  dwsp
7                                 rwssp
8                              wateraid
9                           isingiro ho
10                              private
11                               danida
12                         world vision
13              lawatefuka water supply
14                                biore
15                                rudep
16                               unicef
17                               unicef
18                               hesawa
19                               danida
20                                  twe
21                                 dwsp
22                               unicef
23                                  isf
24             african development bank


## Random Forest Model

In [126]:
X_rf = X.drop(['date_recorded','funder','installer','wpt_name','subvillage','lga','ward','scheme_name'], axis=1)
X_rf.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,basin,region,region_code,district_code,population,public_meeting,recorded_by,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,1390,34.938093,-9.856322,0,Lake Nyasa,Iringa,11,5,109,True,GeoData Consultants Ltd,VWC,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,1399,34.698766,-2.147466,0,Lake Victoria,Mara,20,2,280,,GeoData Consultants Ltd,Other,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,686,37.460664,-3.821329,0,Pangani,Manyara,21,4,250,True,GeoData Consultants Ltd,VWC,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,263,38.486161,-11.155298,0,Ruvuma / Southern Coast,Mtwara,90,63,58,True,GeoData Consultants Ltd,VWC,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,0,31.130847,-1.825359,0,Lake Victoria,Kagera,18,1,0,True,GeoData Consultants Ltd,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [127]:
obj_cols_rf = X_rf.select_dtypes('object').columns
for i in obj_cols_rf:
    print('{:<25} {}'.format(i, len(X_rf[i].unique())))

basin                     9
region                    21
public_meeting            3
recorded_by               1
scheme_management         13
permit                    3
extraction_type           18
extraction_type_group     13
extraction_type_class     7
management                12
management_group          5
payment                   7
payment_type              7
water_quality             8
quality_group             6
quantity                  5
quantity_group            5
source                    10
source_type               7
source_class              3
waterpoint_type           7
waterpoint_type_group     6


In [122]:
X_train, y_train, X_val, y_val = train_test_split(X_rf, y, random_state=42, test_size = 0.20)

In [124]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [125]:
rf.fit(X_train, y_train)

ValueError: could not convert string to float: 'Internal'