In [1]:
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning, module='numpy')

In [2]:
# 1. Load the data

import pandas as pd

train_features = pd.read_csv('../data/tanzania/train_features.csv')
train_labels = pd.read_csv('../data/tanzania/train_labels.csv')
test_features = pd.read_csv('../data/tanzania/test_features.csv')
sample_submission = pd.read_csv('../data/tanzania/sample_submission.csv')

assert train_features.shape == (59400, 40)
assert train_labels.shape == (59400, 2)
assert test_features.shape == (14358, 40)
assert sample_submission.shape == (14358, 2)

In [4]:
train_features.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [5]:
train_labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [6]:
test_features.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,...,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
3,45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,...,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
4,49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,...,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


In [7]:
# 2. check for na values

train_features.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [8]:
test_features.isna().sum()

id                          0
amount_tsh                  0
date_recorded               0
funder                    783
gps_height                  0
installer                 788
longitude                   0
latitude                    0
wpt_name                    0
num_private                 0
basin                       0
subvillage                 94
region                      0
region_code                 0
district_code               0
lga                         0
ward                        0
population                  0
public_meeting            785
recorded_by                 0
scheme_management         939
scheme_name              6839
permit                    663
construction_year           0
extraction_type             0
extraction_type_group       0
extraction_type_class       0
management                  0
management_group            0
payment                     0
payment_type                0
water_quality               0
quality_group               0
quantity  

In [20]:
# 3. figure out how you will fill them in, then fill them in

train_features.funder.value_counts()[:5]

Government Of Tanzania    12719
Danida                     3114
Hesawa                     2202
Rwssp                      1374
World Bank                 1349
Name: funder, dtype: int64

In [15]:
import numpy as np

train_features.funder.fillna('Government Of Tanzania', inplace=True)
train_features.funder.isna().sum()

0

In [16]:
test_features.funder.fillna('Government Of Tanzania', inplace=True)
test_features.funder.isna().sum()

0

In [18]:
test_features.gps_height.value_counts()[:5]

 0       4978
-19        18
 1294      18
-16        16
 1343      16
Name: gps_height, dtype: int64

In [27]:
avg_height = test_features.gps_height.sum() / len(test_features.gps_height)
test_features.gps_height.replace(0, avg_height, inplace=True)
train_features.gps_height.replace(0, avg_height, inplace=True)
test_features.gps_height.value_counts()[:5]

 653.6363     4978
-19.0000        18
 1294.0000      18
 1343.0000      16
-16.0000        16
 1285.0000      16
 1301.0000      15
 1293.0000      15
 1373.0000      15
 1312.0000      14
 1362.0000      14
 1270.0000      14
 1183.0000      14
 1283.0000      14
 1337.0000      14
 1254.0000      14
 1341.0000      14
 1306.0000      14
-11.0000        14
-14.0000        13
 1309.0000      13
 1280.0000      13
 1271.0000      13
 1249.0000      13
 272.0000       13
 1666.0000      13
-12.0000        13
 258.0000       13
 1431.0000      13
-13.0000        13
              ... 
 59.0000         4
 120.0000        4
 946.0000        4
 1101.0000       4
 1511.0000       4
 80.0000         4
 954.0000        4
 241.0000        4
 256.0000        4
 504.0000        4
-43.0000         4
 425.0000        4
 1247.0000       4
 48.0000         4
 1813.0000       4
 1082.0000       4
 1536.0000       4
 1959.0000       4
 914.0000        4
 1046.0000       4
 1576.0000       4
 45.0000    

In [31]:
test_features.installer.value_counts()[:5]

DWE           4162
Government     450
RWE            288
Commu          273
DANIDA         254
Name: installer, dtype: int64

In [34]:
test_features.installer.fillna('DWE', inplace=True)
train_features.installer.fillna('DWE', inplace=True)
test_features.installer.isna().sum()

0

In [None]:
# 4. split your data into features and target
# 5. train test split once to get X_train, X_test, y_train, y_test
# 6. Split your X_train and y_train into X_train, X_val, y_train, y_val
# 7. Fit your model with the train X_train and y_train
# 8. predict with your X_val
# 9. Use a validation metric(r2, rsme, etc) with you new predicted ys and the actual data points(y_val)
# 10. Tweak model parameters to see if you can get a better score, possibly tweak the way you filled in your na values. Also try using more or less features.
# 11. After tweaking your model as much as you want, use the model with the best score and predict on X_test.
# 12. Use the metric you have been using and see what the score is for your test.
# Extra tips:
# a. After step 12, if you tweak your model more, you will be adding bias into it due to making changes based off the test scores.
# b. While tweaking your scores, if your train score is better than your val score, you are underfitting
# c. if your train score is lower than your val score you are overfitting.