In [48]:
# Imports used throughout the notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import pandas_profiling
import seaborn as sns; sns.set()
import category_encoders as ce
import graphviz
from sklearn.tree import export_graphviz
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline

In [49]:
# Workflow from the random forests lecture: load the CSVs, then carve out a validation set.
# All files are read from a directory relative to the notebook.
Location = "../data/tanzania/"

# merge() without `on=` joins the feature and label tables on the columns they share.
train = pd.read_csv(Location + 'train_features.csv').merge(
    pd.read_csv(Location + 'train_labels.csv')
)
test = pd.read_csv(Location + 'test_features.csv')
sample_submission = pd.read_csv(Location + 'sample_submission.csv')

# Hold out 20% of the labelled rows as a validation set. The split is stratified
# on the target column and uses a fixed seed so it is repeatable.
train, val = train_test_split(
    train,
    train_size=0.80,
    test_size=0.20,
    stratify=train['status_group'],
    random_state=42,
)

# Next comes our feature engineering function
def organize(X):
    """Apply the same feature engineering to a train, validation, or test frame.

    The input is copied first, so the caller's frame is never modified.

    Steps, in order:
      1. Mark placeholder values as missing: a latitude of -2e-08 and zeros in
         longitude, latitude, population, gps_height and construction_year
         become NaN. Nothing is imputed here; the NaNs are left for a
         downstream imputer.
      2. Parse date_recorded and derive year_recorded, month_recorded and
         day_recorded from it.
      3. Join source_class and source_type into one source_cat column
         ("<source_class>_<source_type>").
      4. Drop the columns judged duplicate or meaningless during exploration,
         plus the raw columns replaced by the engineered ones.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing every column read or dropped below.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns added and the listed columns
        removed.

    Raises
    ------
    KeyError
        If a column that is read or dropped is absent from X.
    """
    X = X.copy()

    # The notebook's exploration found a small share of latitudes stored as the
    # tiny value -2e-08; they are treated as missing, like the zero placeholders.
    X['latitude'] = X['latitude'].replace(-2e-08, np.nan)

    # In these columns a 0 stands for "unknown", not a real measurement.
    # Replace per column so only columns that actually contain zeros change dtype.
    for col in ('longitude', 'latitude', 'population', 'gps_height', 'construction_year'):
        X[col] = X[col].replace(0, np.nan)

    # `infer_datetime_format` is deprecated in recent pandas (format inference is
    # the default behaviour), so it is no longer passed.
    X['date_recorded'] = pd.to_datetime(X['date_recorded'])
    recorded = X['date_recorded'].dt
    X['year_recorded'] = recorded.year
    X['month_recorded'] = recorded.month
    X['day_recorded'] = recorded.day

    # Fuse the two source descriptors; a scalar separator is broadcast row-wise.
    X['source_cat'] = X['source_class'] + '_' + X['source_type']

    # Columns identified during exploration as duplicates of other features,
    # plus the three source columns now summarised by source_cat.
    duplicates = [
        'quantity_group', 'extraction_type_group', 'quality_group',
        'source', 'source_type', 'source_class',
    ]
    # Columns judged to carry no signal, plus the raw date now split into parts.
    meaningless = ['id', 'recorded_by', 'num_private', 'date_recorded']

    return X.drop(columns=duplicates + meaningless)

# Run all three frames through the identical engineering step, in order.
train, val, test = (organize(frame) for frame in (train, val, test))

In [50]:
# Spot-check ten randomly drawn rows of the engineered training frame
# (no random_state is passed, so the rows shown differ on every run).
train.sample(10)

Unnamed: 0,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,basin,subvillage,region,...,payment_type,water_quality,quantity,waterpoint_type,waterpoint_type_group,status_group,year_recorded,month_recorded,day_recorded,source_cat
25657,200.0,Government Of Tanzania,348.0,DWE,36.841166,-9.031493,Bahatinasibu,Rufiji,Mkoroshini,Morogoro,...,monthly,soft,enough,hand pump,hand pump,functional,2011,3,5,groundwater_shallow well
9743,0.0,Rwssp,,DWE,32.540098,-3.83414,Shuleni,Lake Victoria,Seke,Shinyanga,...,unknown,milky,insufficient,other,other,functional,2013,1,26,groundwater_shallow well
16332,500.0,Women For Partnership,1712.0,District Council,37.92721,-4.411243,Hemkisinga,Pangani,Mturo,Kilimanjaro,...,on failure,soft,insufficient,communal standpipe,communal standpipe,functional,2013,2,14,groundwater_spring
34076,500.0,Shipo,1618.0,Shipo,34.878366,-8.835866,Kwa Festo Mlonganile,Rufiji,Ibumila,Iringa,...,on failure,soft,enough,hand pump,hand pump,functional,2011,8,3,groundwater_borehole
25564,0.0,Hesawa,1391.0,HESAWA,34.677965,-2.137495,Makoro,Lake Victoria,Senta,Mara,...,never pay,salty,insufficient,communal standpipe multiple,communal standpipe,non functional,2013,3,6,groundwater_borehole
38211,0.0,Ministry Of Water,856.0,Wizara ya maji,35.120394,-5.87848,Kwa Mwakigelelo Hamis,Internal,Chinyika,Singida,...,never pay,salty,enough,communal standpipe multiple,communal standpipe,non functional,2013,1,18,groundwater_borehole
4044,0.0,,,,34.137867,-8.797847,Kwa Mzee Shomali,Rufiji,Mbuyuni,Mbeya,...,monthly,soft,seasonal,communal standpipe,communal standpipe,functional needs repair,2011,4,9,surface_river/lake
32859,0.0,Private Individual,-32.0,Abdallah Ally Wazir,38.974922,-5.423863,Kwa Abdallah Ally Wazir,Pangani,Kinarani,Tanga,...,never pay,salty,insufficient,other,other,non functional,2011,3,13,groundwater_shallow well
12394,0.0,District Council,282.0,District water department,39.120803,-9.903242,Nambalwe,Ruvuma / Southern Coast,Kariakoo,Lindi,...,never pay,soft,enough,communal standpipe,communal standpipe,functional,2013,1,25,groundwater_spring
35795,0.0,Twe,1637.0,TWE,35.037152,-9.425883,none,Rufiji,Kanisani,Iringa,...,never pay,soft,enough,communal standpipe,communal standpipe,functional,2011,3,5,groundwater_spring


In [75]:
# Choose the modelling features.
# The column we are trying to predict:
target = 'status_group'

# Every remaining training column except the target.
trainfeat = train.drop(columns=target)

# Numeric columns are kept as they are.
numfeat = list(trainfeat.select_dtypes(include='number').columns)

# Count the distinct values in each non-numeric column...
cardinality = trainfeat.select_dtypes(exclude='number').nunique()

# ...and keep only the categorical columns with at most 50 distinct values.
catfeat = [column for column, n_unique in cardinality.items() if n_unique <= 50]

# Final feature list: numeric columns first, then the low-cardinality categoricals.
feats = [*numfeat, *catfeat]

In [52]:
# Inspect the distinct-value count of each non-numeric feature, smallest first
cardinality.sort_values()

permit                       2
public_meeting               2
quantity                     5
management_group             5
waterpoint_type_group        6
waterpoint_type              7
payment_type                 7
payment                      7
extraction_type_class        7
source_cat                   7
water_quality                8
basin                        9
management                  12
scheme_management           12
extraction_type             18
region                      21
lga                        124
funder                    1716
installer                 1929
ward                      2082
scheme_name               2563
subvillage               17231
wpt_name                 30661
dtype: int64

In [53]:
# Show the categorical features kept by the cardinality <= 50 filter
catfeat

['basin',
 'region',
 'public_meeting',
 'scheme_management',
 'permit',
 'extraction_type',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quantity',
 'waterpoint_type',
 'waterpoint_type_group',
 'source_cat']

In [54]:
# Show the combined feature list: numeric columns followed by the kept categoricals
feats

['amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'region_code',
 'district_code',
 'population',
 'construction_year',
 'year_recorded',
 'month_recorded',
 'day_recorded',
 'basin',
 'region',
 'public_meeting',
 'scheme_management',
 'permit',
 'extraction_type',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quantity',
 'waterpoint_type',
 'waterpoint_type_group',
 'source_cat']

In [55]:
# Number of features selected for modelling
len(feats)

27

In [76]:
# Baseline model built from Pipeline, OrdinalEncoder and SimpleImputer.
# Features matrix and target vector for train and validation; the test frame
# contributes features only.
xtrain, ytrain = train[feats], train[target]
xval, yval = val[feats], val[target]
xtest = test[feats]

# Pipeline stages: ordinal encoder -> median imputer -> depth-10 decision tree.
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    DecisionTreeClassifier(random_state=42, max_depth=10),
)

# Fit on the training split and report accuracy on the validation split.
pipeline.fit(X=xtrain, y=ytrain)
print('Val Acc:', pipeline.score(X=xval, y=yval))

Val Acc: 0.7484006734006734


In [97]:
# Swap the single tree for a random forest: 100 trees, depth capped at 20,
# fixed seed, all available cores. This pipeline imputes with the column mean.
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        n_jobs=-1,
        random_state=42,
    ),
)

# Fit on train, report accuracy on train and validation, then predict the test set.
pipeline.fit(X=xtrain, y=ytrain)
print('Train Acc:', pipeline.score(X=xtrain, y=ytrain))
print('Val Acc:', pipeline.score(X=xval, y=yval))
ypred = pipeline.predict(xtest)

# Submission export is disabled; uncomment the lines below to write the CSV.
# submission = sample_submission.copy()
# submission['status_group'] = ypred
# submission.to_csv('rfsubmission.csv', index=False)

Train Acc: 0.9575126262626262
Val Acc: 0.812037037037037
