In [1]:
%matplotlib inline
import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeClassifier

LOCAL = '../data/tanzania/'
WEB = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Tree-Ensembles/master/data/tanzania/'
source = LOCAL

train = pd.merge(pd.read_csv(source + 'train_features.csv'), pd.read_csv(source + 'train_labels.csv'))

test = pd.read_csv(source + 'test_features.csv')
sample_submission = pd.read_csv(source + 'sample_submission.csv')

train, val = train_test_split(train, train_size=0.80, test_size=0.20, stratify=train['status_group'], random_state=42)


def wrangle(X):
    """Wrangles train, validate, and test sets in the same way"""
    X = X.copy()

    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    
    X['year_recorded'] = X['date_recorded'].dt.year
    X['month_recorded'] = X['date_recorded'].dt.month
    X['day_recorded'] = X['date_recorded'].dt.day
    X = X.drop(columns='date_recorded')
    
    X['years'] = X['year_recorded'] - X['construction_year']    
    
    X = X.drop(columns=['recorded_by', 'id'])
    
    duplicate_columns = ['quantity_group']
    X = X.drop(columns=duplicate_columns)
    
    X['latitude'] = X['latitude'].replace(-2e-08, np.nan)
    
    cols_with_zeros = ['construction_year', 'longitude', 'latitude', 'gps_height', 'population']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)

    return X


train = wrangle(train)
val = wrangle(val)
test = wrangle(test)

In [2]:
train.head()

Unnamed: 0,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,...,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,year_recorded,month_recorded,day_recorded,years
43360,0.0,,,,33.542898,-9.174777,Kwa Mzee Noa,0,Lake Nyasa,Mpandapanda,...,spring,spring,groundwater,communal standpipe,communal standpipe,functional,2011,7,27,2011
7263,500.0,Rc Church,2049.0,ACRA,34.66576,-9.308548,Kwa Yasinta Ng'Ande,0,Rufiji,Kitichi,...,spring,spring,groundwater,communal standpipe,communal standpipe,functional,2011,3,23,3
2486,25.0,Donor,290.0,Do,38.238568,-6.179919,Kwasungwini,0,Wami / Ruvu,Kwedigongo,...,shallow well,shallow well,groundwater,hand pump,hand pump,functional,2011,3,7,1
313,0.0,Government Of Tanzania,,DWE,30.716727,-1.289055,Kwajovin 2,0,Lake Victoria,Kihanga,...,shallow well,shallow well,groundwater,other,other,non functional,2011,7,31,2011
52726,0.0,Water,,Gove,35.389331,-6.399942,Chama,0,Internal,Mtakuj,...,machine dbh,borehole,groundwater,communal standpipe,communal standpipe,functional,2011,3,10,2011


In [3]:
target = 'status_group';
train_features = train.drop(columns=[target]);

numeric_features = train_features.select_dtypes(include='number').columns.tolist();

cardinality = train_features.select_dtypes(exclude='number').nunique();

categorical_features = cardinality[cardinality <= 50].index.tolist();

features = numeric_features + categorical_features;

In [4]:
x_train = train[features];
y_train = train[target];
x_val = val[features];
y_val = val[target];

x_test = test[features];

pipeline= make_pipeline(
    ce.OneHotEncoder(use_cat_names=True),
    SimpleImputer(strategy='median'),
    DecisionTreeClassifier(max_depth=20, random_state=42)
);
pipeline.fit(x_train, y_train);

print('Val Score:', pipeline.score(x_val, y_val));
y_pred = pipeline.predict(x_test);

submission = sample_submission.copy();
submission['status_group'] = y_pred;
submission.to_csv('submission-03.csv', index=False);

Val Score: 0.7712121212121212


In [5]:
from sklearn.ensemble import RandomForestClassifier;

pipeline2 = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True), 
    SimpleImputer(strategy='median'), 
    RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
);
pipeline2.fit(x_train, y_train);

print('Val Score:', pipeline2.score(x_val, y_val));
y_pred2 = pipeline2.predict(x_test);

submission2 = sample_submission.copy();
submission2['status_group'] = y_pred2;
submission2.to_csv('submission-04.csv', index=False);

Val Score: 0.8087542087542088


In [6]:
encoder = pipeline2.named_steps['onehotencoder'];
encoded_shape = encoder.transform(x_train).shape;
print(encoded_shape, x_train.shape);

(47520, 183) (47520, 33)


# Ordinal Encoder

In [7]:
x_train2 = train.drop(columns=[target]);
y_train2 = train[target];

x_val2 = val.drop(columns=[target]);
y_val2 = val[target];

x_test = test;

In [8]:
pipeline3 = make_pipeline(
    ce.OrdinalEncoder(), 
    SimpleImputer(strategy='median'), 
    RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
);

pipeline3.fit(x_train2, y_train2);
print('Val Acc:', pipeline3.score(x_val2, y_val2));

Val Acc: 0.8104377104377104


In [9]:
encoder2 = pipeline3.named_steps['ordinalencoder'];
encoded_shape2 = encoder2.transform(x_train2).shape;
print(encoded_shape2, x_train2.shape)

(47520, 40) (47520, 40)
