In [1]:
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper, gen_features
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, KBinsDiscretizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from IPython.display import FileLink, FileLinks
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

WEB = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Tree-Ensembles/master/data/tanzania/'
source = WEB

train = pd.merge(pd.read_csv(source + 'train_features.csv'),
                 pd.read_csv(source + 'train_labels.csv'))

test = pd.read_csv(source + 'test_features.csv')
sample_submission = pd.read_csv(source + 'sample_submission.csv')

train, val = train_test_split(train, train_size = 0.80, test_size = 0.20,
                             stratify = train['status_group'], random_state = 42)

def wrangle(X):
    X = X.copy()
    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format = True)
    X['year_recorded'] = X['date_recorded'].dt.year
    X['month_recorded'] = X['date_recorded'].dt.month
    X['day_recorded'] = X['date_recorded'].dt.day
    X = X.drop(columns = 'date_recorded')
    X['years'] = X['year_recorded'] - X['construction_year']
    X.drop(columns = ['recorded_by', 'id'])
    duplicate_columns = ['quantity_group']
    X = X.drop(columns = duplicate_columns)
    X['latitude'] = X['latitude'].replace(-2e-08, np.nan)
    cols_with_zeros = ['construction_year', 'longitude', 'latitude', 'gps_height', 'population']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
    return X

train = wrangle(train)
val = wrangle(val)
test = wrangle(test)   

target = 'status_group'
train_features = train.drop(columns = [target])
features = train_features.columns.tolist()

X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]

pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy = 'mean'),
    RandomForestClassifier(n_estimators = 100,
                          n_jobs = -1)
)


pipeline.fit(X_train, y_train)
print(f'Validation Score: {pipeline.score(X_val, y_val):.3f}')

Validation Score: 0.811


In [None]:
y_pred = pipeline.predict(X_test)

In [2]:
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper, gen_features
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, KBinsDiscretizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from IPython.display import FileLink, FileLinks
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

In [3]:
WEB = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Tree-Ensembles/master/data/tanzania/'
source = WEB

train = pd.merge(pd.read_csv(source + 'train_features.csv'),
                 pd.read_csv(source + 'train_labels.csv'))

test = pd.read_csv(source + 'test_features.csv')
sample_submission = pd.read_csv(source + 'sample_submission.csv')

train, val = train_test_split(train, train_size = 0.80, test_size = 0.20,
                             stratify = train['status_group'], random_state = 42)

def wrangle(X):
    X = X.copy()
    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format = True)
    X['year_recorded'] = X['date_recorded'].dt.year
    X['month_recorded'] = X['date_recorded'].dt.month
    X['day_recorded'] = X['date_recorded'].dt.day
    X = X.drop(columns = ['date_recorded', 'id', 'recorded_by'])
    X['years'] = X['year_recorded'] - X['construction_year']
    duplicate_columns = ['quantity_group']
    X = X.drop(columns = duplicate_columns)
    X['latitude'] = X['latitude'].replace(-2e-08, np.nan)
    cols_with_zeros = ['construction_year', 'longitude', 'latitude', 'gps_height', 'population']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
    return X

train = wrangle(train)
val = wrangle(val)
test = wrangle(test)   

target = 'status_group'
train_features = train.drop(columns = [target])
features = train_features.columns.tolist()

X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]

In [4]:
BinaryEncoder = ce.BinaryEncoder
OrdinalEncoder = ce.OrdinalEncoder
OneHotEncoder = ce.OneHotEncoder

In [9]:
impute_missing_binary = gen_features(
    columns = [['funder'], ['installer'], ['scheme_name']],
    classes = [{'class': SimpleImputer, 'strategy': 'constant', 'fill_value': 'missing'},
               BinaryEncoder]
)
impute_mean = gen_features(
    columns = [['longitude'], ['latitude'], ['gps_height'], ['construction_year'], ['population']],
    classes = [{'class': SimpleImputer, 'strategy': 'mean'}]
)
onehot_encode = gen_features(
    columns = [['quantity'], ['waterpoint_type'], ['extraction_type_class'], ['waterpoint_type_group'],
              ['extraction_type_group'], ['basin'], ['extraction_type'], ['extraction_type_group'],
              ['management'], ['management_group'], ['payment'], ['payment_type'], ['quality_group'],
              ['region'], ['source'], ['source_class'], ['source_type'], ['water_quality'],
              ['waterpoint_type']],
    classes = [OneHotEncoder]
)
ordinal_encode = gen_features(
    columns = [['lga']],
    classes = [OrdinalEncoder]
)
binary_encode = gen_features(
    columns = [['wpt_name'], ['subvillage'], ['ward']],
    classes = [BinaryEncoder]
)
no_encode = gen_features(
    columns = [['day_recorded'], ['years'], ['amount_tsh'], ['district_code'], ['month_recorded'], 
              ['num_private'], ['region_code'], ['year_recorded']],
    classes = None
)

mapped_features = (impute_missing_binary + impute_mean + onehot_encode + 
                  ordinal_encode + binary_encode + no_encode)

mapper = DataFrameMapper(mapped_features, df_out = True)

In [10]:
pipeline = make_pipeline(
    mapper,
    RandomForestClassifier(n_estimators = 100,
                          n_jobs = -1)
)
pipeline.fit(X_train, y_train)
print(f'Validation Score: {pipeline.score(X_val, y_val):.3f}')

Validation Score: 0.814


In [14]:
y_pred = pipeline.predict(X_test)

submission = sample_submission.copy()
submission['status_group'] = y_pred
submission.to_csv('rgolds-submission-10.csv', index = False)

from IPython.display import FileLink, FileLinks
FileLinks('.') #lists all downloadable files on server

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
num_impute_features = ['construction_year', 'gps_height', 'latitude', 'longitude', 'population']
catimpute_binary_features = ['funder', 'installer', 'scheme_name']
catimpute_onehot_features = ['scheme_management']
one_hot_features = ['basin', 'extraction_type', 'extraction_type_class', 'extraction_type_group', 'management',
                   'management_group', 'payment', 'payment_type', 'quality_group', 'quantity', 'region', 
                   'source', 'source_class', 'source_type', 'water_quality', 
                   'waterpoint_type', 'waterpoint_type_group']
ordinal_features = ['lga']
binary_features = ['subvillage', 'ward', 'wpt_name']

In [None]:
catimpute_binary_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
    ('binary', ce.BinaryEncoder())
])
catimpute_onehot_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
    ('onehot', ce.OneHotEncoder(use_cat_names = True))
])
num_impute_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'mean'))
])
one_hot_transformer = Pipeline(steps = [
    ('onehot', ce.OneHotEncoder(use_cat_names = True))
])
ordinal_transformer = Pipeline(steps = [
    ('ordinal', ce.OrdinalEncoder())
])
binary_transformer = Pipeline(steps = [
    ('binary', ce.BinaryEncoder())
])

In [None]:
preprocessor = ColumnTransformer(
    transformers = [
        ('catimpute_binary', catimpute_binary_transformer, catimpute_binary_features),
        ('catimpute_onehot', catimpute_onehot_transformer, catimpute_onehot_features),
        ('num_impute', num_impute_transformer, num_impute_features),
        ('onehot', one_hot_transformer, one_hot_features),
        ('ordinal', ordinal_transformer, ordinal_features),
        ('binary', binary_transformer, binary_features)],
    remainder = 'passthrough')

In [None]:
pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor),
       ])