In [86]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score,accuracy_score
from sklearn.pipeline import Pipeline
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [38]:
LOCAL = '../data/tanzania/'
WEB = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/'

train_features = pd.read_csv(WEB + 'train_features.csv')
train_labels = pd.read_csv(WEB + 'train_labels.csv')
test_features = pd.read_csv(WEB + 'test_features.csv')
sample_submission = pd.read_csv(WEB + 'sample_submission.csv')

assert train_features.shape == (59400, 40)
assert train_labels.shape == (59400, 2)
assert test_features.shape == (14358, 40)
assert sample_submission.shape == (14358, 2)

In [39]:
train_features.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group'],
      dtype='object')

In [40]:
train_features.drop(['id', 'amount_tsh','num_private','quantity_group','recorded_by'],axis =1)

Unnamed: 0,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,basin,subvillage,region,...,payment,payment_type,water_quality,quality_group,quantity,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322e+00,none,Lake Nyasa,Mnyusi B,Iringa,...,pay annually,annually,soft,good,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466e+00,Zahanati,Lake Victoria,Nyamara,Mara,...,never pay,never pay,soft,good,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329e+00,Kwa Mahundi,Pangani,Majengo,Manyara,...,pay per bucket,per bucket,soft,good,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,2013-01-28,Unicef,263,UNICEF,38.486161,-1.115530e+01,Zahanati Ya Nanyumbu,Ruvuma / Southern Coast,Mahakamani,Mtwara,...,never pay,never pay,soft,good,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359e+00,Shuleni,Lake Victoria,Kyanyamisa,Kagera,...,never pay,never pay,soft,good,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
5,2011-03-13,Mkinga Distric Coun,0,DWE,39.172796,-4.765587e+00,Tajiri,Pangani,Moa/Mwereme,Tanga,...,pay per bucket,per bucket,salty,salty,enough,other,other,unknown,communal standpipe multiple,communal standpipe
6,2012-10-01,Dwsp,0,DWSP,33.362410,-3.766365e+00,Kwa Ngomho,Internal,Ishinabulandi,Shinyanga,...,never pay,never pay,soft,good,enough,machine dbh,borehole,groundwater,hand pump,hand pump
7,2012-10-09,Rwssp,0,DWE,32.620617,-4.226198e+00,Tushirikiane,Lake Tanganyika,Nyawishi Center,Shinyanga,...,unknown,unknown,milky,milky,enough,shallow well,shallow well,groundwater,hand pump,hand pump
8,2012-11-03,Wateraid,0,Water Aid,32.711100,-5.146712e+00,Kwa Ramadhan Musa,Lake Tanganyika,Imalauduki,Tabora,...,never pay,never pay,salty,salty,seasonal,machine dbh,borehole,groundwater,hand pump,hand pump
9,2011-08-03,Isingiro Ho,0,Artisan,30.626991,-1.257051e+00,Kwapeto,Lake Victoria,Mkonomre,Kagera,...,never pay,never pay,soft,good,enough,shallow well,shallow well,groundwater,hand pump,hand pump


In [41]:
train_features['date_recorded'] = pd.to_datetime(train_features['date_recorded'], infer_datetime_format=True)

In [42]:
 # Extract components from date_recorded, then drop the original column
train_features['year_recorded'] = train_features['date_recorded'].dt.year
train_features['month_recorded'] = train_features['date_recorded'].dt.month
train_features['day_recorded'] = train_features['date_recorded'].dt.day
train_features = train_features.drop(columns='date_recorded')

In [43]:
# from pandas profiling we have 10 numerical and 26 categorical features. 2 boolean 

bool_features = ['public_meeting','permit']
categorical_features = train_features.select_dtypes(exclude=[np.number]).columns.to_list()
numerical_features = train_features.select_dtypes(include=[np.number]).columns.to_list()



In [47]:
len(numerical_features)

13

In [48]:
#Remove boolean values from categorical_features
categorical_features = list(set(categorical_features) - set(bool_features))

In [49]:
# find if there are any NANs in categorical_features 
train_features[categorical_features].isnull().values.any()

True

In [50]:
len(categorical_features)

27

In [51]:
#Since null values are only in categorical features, 
#I will find the top values in each and replace NANs with that value
def clean_data(lis):
    for element in lis:
        top = train_features[element].describe().to_list()[2]
        train_features[element]=train_features[element].fillna(top)

In [52]:
clean_data(categorical_features)
# No NaNs now
train_features[categorical_features].isnull().values.any()

False

In [53]:
#Split Training and Validation data

X_train = train_features
y_train = train_labels['status_group']

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.80, test_size=0.20,
    stratify=y_train, random_state=42)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((47520, 42), (11880, 42), (47520,), (11880,))

In [61]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(sparse=True, handle_unknown='ignore'))]) 
boolean_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('boo', boolean_transformer, bool_features),
        ('one', onehot_transformer, categorical_features)]) 
       

In [68]:
dt = DecisionTreeClassifier(random_state=0)

In [75]:
classification = Pipeline(steps=[('preprocessor', preprocessor),
                      ('dt', DecisionTreeClassifier(max_depth=25))])

In [76]:
classification.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [77]:
print("Validation score: %.3f" % classification.score(X_val, y_val))

Validation score: 0.784


In [81]:
y_pred = classification.predict(test_features)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [82]:
len(y_pred)

14358

In [83]:
# Write submission csv file
submission = sample_submission.copy()
submission['status_group'] = y_pred
submission.to_csv('submission-03.csv', index=False)

In [88]:
rf = RandomForestClassifier(n_estimators=200)

In [106]:
classification = Pipeline(steps=[('preprocessor', preprocessor),
                      ('rf', RandomForestClassifier(max_depth=300,random_state=5))])

In [107]:
classification.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose...imators=10, n_jobs=None,
            oob_score=False, random_state=5, verbose=0, warm_start=False))])

In [108]:
print("Validation score: %.3f" % classification.score(X_val, y_val))

Validation score: 0.797


In [109]:
y_pred = classification.predict(test_features)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [110]:
# Write submission csv file
submission = sample_submission.copy()
submission['status_group'] = y_pred
submission.to_csv('submission-05.csv', index=False)