In [74]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
# !pip install category_encoders ### Uncomment this went running notebook for the first time ###
import category_encoders as ce

In [75]:
# Create the dataframes from the csv's
train = pd.read_csv('../input/train_features.csv')
test = pd.read_csv('../input/test_features.csv')
train_labels = pd.read_csv('../input/train_labels.csv')
sample_submission = pd.read_csv('../input/sample_submission.csv')

In [76]:
test_ids = test['id']

In [77]:
train = pd.merge(train, train_labels)

In [78]:
replace_values = {'functional':2, 'functional needs repair':1, 'non functional':0}
train['status_values']  = train['status_group'].replace(replace_values)

In [79]:
train = train.drop(['id','source','wpt_name', 'num_private', 'region', 
          'quantity', 'quality_group','lga','ward','management', 'payment', 
           'extraction_type_group','extraction_type_class'],axis = 1)

In [80]:
train['subvillage'] = train['subvillage'].fillna('other')
train['public_meeting'] = train['public_meeting'].fillna('Unknown')
train['scheme_name'] = train['scheme_name'].fillna('other')
train['permit'] = train['permit'].fillna('Unknown')

In [81]:
def extraction_cl(row):
    if row['extraction_type']=='gravity':
        return 'gravity'
    elif row['extraction_type']=='nira/tanira':
        return 'nira'
    elif row['extraction_type']=='submersible':
        return 'submersible'
    elif row['extraction_type']=='swn 80':
        return 'swn'
    elif row['extraction_type']=='mono':
        return 'mono'
    elif row['extraction_type']=='india mark ii':
        return 'indiamark2'
    elif row['extraction_type']=='afridev':
        return 'afridev'
    elif row['extraction_type']=='ksb':
        return 'ksb'
    elif row['extraction_type']=='windmill':
        return 'windmill'
    elif row['extraction_type']=='india mark iii':
        return 'indiamark3'
    else:
        return 'other'
train['extraction_type'] = train.apply(lambda row: extraction_cl(row), axis=1)

In [82]:
train['construction_year'] = pd.to_numeric(train['construction_year'])

In [83]:
def construction_cl(row):
    if row['construction_year'] >= 1960 and row['construction_year'] < 1970:
        return '60s'
    elif row['construction_year'] >= 1970 and row['construction_year'] < 1980:
        return '70s'
    elif row['construction_year'] >= 1980 and row['construction_year'] < 1990:
        return '80s'
    elif row['construction_year'] >= 1990 and row['construction_year'] < 2000:
        return '90s'
    elif row['construction_year'] >= 2000 and row['construction_year'] < 2010:
        return '00s'
    elif row['construction_year'] >= 2010:
        return '10s'
    else:
        return 'unknown'
    
train['construction_year'] = train.apply(lambda row: construction_cl(row), axis=1)

In [84]:
train['month'] = pd.to_datetime(train['date_recorded']).dt.month
train['date_recorded'] = pd.to_datetime(train['date_recorded'])
train['date_recorded'] = pd.datetime(2013, 12, 3) - pd.to_datetime(train['date_recorded'])
train.columns = ['days_since_recorded' if x == 'date_recorded' else x for x in train.columns]
train['days_since_recorded'] = train['days_since_recorded'].astype('timedelta64[D]').astype(int)

In [85]:
def scheme_cl(row):
    if row['scheme_management']=='VWC':
        return 'vwc'
    elif row['scheme_management']=='WUG':
        return 'wug'
    elif row['scheme_management']=='Water authority':
        return 'wtr_auth'
    elif row['scheme_management']=='WUA':
        return 'wua'
    elif row['scheme_management']=='Water Board':
        return 'wtr_brd'
    elif row['scheme_management']=='Parastatal':
        return 'Parastatal'
    elif row['scheme_management']=='Private operator':
        return 'pri_optr'
    elif row['scheme_management']=='SWC':
        return 'swc'
    elif row['scheme_management']=='Company':
        return 'company'
    elif row['scheme_management']=='Trust':
        return 'trust'
    else:
        return 'other'
train['scheme_management'] = train.apply(lambda row: scheme_cl(row), axis=1)

In [86]:
def installer_cl(row):
    if row['installer']=='DWE':
        return 'dwe'
    elif row['installer']=='Government':
        return 'gov'
    elif row['installer']=='RWE':
        return 'rwe'
    elif row['installer']=='Commu':
        return 'commu'
    elif row['installer']=='DANIDA':
        return 'danida'
    elif row['installer']=='KKKT':
        return 'kkkt'
    elif row['installer']=='Hesawa':
        return 'hesawa'
    elif row['installer']=='TCRS':
        return 'tcrs'
    elif row['installer']=='Central government':
        return 'Central government'
    else:
        return 'other'  
train['installer'] = train.apply(lambda row: installer_cl(row), axis=1)

In [87]:
def funder_cl(row):  
    if row['funder']=='Government Of Tanzania':
        return 'gov'
    elif row['funder']=='Danida':
        return 'danida'
    elif row['funder']=='Hesawa':
        return 'hesawa'
    elif row['funder']=='Rwssp':
        return 'rwssp'
    elif row['funder']=='World Bank':
        return 'world_bank'    
    elif row['funder']=='Kkkt':
        return 'Kkkt'
    elif row['funder']=='World Vision':
        return 'World Vision'
    elif row['funder']=='Unicef':
        return 'Unicef'
    elif row['funder']=='Tasaf':
        return 'Tasaf'
    elif row['funder']=='District Council':
        return 'District Council'
    else:
        return 'other'
    
train['funder'] = train.apply(lambda row: funder_cl(row), axis=1)

In [88]:
train = train.drop(['longitude', 'latitude', 'region_code', 'district_code', 'recorded_by', 'management_group', 'status_values', 'month'], axis=1)

In [89]:
train.head()

Unnamed: 0,amount_tsh,days_since_recorded,funder,gps_height,installer,basin,subvillage,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,payment_type,water_quality,quantity_group,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,6000.0,995,other,1390,other,Lake Nyasa,Mnyusi B,109,True,vwc,Roman,False,90s,gravity,annually,soft,enough,spring,groundwater,communal standpipe,communal standpipe,functional
1,0.0,272,other,1399,other,Lake Victoria,Nyamara,280,Unknown,other,other,True,10s,gravity,never pay,soft,insufficient,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,25.0,281,other,686,other,Pangani,Majengo,250,True,vwc,Nyumba ya mungu pipe scheme,True,00s,gravity,per bucket,soft,enough,dam,surface,communal standpipe multiple,communal standpipe,functional
3,0.0,309,Unicef,263,other,Ruvuma / Southern Coast,Mahakamani,58,True,vwc,other,True,80s,submersible,never pay,soft,dry,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,0.0,874,other,0,other,Lake Victoria,Kyanyamisa,0,True,other,other,True,unknown,gravity,never pay,soft,seasonal,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [90]:
test = test.drop(['longitude', 'latitude', 'region_code', 'district_code',
                  'num_private', 'id', 'payment', 'management_group', 'management', 
                  'extraction_type_class', 'extraction_type_group', 'recorded_by','region', 'lga',
                  'ward', 'wpt_name', 'quantity',
                 'quality_group', 'source'], axis=1)

In [91]:
##cleaning the columns as the training data set
test.date_recorded = pd.datetime(2013, 12, 3) - pd.to_datetime(test.date_recorded)
test.columns = ['days_since_recorded' if x=='date_recorded' else x for x in test.columns]
test.days_since_recorded = test.days_since_recorded.astype('timedelta64[D]').astype(int)

In [92]:
test.permit = test.permit.fillna('Unknown')
test.public_meeting = test.public_meeting.fillna('Unknown')

In [93]:
test['scheme_management'] = test.apply(lambda row: scheme_cl(row), axis=1)
test['construction_year'] = test.apply(lambda row: construction_cl(row), axis=1)
test['installer'] = test.apply(lambda row: installer_cl(row), axis=1)
test['funder'] = test.apply(lambda row: funder_cl(row), axis=1)
test['extraction_type'] = test.apply(lambda row: extraction_cl(row), axis=1)

In [94]:
training = train.drop('status_group', axis=1)

In [95]:
def transform_feature(df, column_name):
    unique_values = set(df[column_name].tolist())
    transformer_dict = {}
    for index, value in enumerate(unique_values):
        transformer_dict[value] = index
    df[column_name] = df[column_name].apply(lambda y: transformer_dict[y])
    return df


In [96]:
integer_columns = ['days_since_recorded', 'population','gps_height'] 
columns_to_transform = [col for col in training.columns if col not in integer_columns]
for column in columns_to_transform: 
    training = transform_feature(training, column)
    test = transform_feature(test, column)

In [97]:
## Converting the Training dataframe into a matrix and predictor as y 
X = training.as_matrix()
y = train["status_group"].tolist()

  


In [98]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    test_size = 0.3,
                                                    random_state = 0)

In [107]:
model = RandomForestClassifier(n_estimators=10000,
                               min_samples_split=6,
                               criterion='gini',
                               max_features='auto',
                               oob_score=True,
                               random_state=1,
                               n_jobs=-1)

In [108]:
model.fit(X_train, y_train)

print('Random Forest Classifier Train Accuracy Score :', model.score(X_train, y_train))
print('Random Forest Classifier Test Score :', model.score(X_test, y_test))

Random Forest Classifier Train Accuracy Score : 0.9398508898508898
Random Forest Classifier Test Score : 0.8076879910213244


In [101]:
pred = model.predict(test)
pred

array(['non functional', 'functional', 'non functional', ...,
       'functional', 'functional', 'non functional'], dtype='<U23')

In [102]:
# Create a dataframe with the Id's and the predictions.
sub = pd.DataFrame(data = {
    'id': test_ids,
    'status_group': pred
})

In [103]:
# Create an output file to submit
sub.to_csv('submission.csv', index=False)