In [1]:
# importing required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ignoring annoying warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# importing features and target files and merging in one data file
data_features = pd.read_csv('train_values.csv')
data_target = pd.read_csv('train_labels.csv')
data = data_features.merge(data_target, on='id')

In [3]:
# exporting merged data to double check algorithm using Azure Machine Learning Studio
#data.to_csv('data_azure.csv', index=False)

In [4]:
# brief summary of the data types and number of values per column
#data.info()

In [5]:
# we have both numeric and categorical type of data

In [6]:
data.nunique()

id                       59400
amount_tsh                  98
date_recorded              356
funder                    1897
gps_height                2428
installer                 2145
longitude                57516
latitude                 57517
wpt_name                 37400
num_private                 65
basin                        9
subvillage               19287
region                      21
region_code                 27
district_code               20
lga                        125
ward                      2092
population                1049
public_meeting               2
recorded_by                  1
scheme_management           12
scheme_name               2696
permit                       2
construction_year           55
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_

In [7]:
# dropping all columns that bring no value - id, date_recorded, wpt_name, recorded_by
data=data.drop(['id','date_recorded', 'funder', 'wpt_name', 'recorded_by', 'subvillage', 'num_private', 'scheme_name', 'quantity_group', 'payment'], axis=1)
# scheme_name dropped due to 50% missing values

In [8]:
data.columns

Index(['amount_tsh', 'gps_height', 'installer', 'longitude', 'latitude',
       'basin', 'region', 'region_code', 'district_code', 'lga', 'ward',
       'population', 'public_meeting', 'scheme_management', 'permit',
       'construction_year', 'extraction_type', 'extraction_type_group',
       'extraction_type_class', 'management', 'management_group',
       'payment_type', 'water_quality', 'quality_group', 'quantity', 'source',
       'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group', 'status_group'],
      dtype='object')

In [9]:
# collecting numeric columns for later preprocessing and exploratory analysis
numeric = [col for col in data if data[col].dtypes !='object']

# we will also drop region_code and district code, because they are categorical
numeric.remove('region_code')
numeric.remove('district_code')

# and we also drop num_private since it looks more like a category and has no description at all
#numeric.remove('num_private')

In [10]:
numeric

['amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'population',
 'construction_year']

In [11]:
(data['construction_year'] == 0).value_counts()

False    38691
True     20709
Name: construction_year, dtype: int64

In [12]:
for num in data[numeric].columns:
    data[num] = data[num].replace(0, data[num].median())

In [13]:
#data['construction_year'] = data['construction_year'].replace(0, data['construction_year'].median())

In [14]:
#data['amount_tsh'] = data['amount_tsh'].replace(0, data['amount_tsh'].median())

In [15]:
#data['source_class'] = data['source_class'].replace('unknown', data['source_class'].mode())

In [16]:
#data['longitude'] = data['longitude'].replace(0, data['longitude'].median())

In [17]:
#data['latitude'] = data['latitude'].replace(0, data['latitude'].median())

In [18]:
#data['gps_height'] = data['gps_height'].replace(0, data['gps_height'].median())

In [19]:
#data['population'] = data['population'].replace(0, data['population'].median())

In [20]:
# collecting categorical columns for later preprocessing and exploratory analysis directly by dropping numeric columns
categorical = data.drop(numeric, axis=1).columns

In [21]:
categorical

Index(['installer', 'basin', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'public_meeting', 'scheme_management', 'permit',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment_type', 'water_quality',
       'quality_group', 'quantity', 'source', 'source_type', 'source_class',
       'waterpoint_type', 'waterpoint_type_group', 'status_group'],
      dtype='object')

In [22]:
for cat in data[categorical].columns:
    data[cat] = data[cat].replace(0, data[cat].mode())

In [23]:
# checking the number of missing values per numeric feature
data[numeric].isnull().sum()

amount_tsh           0
gps_height           0
longitude            0
latitude             0
population           0
construction_year    0
dtype: int64

In [24]:
# numeric data has no missing values

In [25]:
data[categorical].isnull().sum()

installer                3655
basin                       0
region                      0
region_code                 0
district_code               0
lga                         0
ward                        0
public_meeting           3334
scheme_management        3877
permit                   3056
extraction_type             0
extraction_type_group       0
extraction_type_class       0
management                  0
management_group            0
payment_type                0
water_quality               0
quality_group               0
quantity                    0
source                      0
source_type                 0
source_class                0
waterpoint_type             0
waterpoint_type_group       0
status_group                0
dtype: int64

In [26]:
# categorical data has 5 columns with missing values, each will be explored and processed case by case

In [27]:
# exploring correlation for numeric features
numeric_corr = data[numeric].corr()

# table representation
numeric_corr

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,population,construction_year
amount_tsh,1.0,0.068595,0.012664,-0.05267,0.014918,0.036297
gps_height,0.068595,1.0,-0.155917,0.039209,0.074532,0.222107
longitude,0.012664,-0.155917,1.0,-0.285799,0.051254,0.224121
latitude,-0.05267,0.039209,-0.285799,1.0,-0.017276,-0.085974
population,0.014918,0.074532,0.051254,-0.017276,1.0,0.136254
construction_year,0.036297,0.222107,0.224121,-0.085974,0.136254,1.0


In [28]:
# graphic representation of correlation - heatmap
# plt.figure(figsize=(10,10))
# sns.heatmap(numeric_corr)

In [29]:
# we will plot all numeric features hue-ed by categorical features to identify relationships and gain more insight
# let's first check which categorical columns have reasonable number of uniqe values so that our plots are readable

data[categorical].nunique()

installer                2145
basin                       9
region                     21
region_code                27
district_code              20
lga                       125
ward                     2092
public_meeting              2
scheme_management          12
permit                      2
extraction_type            18
extraction_type_group      13
extraction_type_class       7
management                 12
management_group            5
payment_type                7
water_quality               8
quality_group               6
quantity                    5
source                     10
source_type                 7
source_class                3
waterpoint_type             7
waterpoint_type_group       6
status_group                3
dtype: int64

In [30]:
# this step is optional that is why the drop list is empty. Also some categorical features have too many unique values to be pair ploted
# some categorical features have too many uniqe values and makes no sense to be ploted
# those will be dropped from the pairplots
#cat_drop_list = []
#data = data.drop(cat_drop_list, axis=1)

#cat_pair_hue = data[categorical].columns.to_list()

In [31]:
# continues from above - optional
# creating pairplots with seaborn
# merging data and hue columns
#pair_data_cols=cat_pair_hue + numeric
#pair_data_cols

for col in cat_pair_hue:
    sns.set()
    plt.figure()
    sns.pairplot(data[pair_data_cols], hue=col, size=3.0)
    plt.show()

In [32]:
# preprocessing categorical data - missing values and encoding
data[categorical].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 25 columns):
installer                55745 non-null object
basin                    59400 non-null object
region                   59400 non-null object
region_code              59400 non-null int64
district_code            59400 non-null int64
lga                      59400 non-null object
ward                     59400 non-null object
public_meeting           56066 non-null object
scheme_management        55523 non-null object
permit                   56344 non-null object
extraction_type          59400 non-null object
extraction_type_group    59400 non-null object
extraction_type_class    59400 non-null object
management               59400 non-null object
management_group         59400 non-null object
payment_type             59400 non-null object
water_quality            59400 non-null object
quality_group            59400 non-null object
quantity                 59400 non-null object


In [33]:
# filling missing values
data[categorical].isnull().sum()

installer                3655
basin                       0
region                      0
region_code                 0
district_code               0
lga                         0
ward                        0
public_meeting           3334
scheme_management        3877
permit                   3056
extraction_type             0
extraction_type_group       0
extraction_type_class       0
management                  0
management_group            0
payment_type                0
water_quality               0
quality_group               0
quantity                    0
source                      0
source_type                 0
source_class                0
waterpoint_type             0
waterpoint_type_group       0
status_group                0
dtype: int64

In [34]:
# imputing all missing values with 'unknown' category
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='constant', fill_value='unknown')

data[categorical] = pd.DataFrame(imputer.fit_transform(data[categorical]))

In [35]:
# double check if all values were imputed
data[categorical].isnull().sum()

installer                0
basin                    0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
public_meeting           0
scheme_management        0
permit                   0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
source                   0
source_type              0
source_class             0
waterpoint_type          0
waterpoint_type_group    0
status_group             0
dtype: int64

In [36]:
data[categorical].count()

installer                59400
basin                    59400
region                   59400
region_code              59400
district_code            59400
lga                      59400
ward                     59400
public_meeting           59400
scheme_management        59400
permit                   59400
extraction_type          59400
extraction_type_group    59400
extraction_type_class    59400
management               59400
management_group         59400
payment_type             59400
water_quality            59400
quality_group            59400
quantity                 59400
source                   59400
source_type              59400
source_class             59400
waterpoint_type          59400
waterpoint_type_group    59400
status_group             59400
dtype: int64

In [37]:
# let's turn everything to str to be sure the encoding works correctly
data[categorical]=data[categorical].applymap(str)

In [38]:
# encoding with dummy (onehot) values and dropping the label since label will be encoded separately
y = data['status_group']
encoded_cat_data=pd.get_dummies(data[categorical].drop('status_group', axis=1))

In [39]:
data[categorical].nunique()
print(data[categorical].nunique().sum())

4575


In [40]:
X = data.drop(categorical, axis=1).merge(encoded_cat_data, left_index=True, right_index=True)

In [41]:
# encoding the label
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# later we need to inverse_transform labels to have the readable predictions

In [42]:
# let's split data to train and test

In [43]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size = 0.20, train_size=0.80, random_state=1)

In [44]:
# Normalization - fitting
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()

In [45]:
# transforming X_train and X_test
X_train[numeric]=min_max_scaler.fit_transform(X_train[numeric])
X_test[numeric]=min_max_scaler.fit_transform(X_test[numeric])

In [46]:
# time to choose a model, train, evaluate and score
# using sklearn's flowchart for choosing the right estimator, we will try the following 3:
# sklearn.ensemble.GradientBoostingClassifier
# sklearn.neighbors.KNeighborsClassifier
# sklearn.svm.LinearSVC¶

In [47]:
from sklearn.model_selection import GridSearchCV

In [48]:
from sklearn.ensemble import RandomForestClassifier

In [77]:
random_forest_class = RandomForestClassifier(n_jobs=-1, min_samples_split=8)
# random_forest_class_grid = {'min_samples_leaf': []}
# random_forest_class_optimized = GridSearchCV(random_forest_class, random_forest_class_grid, verbose=2)
random_forest_class_optimized.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] min_samples_leaf=2 ..............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................... min_samples_leaf=2, total=   6.5s
[CV] min_samples_leaf=2 ..............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.4s remaining:    0.0s


[CV] ............................... min_samples_leaf=2, total=   5.9s
[CV] min_samples_leaf=2 ..............................................
[CV] ............................... min_samples_leaf=2, total=   5.6s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   18.0s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=8,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             

In [51]:
random_forest_class_optimized.score(X_test, y_test)

0.8141414141414142

In [78]:
random_forest_class_optimized.score(X_test, y_test)

0.7725589225589226

In [76]:
random_forest_class_optimized.best_params_

{'min_samples_leaf': 4}

In [51]:
# very good score of 0.81 {'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 600}

In [58]:
# even better score of 0.8136 {'min_samples_split': 5, 'n_estimators': 600}

In [52]:
from joblib import dump, load

In [54]:
# best optimized n_100
dump(random_forest_class_optimized, 'best_n_100.joblib')

['best_n_100.joblib']

In [55]:
best_n_100 = load('best_n_100.joblib')

In [69]:
best_n_100.best_params_

{'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 100}