In [1]:
# Upgrade pip
!pip install --upgrade pip

# Load the data from Kaggle
!pip install kaggle

# Upgrade the version of Seaborn
!pip install -U seaborn

# Install category_encoders
!pip install category_encoders

# Install XGBoost
!pip install xgboost

Requirement already up-to-date: pip in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (19.0.3)
Requirement already up-to-date: seaborn in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (0.9.0)


In [2]:
# Download the data from Kaggle

!kaggle competitions download -c ds1-predictive-modeling-challenge

sample_submission.csv: Skipping, found more recently modified local copy (use --force to force download)
test_features.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
train_labels.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
train_features.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
# Extract the csv files
"""!unzip train_features.csv.zip 
!unzip train_labels.csv.zip 
!unzip test_features.csv.zip"""

'!unzip train_features.csv.zip \n!unzip train_labels.csv.zip \n!unzip test_features.csv.zip'

In [4]:
# Generic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Loading the independent features as X and
# dependent variable as y
nan_values_list = ['Not Known', 'Unknown', 'None', 'Not known', 'not known', 
                   '-', 'unknown', 'Unknown Installer', '##', 'none']

train_features_df = pd.read_csv('train_features.csv', na_values=nan_values_list)
train_labels_df = pd.read_csv('train_labels.csv')

In [6]:
def atleast(row, value_count_series, count=5):
  # Identify items who have funded atleast 5 pumps
  if str(row) == "nan":
    return np.nan
  
  value_count = value_count_series.get(row)
  
  if value_count < count:
    return 0
  else:
    return 1

def character_grouping(row):
  # Reduce the dimension based on 1st character else return *
  if str(row) == "nan":
    return np.nan
  
  if row[0].isalpha():
    return row[0].lower()
  else:
    return "*"
  
def classify_lga(row):
  # Classify lga into Rural, Urban and others
  if str(row) == "nan":
    return np.nan
  
  if row.lower().find('rural'):
    return "rural"
  elif row.lower().find('urban'):
    return "urban"
  else:
    return "other"
  
def prefix_grouping(row, prefix_count=3):
  # Reduce the dimension based on 1st character else return *
  if str(row) == "nan":
    return np.nan
  
  if prefix_count > len(row):
    return "#"
  
  if row[0:prefix_count].isalpha():
    return row[0:prefix_count].lower()
  else:
    return "*"
  
def map_ward_construction_year(input_df):
  # Map ward to construction year
  
  # Here train_features_df shall be used as reference for
  # both trainging and test data set.
  df = input_df.copy()
  
  ward_construction_year_dict = {}
  ward_list = df['ward'].unique()
 
  # top ward's construction year shall be used incase there is no
  # matching construction year for individual ward.  
  top_ward = df['ward'].describe().top
  top_ward_construction_year =  \
    int(df[df['ward'] == top_ward]['construction_year'].median())

  for ward in ward_list:
    ward_construction_year = \
      int(df[df['ward'] == ward]['construction_year'].median())
    
    if ward not in ward_construction_year_dict:\
      
      if ward_construction_year == 0:
        ward_construction_year_dict[ward] = top_ward_construction_year
      else:
        ward_construction_year_dict[ward] = ward_construction_year
  
  return ward_construction_year_dict


def compute_construction_year(row, ward_construction_year_dict, 
                              top_ward_construction_year):
  # compute the consturction year if it is 0  
  ward = row['ward']
  construction_year = row['construction_year']  
  
  if construction_year == 0:
    if ward in ward_construction_year_dict:
      return ward_construction_year_dict[ward]
    else:
      return top_ward_construction_year
  else:
    return construction_year


def compute_age(row):
  # compute the consturction age
  date_recorded = row['date_recorded']
  year_recorded = int(date_recorded.split('-')[0])
  
  construction_year = row['construction_year']
  
  return (year_recorded - construction_year)


def compute_year_recorded(row):
  # split year from date_recorded
  return int(row.split('-')[0])

def compute_month_recorded(row):
  # split year from date_recorded
  return int(row.split('-')[1])

In [7]:
%%time

def feature_engineering(df):
  # Create a column to indicate funder with atleast 5 pumps maintained.
  value_count_funder = df.funder.value_counts()
  df['funder_aleast_5'] = df['funder'].apply(atleast, 
                                            args=(value_count_funder,))
  
  # Create a column to indicate installer with atleast 5 pumps maintained.
  value_count_installer = df.installer.value_counts()
  df['installer_aleast_5'] = df['installer'].apply(atleast, 
                                            args=(value_count_installer,))
  
  # Apply mean for missing values of latitude and longitude
  mean_longitude = df['longitude'].mean()
  df['longitude'] = df['longitude'].apply(lambda x: mean_longitude if round(x, 2) == 0 else x)
  mean_latitude = df['latitude'].mean()
  df['latitude'] = df['latitude'].apply(lambda x: mean_latitude if round(x, 2) == 0 else x)
  
  # Grouping wpt_name, subvillage based on 1st alphabet
  df['wpt_name_character_grouping'] = df['wpt_name'].apply(character_grouping)
  df['subvillage_character_grouping'] = df['subvillage'].apply(character_grouping)
  
  # Classify lga based on Rural, Urban and others
  df['lga_engineered'] = df['lga'].apply(classify_lga)
  
  # Grouping ward, scheme_name based on 1st alphabet
  df['ward_character_grouping'] = df['ward'].apply(character_grouping)
  df['scheme_name_character_grouping'] = df['scheme_name'].apply(character_grouping)
  
  """# Grouping based on prefix
  df['funder_prefix_grouping'] = df['funder'].apply(prefix_grouping)
  df['installer_prefix_grouping'] = df['installer'].apply(prefix_grouping)
  df['wpt_name_prefix_grouping'] = df['wpt_name'].apply(prefix_grouping)
  df['subvillage_prefix_grouping'] = df['subvillage'].apply(prefix_grouping)
  df['lga_prefix_grouping'] = df['lga'].apply(prefix_grouping)
  df['ward_prefix_grouping'] = df['ward'].apply(prefix_grouping)
  df['scheme_name_prefix_grouping'] = df['scheme_name'].apply(prefix_grouping)"""
  
  # Compute missing construction year
  ward_construction_year_dict = map_ward_construction_year(df)
  top_ward = df['ward'].describe().top
  top_ward_construction_year =  \
    int(df[df['ward'] == top_ward]['construction_year'].median())
  
  df['construction_year'] = df.apply(compute_construction_year, axis=1, 
                                     args=(ward_construction_year_dict,
                                          top_ward_construction_year,))
  
  # Compute age of well
  df['age'] = df.apply(compute_age, axis=1)
  
  # Fetch Year and Month of date recorded
  df['year_recorded'] = df['date_recorded'].apply(compute_year_recorded)
  df['month_recorded'] = df['date_recorded'].apply(compute_month_recorded)
  
  for col in df.select_dtypes(include=np.number).columns:
    if df[col].isnull().sum() > 0:
      df[col].fillna(df[col].mean(), inplace=True)

feature_engineering(train_features_df)

CPU times: user 15.3 s, sys: 24.8 ms, total: 15.3 s
Wall time: 15.3 s


In [8]:
# Selecting independent and dependent variables.

X = train_features_df.drop(columns=['id', 'funder', 'installer', 'wpt_name', 
                                    'subvillage', 'lga','ward','scheme_name'])
y = train_labels_df.status_group

In [9]:
pd.set_option('display.max_columns', None)
train_features_df.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,funder_aleast_5,installer_aleast_5,wpt_name_character_grouping,subvillage_character_grouping,lga_engineered,ward_character_grouping,scheme_name_character_grouping,age,year_recorded,month_recorded
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1.0,1.0,,m,rural,m,r,12,2011,3
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1.0,1.0,z,n,rural,n,,3,2013,3
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,1.0,1.0,k,m,rural,n,n,4,2013,2
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,1.0,1.0,z,m,rural,n,,27,2013,1
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,2004,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,0.0,1.0,s,k,rural,n,,7,2011,7


In [10]:
# Split data into train and test using k-fold cross-validation
# with independent test data set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25,
                                                    shuffle=True,
                                                    random_state=42
                                                   )

In [11]:
# Get quick initial metrics estimate.

# Using sklearn accuracy_score
import numpy as np
from sklearn.metrics import accuracy_score

majority_class = y_train.mode()[0]
prediction = np.full(shape=y_train.shape, 
                     fill_value=majority_class)

print(f'accuracy score {accuracy_score(y_train, prediction)}')


# Using simple pandas value counts method
print(y_train.value_counts(normalize=True))

accuracy score 0.542334455667789
functional                 0.542334
non functional             0.384871
functional needs repair    0.072795
Name: status_group, dtype: float64


In [12]:
# Data pre-processing, Feature selection and Model selection.

# Imports for pipeline
from sklearn.pipeline import make_pipeline

import category_encoders as ce
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Create pipeline
pipeline = make_pipeline(\
                         ce.BinaryEncoder(),
                         RobustScaler(),
                         XGBClassifier(learning_rate=0.1, n_estimators=1000,
                                       max_depth=4, min_child_weight=6,
                                       gamma=0, subsample=0.8,
                                       colsample_bytree=0.8,
                                       objective= 'multi:softmax', num_class=3,
                                       scale_pos_weight=1,
                                       seed=42, n_jobs=4))

In [None]:
%%time
# Model validation. 

param_grid = {
    'xgbclassifier__max_depth': range(3, 10, 2),
    'xgbclassifier__min_child_weight': range(1, 6, 2)
}

gridsearch1 = GridSearchCV(pipeline, param_grid=param_grid, cv=3, 
                         scoring='accuracy', verbose=20)

gridsearch1.fit(X_train, y_train)

In [None]:
# Interpret the results.

# Best cross validation score
print('Cross Validation Score:', gridsearch1.best_score_)

# Best parameters which resulted in the best score
print('Best Parameters:', gridsearch1.best_params_)

**Output from 1st Iteration.**

Cross Validation Score: 0.7998653198653198

Best Parameters: {'xgbclassifier__max_depth': 7, 'xgbclassifier__min_child_weight': 1}

[CV]  xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=5, score=0.7981682268166207, total= 3.1min
CPU times: user 5h 18min, sys: 36.5 s, total: 5h 18min 36s
Wall time: 1h 20min 28s

In [13]:
import dask.dataframe as dd
from dask.distributed import Client

client = Client(n_workers=16)
client

0,1
Client  Scheduler: tcp://127.0.0.1:42223  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 16  Cores: 16  Memory: 67.53 GB


In [14]:
# Create pipeline with n_jobs set to -1
pipeline2 = make_pipeline(\
                         ce.BinaryEncoder(),
                         RobustScaler(),
                         XGBClassifier(learning_rate=0.1, n_estimators=1000,
                                       max_depth=4, min_child_weight=6,
                                       gamma=0, subsample=0.8,
                                       colsample_bytree=0.8,
                                       objective= 'multi:softmax', num_class=3,
                                       scale_pos_weight=1,
                                       seed=42, n_jobs=-1))

In [16]:
%%time
# Model validation. 

param_grid2 = {
    'xgbclassifier__max_depth': range(3, 10, 2),
    'xgbclassifier__min_child_weight': range(1, 6, 2)
}

gridsearch2 = GridSearchCV(pipeline2, param_grid=param_grid2, cv=3, 
                         scoring='accuracy', verbose=20)

gridsearch2.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=1 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   47.5s remaining:    0.0s


[CV]  xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=1, score=0.7804861625479765, total=  45.9s
[CV] xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=1 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.6min remaining:    0.0s


[CV]  xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=1, score=0.7805387205387205, total=  46.9s
[CV] xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=1 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.4min remaining:    0.0s


[CV]  xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=1, score=0.7797158057781669, total=  45.5s
[CV] xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=3 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.2min remaining:    0.0s


[CV]  xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=3, score=0.7808228402127803, total=  45.0s
[CV] xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=3 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.0min remaining:    0.0s


[CV]  xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=3, score=0.7835016835016835, total=  46.4s
[CV] xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=3 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  4.7min remaining:    0.0s


[CV]  xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=3, score=0.7796484611758367, total=  45.1s
[CV] xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=5 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  5.5min remaining:    0.0s


[CV]  xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=5, score=0.7800821493502121, total=  45.1s
[CV] xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=5 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  6.3min remaining:    0.0s


[CV]  xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=5, score=0.7835016835016835, total=  45.5s
[CV] xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=5 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  7.1min remaining:    0.0s


[CV]  xgbclassifier__max_depth=3, xgbclassifier__min_child_weight=5, score=0.7790423597548657, total=  48.1s
[CV] xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=1 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  8.4min remaining:    0.0s


[CV]  xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=1, score=0.7973873813211232, total= 1.2min
[CV] xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=1 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:  9.6min remaining:    0.0s


[CV]  xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=1, score=0.7981144781144781, total= 1.2min
[CV] xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=1 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 10.8min remaining:    0.0s


[CV]  xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=1, score=0.7975621253956495, total= 1.2min
[CV] xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=3 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed: 12.0min remaining:    0.0s


[CV]  xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=3, score=0.8002828092384351, total= 1.2min
[CV] xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=3 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed: 13.3min remaining:    0.0s


[CV]  xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=3, score=0.7958249158249159, total= 1.2min
[CV] xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=3 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 14.5min remaining:    0.0s


[CV]  xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=3, score=0.7997844972725436, total= 1.2min
[CV] xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=5 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed: 15.7min remaining:    0.0s


[CV]  xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=5, score=0.7996094539088276, total= 1.2min
[CV] xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=5 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed: 16.9min remaining:    0.0s


[CV]  xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=5, score=0.7987878787878788, total= 1.2min
[CV] xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=5 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed: 18.1min remaining:    0.0s


[CV]  xgbclassifier__max_depth=5, xgbclassifier__min_child_weight=5, score=0.7993130850562328, total= 1.2min
[CV] xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=1 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed: 19.8min remaining:    0.0s


[CV]  xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=1, score=0.7993401117769847, total= 1.7min
[CV] xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=1 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=1, score=0.8006734006734006, total= 1.6min
[CV] xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=1 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=1, score=0.7995824634655533, total= 1.7min
[CV] xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=3 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=3, score=0.7985320853814558, total= 1.6min
[CV] xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=3 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=3, score=0.7985858585858586, total= 1.7min
[CV] xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=3 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=3, score=0.8000538756818641, total= 1.6min
[CV] xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=5 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=5, score=0.7979934011177698, total= 1.6min
[CV] xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=5 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=5, score=0.7992592592592592, total= 1.7min
[CV] xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=5 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=5, score=0.7990437066469123, total= 1.6min
[CV] xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=1 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=1, score=0.7958386640630261, total= 2.2min
[CV] xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=1 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=1, score=0.7987878787878788, total= 2.2min
[CV] xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=1 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=1, score=0.7990437066469123, total= 2.2min
[CV] xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=3 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=3, score=0.7975220523870447, total= 2.1min
[CV] xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=3 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=3, score=0.7968350168350168, total= 2.2min
[CV] xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=3 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=3, score=0.797292746986329, total= 2.1min
[CV] xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=5 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=5, score=0.7949633021345364, total= 2.1min
[CV] xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=5 ...


  if diff:
  if diff:


[CV]  xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=5, score=0.8006734006734006, total= 2.1min
[CV] xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=5 ...


  if diff:
  if diff:
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 53.1min finished


[CV]  xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=5, score=0.7981682268166207, total= 2.1min
CPU times: user 13h 39min 33s, sys: 16min 58s, total: 13h 56min 32s
Wall time: 55min 34s


In [17]:
# Interpret the results.

# Best cross validation score
print('Cross Validation Score:', gridsearch2.best_score_)

# Best parameters which resulted in the best score
print('Best Parameters:', gridsearch2.best_params_)

Cross Validation Score: 0.7998653198653198
Best Parameters: {'xgbclassifier__max_depth': 7, 'xgbclassifier__min_child_weight': 1}


**Output from 2nd Iteration using Dask.**

Cross Validation Score: 0.7998653198653198

Best Parameters: {'xgbclassifier__max_depth': 7, 'xgbclassifier__min_child_weight': 1}

[CV]  xgbclassifier__max_depth=9, xgbclassifier__min_child_weight=5, score=0.7981682268166207, total= 2.1min
CPU times: user 13h 39min 33s, sys: 16min 58s, total: 13h 56min 32s
Wall time: 55min 34s


In [19]:
test_features_df = pd.read_csv('test_features.csv', na_values=nan_values_list)
feature_engineering(test_features_df)

X_submission = test_features_df.drop(columns =['id', 'funder', 'installer', 'wpt_name', 'subvillage', 
                                    'lga','ward','scheme_name'])

# Predict with X_submission features
y_submission = gridsearch2.predict(X_submission)

y_submission_df = pd.DataFrame(y_submission, columns=['status_group'])
output_for_submission = test_features_df.join(y_submission_df).loc[:, ['id','status_group']]

  if diff:


In [20]:
output_for_submission.head()

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [21]:
print(output_for_submission.status_group.value_counts())
print(output_for_submission.shape)

functional                 8545
non functional             5177
functional needs repair     636
Name: status_group, dtype: int64
(14358, 2)
