# Using "Cleaned" Data
In this notebook, we will start with a dataframe as follows:
* drop duplicates (36 records)
* drop columns with missing values (7 columns)
    * funder
    * installer
    * subvillage
    * public_meeting
    * scheme_management
    * scheme_name
    * permit
* drop rows with zero values for certain features (22,783 records)
    * longitude
    * construction_year
    * population
    * gps_height
* reformat status_group to 0s, 1s
* reformat date_recorded as datetime

The result has 36,581 records (reduced from 59,400) and omits the 7 features noted above. This leaves 34 columns (including `id` and the `status_group` target) before any further feature selection.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
SEED = 10

In [2]:
# Read the feature table and the label table from their CSV files
dfX = pd.read_csv('../data/training_set_values.csv')
dfy = pd.read_csv('../data/training_set_labels.csv')
# Append the label column to the features, side by side (rows are matched on the index)
status_column = dfy['status_group']
df = pd.concat(objs=[dfX, status_column], axis=1)
# Report (rows, columns)
df.shape

(59400, 41)

In [3]:
# Rows count as duplicates when they match on every column except 'id';
# the first occurrence is kept and the later ones are removed.
non_id_columns = df.columns.difference(['id'])
is_duplicate = df.duplicated(subset=non_id_columns)
df.drop(index=df.index[is_duplicate.values], inplace=True)
# Report (rows, columns)
df.shape

(59364, 41)

In [4]:
# Remove every column that contains at least one missing value
columns_with_missing = df.columns[df.isna().any(axis=0)]
df.drop(columns=columns_with_missing, inplace=True)
# Report (rows, columns)
df.shape

(59364, 34)

In [5]:
# A zero in any of these columns is treated as an erroneous placeholder,
# so every row with at least one such zero is removed.
zero_checked_columns = ['longitude', 'construction_year', 'population', 'gps_height']
has_zero = (df[zero_checked_columns] == 0).any(axis=1)
df.drop(index=df.index[has_zero.values], inplace=True)
# Report (rows, columns)
df.shape

(36581, 34)

In [6]:
# Binary target: 1 for 'functional', 0 for every other status
# ('non functional' or 'functional needs repair')
is_functional = df['status_group'] == 'functional'
df['status_group'] = is_functional.astype('int64')
# Parse date_recorded (YYYY-MM-DD strings) into datetime values
df['date_recorded'] = pd.to_datetime(df['date_recorded'], format="%Y-%m-%d")

In [7]:
# Among the object-dtype (categorical) columns, discard those with more than
# 10 distinct values so that one-hot encoding stays manageable.
object_columns = df.select_dtypes(include=['object'])
too_many_levels = (object_columns.nunique() > 10).values
df.drop(columns=list(object_columns.columns[too_many_levels]), inplace=True)
# Report (rows, columns)
df.shape

(36581, 27)

In [8]:
# One-hot encode the remaining categorical columns.
# The dummy dtype is pinned to uint8 (the historical pandas default) because
# pandas >= 2.0 produces bool dummies by default, and the later
# select_dtypes(include=['number']) calls would then silently leave every
# encoded column out of the feature matrix.
df = pd.get_dummies(df, dtype=np.uint8)
# show rows and columns
df.shape

(36581, 105)

# Building Trees Using scikit-learn
This model follows the steps of lab 26.06.

The input is the dataframe prepared above. Every numeric column except `id` and the `status_group` target is used as a feature, so the one-hot encoded dummy columns are included alongside the original numerical features, which are:
* amount_tsh
* gps_height
* longitude
* latitude
* num_private
* region_code
* district_code
* population
* construction_year

In [16]:
# Size of the numeric-only view of the data (still contains 'id' and 'status_group')
numeric_view = df.select_dtypes(include=['number'])
numeric_view.shape

(36581, 104)

In [None]:
# Features: every numeric column except the record identifier and the target.
# Target: the binary status_group column.
numeric_columns = df.select_dtypes(include=['number'])
X = numeric_columns.drop(columns=['id', 'status_group'])
y = df['status_group']

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=SEED,
)

In [None]:
# Baseline decision tree: entropy criterion, otherwise default hyperparameters
# (no depth or leaf-size limits are set)
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=SEED,
)
classifier.fit(X_train, y_train)

In [None]:
# Predicted class label (0 or 1) for every row of the held-out test set
y_pred = classifier.predict(X=X_test)

In [None]:
# Calculate accuracy (as a percentage) from the hard class predictions
acc = accuracy_score(y_test, y_pred) * 100
print('Accuracy is: {0}'.format(acc))

# Check the AUC for predictions.
# A ROC curve needs a continuous score per sample; passing the thresholded
# 0/1 labels collapses the curve to a single operating point. Use the
# predicted probability of class 1 ('functional') instead.
y_score = classifier.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('\nAUC is: {0}'.format(round(roc_auc, 2)))

# Create and print a confusion matrix (hard predictions vs. true labels)
print('\nConfusion Matrix')
print('----------------')
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
# Fraction of test rows whose true label is 1 ('functional')
positive_count = (y_test == 1).sum()
positive_count / y_test.size

# Hyperparameter Tuning and Pruning in Decision Trees
These steps follow 26.08

## maximum tree depth

In [None]:
# Identify the optimal tree depth for given data.
# For each candidate depth a tree is fitted on the training split and its
# AUC is recorded on both the training and the test split.
max_depths = list(range(1, 24))
train_results = []
test_results = []
for max_depth in max_depths:
    dt = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth, random_state=SEED)
    dt.fit(X_train, y_train)
    # AUC is computed from the predicted probability of class 1 rather than
    # from hard 0/1 predictions, which would reduce the ROC curve to one point.
    train_scores = dt.predict_proba(X_train)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_scores)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    # Add auc score to previous train results
    train_results.append(roc_auc)
    test_scores = dt.predict_proba(X_test)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, test_scores)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    # Add auc score to previous test results
    test_results.append(roc_auc)

plt.figure(figsize=(12,6))
plt.plot(max_depths, train_results, 'b', label='Train AUC')
plt.plot(max_depths, test_results, 'r', label='Test AUC')
plt.ylabel('AUC score')
plt.xlabel('Tree depth')
plt.legend()
plt.show()

Stop at depth = 11? (after big spike)

Stop at depth = 6? (first indication of leveling out)

Stop at depth = 9? (after big spike)

## minimum sample split

In [None]:
# Identify the optimal min-samples-split for given data.
# Candidates are fractions of the training set (0.1, 0.2, ..., 1.0); for each
# one a tree is fitted and its AUC recorded on the training and test splits.
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
train_results = []
test_results = []
for min_samples_split in min_samples_splits:
    dt = DecisionTreeClassifier(criterion='entropy', min_samples_split=min_samples_split, random_state=SEED)
    dt.fit(X_train, y_train)
    # AUC is computed from the predicted probability of class 1 rather than
    # from hard 0/1 predictions, which would reduce the ROC curve to one point.
    train_scores = dt.predict_proba(X_train)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_scores)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    train_results.append(roc_auc)
    test_scores = dt.predict_proba(X_test)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, test_scores)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    test_results.append(roc_auc)

plt.figure(figsize=(12,6))
plt.plot(min_samples_splits, train_results, 'b', label='Train AUC')
plt.plot(min_samples_splits, test_results, 'r', label='Test AUC')
# y-axis label added for consistency with the other hyperparameter plots
plt.ylabel('AUC score')
plt.xlabel('Min. Sample splits')
plt.legend()
plt.show()

Stabilizes at 0.2

## minimum sample leafs

In [None]:
# Calculate the optimal value for minimum sample leafs.
# Candidates are fractions of the training set (0.1 ... 0.5); for each one a
# tree is fitted and its AUC recorded on the training and test splits.
min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)
train_results = []
test_results = []
for min_samples_leaf in min_samples_leafs:
    dt = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=min_samples_leaf, random_state=SEED)
    dt.fit(X_train, y_train)
    # AUC is computed from the predicted probability of class 1 rather than
    # from hard 0/1 predictions, which would reduce the ROC curve to one point.
    train_scores = dt.predict_proba(X_train)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_scores)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    train_results.append(roc_auc)
    test_scores = dt.predict_proba(X_test)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, test_scores)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    test_results.append(roc_auc)

plt.figure(figsize=(12,6))
plt.plot(min_samples_leafs, train_results, 'b', label='Train AUC')
plt.plot(min_samples_leafs, test_results, 'r', label='Test AUC')
plt.ylabel('AUC score')
plt.xlabel('Min. Sample Leafs')
plt.legend()
plt.show()

Highest at 0.1?

Stable at 0.2?

## maximum features

In [None]:
# Find the best value for optimal maximum feature size.
# The upper bound is inclusive so that "consider all features" (the
# estimator's default behaviour) is part of the comparison as well.
max_features = list(range(1, X_train.shape[1] + 1))
train_results = []
test_results = []
for max_feature in max_features:
    dt = DecisionTreeClassifier(criterion='entropy', max_features=max_feature, random_state=SEED)
    dt.fit(X_train, y_train)
    # AUC is computed from the predicted probability of class 1 rather than
    # from hard 0/1 predictions, which would reduce the ROC curve to one point.
    train_scores = dt.predict_proba(X_train)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_scores)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    train_results.append(roc_auc)
    test_scores = dt.predict_proba(X_test)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, test_scores)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    test_results.append(roc_auc)

plt.figure(figsize=(12,6))
plt.plot(max_features, train_results, 'b', label='Train AUC')
plt.plot(max_features, test_results, 'r', label='Test AUC')
plt.ylabel('AUC score')
plt.xlabel('max features')
plt.legend()
plt.show()

No clear effect on train set, highest test result at 6.

## retrain

In [None]:
# Train a classifier with the hyperparameter values chosen from the sweeps above.
# NOTE(review): max_features=50 and max_depth=13 do not match the values
# suggested in the notes above (6 for max features; 6, 9 or 11 for depth) --
# confirm which values are intended.
# NOTE(review): the sweeps compare candidates on the test split, so the test
# metrics below are not an independent estimate; consider a validation split.
dt = DecisionTreeClassifier(criterion='entropy',
                           max_features=50,
                           max_depth=13,
                           min_samples_split=0.2,
                           min_samples_leaf=0.2,
                           random_state=SEED)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Calculate accuracy (as a percentage) from the hard class predictions
acc = accuracy_score(y_test, y_pred) * 100
print('Accuracy is: {0}'.format(acc))

# Check the AUC for predictions.
# A ROC curve needs a continuous score per sample; passing the thresholded
# 0/1 labels collapses the curve to a single operating point. Use the
# predicted probability of class 1 ('functional') instead.
y_score = dt.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('\nAUC is: {0}'.format(round(roc_auc, 2)))

# Create and print a confusion matrix (hard predictions vs. true labels)
print('\nConfusion Matrix')
print('----------------')
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)