In [1]:
# Import all the required libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import re
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import the random forest classifier - This is a classification problem and I think this is the best model for prediction
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn import compose

In [3]:
catg_feature_names = [
    'funder',
    'basin',
    'region',                    
    'district_code',
    'lga',
    'scheme_management',
    'extraction_type',
    'extraction_type_class',
    'management',
    'payment_type',
    'water_quality',
    'quantity',
    'source',     
    'source_class',
#    'waterpoint_type_group'
]

num_feature_names = [
#    'amount_tsh',
#    'gps_height',
    'longitude',
    'latitude',
#    'population',
    'construction_year',
    'pump_age'
]

date_feature_names = ['date_recorded']

label_names = ['status_group']

In [4]:
# Load the cleaned dataset and view the first few rows
df = pd.read_csv('data/cleaned_df_v2.csv', usecols= date_feature_names + catg_feature_names + num_feature_names + label_names)
#df['date_recorded'] = pd.to_datetime(df['date_recorded'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date_recorded          59400 non-null  object 
 1   funder                 59400 non-null  object 
 2   longitude              59400 non-null  float64
 3   latitude               59400 non-null  float64
 4   basin                  59400 non-null  object 
 5   region                 59400 non-null  object 
 6   district_code          59400 non-null  int64  
 7   lga                    59400 non-null  object 
 8   scheme_management      59400 non-null  object 
 9   construction_year      59400 non-null  float64
 10  extraction_type        59400 non-null  object 
 11  extraction_type_class  59400 non-null  object 
 12  management             59400 non-null  object 
 13  payment_type           59400 non-null  object 
 14  water_quality          59400 non-null  object 
 15  qu

In [8]:
# Adding date features...
df['year_rec'] = df['date_recorded'].apply(lambda x: pd.to_datetime(x).date().isocalendar()[0])
df['week_rec'] = df['date_recorded'].apply(lambda x: pd.to_datetime(x).date().isocalendar()[1])
df['day_rec'] = df['date_recorded'].apply(lambda x: pd.to_datetime(x).date().isocalendar()[2])


In [9]:
# Transform and combine categorical and numerical features:
catg_transformer = compose.ColumnTransformer(
    transformers=
    [
     ('catg', preprocessing.OneHotEncoder(), catg_feature_names),
    ],
    remainder='drop',
    sparse_threshold=0.0,
    n_jobs=None,
    transformer_weights=None,
    verbose=False
).fit(df.loc[:,catg_feature_names])

In [10]:
def get_name_and_levels(onehot_encoder: compose.ColumnTransformer , catg_names):
    """
    Gets 'transformed catg feature' names from compose.ColumnTransformer object and catg feature names.

    """
    onehot_feature_names = onehot_encoder.get_feature_names()

    f_name_and_level = [catg_names[int(re.search(r'\d+', elm).group())]
                        + '-' + elm.split('__', 1)[-1].split('_', 1)[-1]
                        for elm in onehot_feature_names
                        ]
    return f_name_and_level


catg_features_trans = catg_transformer.transform(df.loc[:, catg_feature_names])
catg_feature_trans_names = get_name_and_levels(catg_transformer, catg_feature_names)
num_features = df.loc[:, num_feature_names]
date_features = df.loc[:, date_feature_names]
# num_feature_trans_names = None

X_trans = np.concatenate([catg_features_trans, num_features, date_features], axis=1)
X_trans_names = catg_feature_trans_names + num_feature_names + date_feature_names
y = df.loc[:,label_names].values

In [11]:
# Training Decision Tree Classifier Model
dt_model = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=15,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    class_weight='balanced',
    presort=False)

X_trans_train, X_trans_test, y_train, y_test = train_test_split(X_trans, y, test_size=0.1, random_state=20) # random_state =20

dt_model.fit(X_trans_train, y_train)
print('trained')
y_pred = dt_model.predict(X_trans_test)
y_actual = y_test

ValueError: could not convert string to float: '2011-03-16'

In [None]:
df.info()

In [None]:
# Defining X and y columns to use with our models
X = df.drop('status_group', axis=1)
#X.columns
y = df['status_group']

# Using train_test_split to split our dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

In [None]:
rf = RandomForestClassifier(n_estimators= 1000, criterion='gini')
rf.fit(X_train, y_train)