In [90]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE

In [20]:
X = pd.read_csv('references/training_set_values.csv', index_col = 'id')
y = pd.read_csv('references/training_set_labels.csv', index_col = 'id')['status_group']
df = X.copy()
df['target'] = y.copy()

In [21]:
df['target'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: target, dtype: int64

In [22]:
# df.drop[col = ['num_private'], inplace = True]

In [23]:
df.head()

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


# Basic FSM

A basic logistic regression model that does not have anything applied to it other than a max_iter to keep errors from popping up.

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.25)


In [25]:
X_train_num = X_train.select_dtypes(exclude = ['object'])
lg = LogisticRegression(max_iter = 1000)
lg.fit(X_train_num, y_train)
cross_val_score(lg, X_train_num, y_train, cv = 5)

array([0.55566779, 0.55185185, 0.54848485, 0.54893378, 0.55252525])

The basic FSM has an r<sup>2</sup> score of around 0.55

In [9]:
X['construction_year'].replace({0: np.nan}, inplace = True)

In [10]:
X.isna().sum()

amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year        20709
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity

In [11]:
# keep nans in construction year; create a new column that marks if it's nan or not
# decision tree

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.25)
X_train_num = X_train.select_dtypes(exclude = ['object'])
X_test_num = X_test.select_dtypes(exclude = ['object'])

In [53]:
si = SimpleImputer()
X_train_num_si = pd.DataFrame(si.fit_transform(X_train_num), index = X_train_num.index, columns = X_train_num.columns)
X_train_num_si['construction_year'].value_counts()

0.0       15463
2010.0     2001
2008.0     1959
2009.0     1892
2000.0     1595
2007.0     1195
2006.0     1103
2003.0      989
2011.0      943
2012.0      839
2004.0      823
2002.0      790
1978.0      787
1995.0      750
1998.0      745
1999.0      744
1990.0      741
2005.0      740
1985.0      685
1980.0      629
1996.0      613
1984.0      600
1982.0      571
1994.0      556
1972.0      515
1974.0      505
1997.0      484
1992.0      482
1993.0      463
2001.0      403
1988.0      398
1983.0      361
1975.0      326
1986.0      324
1970.0      293
1976.0      292
1991.0      236
1989.0      223
1987.0      219
1981.0      172
1977.0      156
1979.0      150
1973.0      142
2013.0      129
1971.0      112
1960.0       79
1963.0       72
1967.0       65
1968.0       57
1969.0       44
1964.0       28
1962.0       24
1961.0       15
1966.0       14
1965.0       14
Name: construction_year, dtype: int64

# Upsampling our minority targets

In [14]:
# df_0 = df[df.target == 'functional']
# df_1 = df[df.target == 'functional needs repair']
# df_2 = df[df.target == 'non functional']

# df_1_upsample = resample(df_1, replace = True, n_samples = 32259, random_state = 42)
# df_2_upsample = resample(df_2, replace = True, n_samples = 32259, random_state = 42)
# df_up = pd.concat([df_0, df_1_upsample, df_2_upsample])

In [15]:
# df['target'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: target, dtype: int64

In [16]:
# df_up['target'].value_counts()

functional needs repair    32259
non functional             32259
functional                 32259
Name: target, dtype: int64

In [17]:
# X_train, X_test, y_train, y_test = train_test_split(df_up.drop(columns = ['target']), df_up['target'], random_state = 42, test_size = 0.25)
# X_train_num = X_train.select_dtypes(exclude = ['object'])
# lg = LogisticRegression(max_iter = 1000)
# lg.fit(X_train_num, y_train)
# cross_val_score(lg, X_train_num, y_train, cv = 5)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


KeyboardInterrupt: 

In [54]:
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_sample(X_train_num, y_train)

In [55]:
X_train_num.shape

(44550, 9)

In [56]:
X_train.shape

(44550, 39)

In [57]:
X_train_resampled.shape

(72483, 9)

In [58]:
y_train.shape

(44550,)

In [59]:
y_train_resampled.shape

(72483,)

In [60]:
y_train.value_counts()

functional                 24161
non functional             17146
functional needs repair     3243
Name: status_group, dtype: int64

In [61]:
y_train_resampled.value_counts()

functional needs repair    24161
non functional             24161
functional                 24161
Name: status_group, dtype: int64

In [66]:
ss = StandardScaler()
X_train_resampeled_ss = ss.fit_transform(X_train_resampled)
X_test_ss = ss.fit_transform(X_test_num)

In [79]:
lg1 = LogisticRegression(max_iter = 1000)
lg1.fit(X_train_num, y_train)
confusion_matrix(y_test, lg1.predict(X_test_ss))

array([[3800, 2282, 2016],
       [ 404,  423,  247],
       [2478, 1956, 1244]], dtype=int64)

In [67]:
knn1 = KNeighborsClassifier(3)
knn1.fit(X_train_resampeled_ss, y_train_resampled)
confusion_matrix(y_test, knn1.predict(X_test_ss))

KNeighborsClassifier(n_neighbors=3)

In [76]:
knn2 = KNeighborsClassifier(5)
knn2.fit(X_train_resampeled_ss, y_train_resampled)
confusion_matrix(y_test, knn2.predict(X_test_ss))

array([[4588, 1192, 2318],
       [ 395,  390,  289],
       [1850,  644, 3184]], dtype=int64)

In [77]:
knn3 = KNeighborsClassifier(7)
knn3.fit(X_train_resampeled_ss, y_train_resampled)
confusion_matrix(y_test, knn3.predict(X_test_ss))

array([[4685, 1240, 2173],
       [ 383,  420,  271],
       [1873,  705, 3100]], dtype=int64)

In [83]:
dt = DecisionTreeClassifier(random_state = 42)
dt.fit(X_train_resampeled_ss, y_train_resampled)
confusion_matrix(y_test, dt.predict(X_test_ss))

array([[2848, 1833, 3417],
       [ 318,  348,  408],
       [1816, 1040, 2822]], dtype=int64)

In [87]:
pipe = Pipeline([('ss', StandardScaler()), ('tree', DecisionTreeClassifier(random_state = 42))])
pipe.fit(X_train_num, y_train)

Pipeline(steps=[('ss', StandardScaler()),
                ('tree', DecisionTreeClassifier(random_state=42))])

In [89]:
pipe.score(X_test_num, y_test)

0.6676094276094277

In [93]:
grid = [{'tree__max_depth': [None, 2, 6, 10],
        'tree__min_samples_split': [5, 10]}]

gridsearch = GridSearchCV(estimator = pipe, param_grid = grid, scoring = 'accuracy', cv = 5)

gridsearch.fit(X_train_num, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('tree',
                                        DecisionTreeClassifier(random_state=42))]),
             param_grid=[{'tree__max_depth': [None, 2, 6, 10],
                          'tree__min_samples_split': [5, 10]}],
             scoring='accuracy')

In [94]:
gridsearch.score(X_train_num, y_train)

0.8772839506172839