# Import data

In [1]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('csv_files/training_data_clean')

In [3]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,6000.0,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,0.0,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,True,GeoData Consultants Ltd,Other,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,25.0,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,0.0,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,0.0,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,VWC,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [5]:
print(len(df))
df["num_private"].value_counts()

59029


0       58272
6          81
1          73
5          46
8          46
32         40
45         36
15         35
39         30
93         28
3          27
7          26
2          23
65         22
47         21
102        20
4          20
17         17
80         15
20         14
25         12
11         11
41         10
34         10
16          8
120         7
150         6
22          6
12          5
24          5
50          5
38          4
58          4
9           4
10          4
14          3
61          3
27          2
26          2
160         1
30          1
698         1
60          1
1402        1
450         1
668         1
131         1
35          1
672         1
42          1
136         1
87          1
300         1
280         1
141         1
62          1
111         1
240         1
1776        1
755         1
180         1
213         1
23          1
55          1
94          1
Name: num_private, dtype: int64

# Create dummy varaibles

In [7]:
for col in df.columns:
    print('______________')
    print(col)
    print('unique values',len(set(df[col])), '\n')
    print(df[col].value_counts().head())
    print('______________')

______________
amount_tsh
unique values 98 

0.0       41268
500.0      3102
50.0       2472
1000.0     1488
20.0       1463
Name: amount_tsh, dtype: int64
______________
______________
funder
unique values 1894 

Government Of Tanzania    12649
Danida                     3114
Hesawa                     2200
Rwssp                      1374
Kkkt                       1286
Name: funder, dtype: int64
______________
______________
gps_height
unique values 2428 

 0     20077
-15       60
-16       55
-13       55
-20       52
Name: gps_height, dtype: int64
______________
______________
installer
unique values 2134 

DWE           21016
Government     1825
RWE            1205
DANIDA         1050
KKKT            898
Name: installer, dtype: int64
______________
______________
longitude
unique values 57145 

0.000000     1812
37.252194       2
37.540901       2
33.010510       2
39.093484       2
Name: longitude, dtype: int64
______________
______________
latitude
unique values 57146 

-2.0000

In [8]:
# >= 8 unique values
large_catagoricals = ['funder', 'installer', 'wpt_name', 'subvillage', 
                      'region', 'region_code', 'district_code', 'lga', 'ward', 
                      'scheme_management', 'extraction_type', 
                      'extraction_type_group', 'management', 'water_quality', 
                      'source', ]
# < 8
small_catagoricals = ['basin', 'public_meeting', 'permit',
                      'extraction_type_class', 'management_group', 'payment',
                      'payment_type', 'quality_group', 'quantity', 'quantity_group', 
                      'source_type', 'source_class', 'waterpoint_type', 
                      'waterpoint_type_group',]

other = ['construction_year', 'year', 'month', 'longitude', 'latitude', 
         'num_private', 'population']

In [9]:
df.drop('recorded_by', axis=1, inplace=True)

In [10]:
def make_dummy(col):
    top_3_values = df[col].value_counts()[:3]
    other_values = df[col].value_counts()[3:]
    
    for name in top_3_values.keys():
        df.loc[df[col] == name, f'{col}: {name}'] =1
        
    for name in other_values.keys():
        df.loc[df[col] == name, f'{col}: other'] =1
    
    df.drop(col, axis=1, inplace=True)

In [11]:
for title in large_catagoricals:
    make_dummy(title)

In [12]:
df.to_csv('csv_files/training_data_dummies')

In [13]:
df = pd.read_csv('csv_files/training_data_dummies')

In [14]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [15]:
df2 = df

In [16]:
for name in small_catagoricals:
    dummies = pd.get_dummies(df2[name], prefix=name)
    df2 = pd.concat([df2, dummies], axis =1)
    df2.drop(name, axis=1, inplace=True)

In [17]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.scatterplot(x='longitude', y='latitude', hue='status_group', data=df2, palette='Paired');
plt.xlim(28, 43)
plt.show()



<Figure size 640x480 with 1 Axes>

In [18]:
df2.loc[(df2['latitude'] >= -6) & (df2['longitude'] <= 35), 'NW'] =1

df2.loc[(df2['latitude'] >= -6) & (df2['longitude'] > 35), 'NE'] =1

df2.loc[(df2['latitude'] < -6) & (df2['longitude'] <= 35), 'SW'] =1

df2.loc[(df2['latitude'] < -6) & (df2['longitude'] > 35), 'SE'] =1

df2.drop(['longitude', 'latitude'], axis=1, inplace=True)

In [19]:
df2 = df2.fillna(0)

In [20]:
set(df2['status_group'])

{'functional', 'functional needs repair', 'non functional'}

In [21]:
df2['status_group'] = df2['status_group'].replace(['functional needs repair'], 2)

In [22]:
df2['status_group'] = df2['status_group'].replace(['functional', ], 1)

In [23]:
df2['status_group'] = df2['status_group'].replace(['non functional'], 0)

In [24]:
df2.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59029 entries, 0 to 59028
Data columns (total 146 columns):
amount_tsh                                     float64
gps_height                                     int64
num_private                                    int64
population                                     int64
construction_year                              int64
status_group                                   int64
funder: Government Of Tanzania                 float64
funder: Danida                                 float64
funder: Hesawa                                 float64
funder: other                                  float64
installer: DWE                                 float64
installer: Government                          float64
installer: RWE                                 float64
installer: other                               float64
wpt_name: none                                 float64
wpt_name: Shuleni                              float64
wpt_name: Zahanati  

# Train Test Split

In [25]:
X = df2.drop(['status_group'], axis = 1)
y = df2['status_group']

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

## Scaled data

In [27]:
from sklearn.preprocessing import MinMaxScaler
mmscaler = MinMaxScaler()


X_train_minmax = mmscaler.fit_transform(X_train)
X_test_minmax = mmscaler.transform(X_test)

# Logistic Regression

## Base Model

In [28]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(fit_intercept=False, C=1e12, multi_class = 'multinomial', solver='lbfgs')
model_log = logreg.fit(X_train_minmax, y_train)
model_log

LogisticRegression(C=1000000000000.0, fit_intercept=False,
                   multi_class='multinomial')

In [29]:
y_hat_test = logreg.predict(X_test_minmax)
y_hat_train = logreg.predict(X_train_minmax)

In [30]:
from sklearn.metrics import roc_auc_score

y_score = logreg.fit(X_train_minmax, y_train).predict_proba(X_test_minmax)

auc = roc_auc_score(y_test, y_score, multi_class = 'ovo')

In [31]:
from sklearn.metrics import classification_report


target_names = ['non functional', 'functional', 'functional needs repair']

print('Test data')
print(classification_report(y_test, y_hat_test, target_names= target_names))
print(f'AUC: {auc}')


Test data
                         precision    recall  f1-score   support

         non functional       0.77      0.60      0.68      5664
             functional       0.70      0.90      0.79      8006
functional needs repair       0.45      0.07      0.12      1088

               accuracy                           0.72     14758
              macro avg       0.64      0.52      0.53     14758
           weighted avg       0.71      0.72      0.70     14758

AUC: 0.7888792974951123


In [32]:
from sklearn.metrics import confusion_matrix


cnf_matrix = confusion_matrix(y_test, y_hat_test)
print('Confusion Matrix:\n', cnf_matrix)

Confusion Matrix:
 [[3417 2211   36]
 [ 782 7172   52]
 [ 215  801   72]]


## Weighted

In [33]:
y_train.value_counts()

1    24048
0    16995
2     3228
Name: status_group, dtype: int64

In [34]:
weights = [None, 'balanced', {1:2, 0:1, 1:10}, {1:10, 0:1, 1:100}, {1:100, 0:1, 1:1000}, {1:1000, 0:1, 1:10000}]
names = ['None', 'Balanced', '2 to 1 to 10', '10 to 1 to 100', '100 to 1 to 1000', '1000 to 1 to 10000']


for n, weight in enumerate(weights):
    # Fit a model
    logreg = LogisticRegression(fit_intercept=False, C=1e20, class_weight=weight, 
                                multi_class= 'multinomial', solver='lbfgs')
    model_log = logreg.fit(X_train_minmax, y_train)
    print(model_log)

    # Predict
    y_hat_test = logreg.predict(X_test_minmax)

    y_score = logreg.fit(X_train_minmax, y_train).predict_proba(X_test_minmax)

    auc = roc_auc_score(y_test, y_score, multi_class = 'ovo')
    
    print(f'AUC for {names[n]}: {auc}')
    print('-------------------------------------------------------------------------------------')

LogisticRegression(C=1e+20, fit_intercept=False, multi_class='multinomial')
AUC for None: 0.7888795889362745
-------------------------------------------------------------------------------------
LogisticRegression(C=1e+20, class_weight='balanced', fit_intercept=False,
                   multi_class='multinomial')
AUC for Balanced: 0.7966830736982177
-------------------------------------------------------------------------------------
LogisticRegression(C=1e+20, class_weight={0: 1, 1: 10}, fit_intercept=False,
                   multi_class='multinomial')
AUC for 2 to 1 to 10: 0.7715455301335149
-------------------------------------------------------------------------------------
LogisticRegression(C=1e+20, class_weight={0: 1, 1: 100}, fit_intercept=False,
                   multi_class='multinomial')
AUC for 10 to 1 to 100: 0.7579327874919971
-------------------------------------------------------------------------------------


KeyboardInterrupt: 

In [None]:
logreg = LogisticRegression(fit_intercept=False, C=1e20, class_weight='balanced', 
                            multi_class= 'multinomial', solver='lbfgs')
model_log = logreg.fit(X_train_minmax, y_train)
print(model_log)

# Predict
y_hat_test = logreg.predict(X_test_minmax)

print(classification_report(y_test, y_hat_test, target_names= target_names))

# Check the AUC for predictions
y_score = knn.fit(X_train_minmax, y_train).predict_proba(X_test_minmax)
roc_auc = roc_auc_score(y_test, y_score, multi_class = 'ovo')
print(f'\nAUC is :{auc}')

## Smote

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_sample(X_train_minmax, y_train) 

In [None]:
print(pd.Series(y_train_resampled).value_counts()) 

In [None]:
C_param_range = [0.005, 0.1, 0.2, 0.5, 0.8, 1, 1.25, 1.5, 2, 1e13]
names = [0.005, 0.1, 0.2, 0.5, 0.8, 1, 1.25, 1.5, 2, 1e13]

for n, c in enumerate(C_param_range):
    # Fit a model
    logreg = LogisticRegression(fit_intercept=False, C=c, multi_class = 'multinomial', solver='lbfgs')
    model_log = logreg.fit(X_train_resampled, y_train_resampled)
    print(model_log) # Preview model params

    # Predict
    y_hat_test = logreg.predict(X_test_minmax)

    y_score = logreg.fit(X_train_resampled, y_train_resampled).predict_proba(X_test_minmax)

    auc = roc_auc_score(y_test, y_score, multi_class = 'ovo')
    
    print(f'AUC for {names[n]}: {auc}')
    print('-------------------------------------------------------')

In [None]:
    logreg = LogisticRegression(fit_intercept=False, C=1e13, multi_class = 'multinomial', solver='lbfgs')
    model_log = logreg.fit(X_train_resampled, y_train_resampled)
    
    y_hat_test = logreg.predict(X_test_minmax)

In [None]:
print('Test data')
print(classification_report(y_test, y_hat_test, target_names= target_names))
print(f'AUC: {auc}')

# Decision Tree

## Initial model

In [None]:
X = df2.drop('status_group',axis=1)
y = df2.status_group

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123)

In [None]:
from sklearn.tree import DecisionTreeClassifier 
clf = DecisionTreeClassifier(criterion='entropy')

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)  

In [None]:
from sklearn.metrics import accuracy_score, roc_curve, auc

# Look at the classification report
print('\n',classification_report(y_test, y_pred, target_names= target_names))

# Check the AUC for predictions
y_score = clf.fit(X_train, y_train).predict_proba(X_test)
auc = roc_auc_score(y_test, y_score, multi_class = 'ovo')
print(f'\nAUC is :{auc}')


# Create and print a confusion matrix 
print('\nConfusion Matrix')
print('----------------')
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
from sklearn import tree

plt.figure(figsize=(12,12), dpi=500)
tree.plot_tree(clf, 
               feature_names=X.columns,
               class_names=np.unique(y).astype('str'),
               filled=True, rounded=True)
plt.show()

In [None]:

def plot_feature_importances(model):
    n_features = 10
    plt.figure(figsize=(8,8))
    plt.barh(range(n_features), np.sort(clf.feature_importances_)[:-11:-1], align='center') 
    plt.yticks(np.arange(n_features), X_train.columns.values) 
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

plot_feature_importances(clf)



## Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

X = df2.drop('status_group',axis=1)
y = df2.status_group

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123)

forest = RandomForestClassifier(n_estimators=100, max_depth= 10)
forest.fit(X_train, y_train)

In [None]:
y_pred = forest.predict(X_test)
print(classification_report(y_test, y_pred, target_names= target_names))

# Check the AUC for predictions
y_score = forest.fit(X_train, y_train).predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_score, multi_class = 'ovo')
print(f'\nAUC is :{auc}')

##  Grid search

In [None]:
from sklearn.model_selection import  GridSearchCV

In [None]:
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_features' : [20, 30, 40, 50],
    'max_depth': [16,17,18,19],
    'min_samples_split': [3,4,5,6],
    'min_samples_leaf': [1, 2, 3]
}

In [None]:
# Instantiate GridSearchCV
dt_clf = DecisionTreeClassifier()
dt_grid_search = GridSearchCV(dt_clf, dt_param_grid, cv=3, return_train_score=True)

# Fit to the data
dt_grid_search.fit(X_train, y_train)

In [None]:
dt_grid_search.best_params_

In [None]:
forest = RandomForestClassifier(criterion='gini',
                           max_features= 50,
                           max_depth=16,
                           min_samples_split=6,
                           min_samples_leaf=3, 
                           random_state=123)
forest.fit(X_train, y_train)

y_pred = forest.predict(X_test)
print(classification_report(y_test, y_pred, target_names= target_names))

#Check the AUC for predictions
y_score = forest.fit(X_train, y_train).predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_score, multi_class = 'ovo')
print(f'\nAUC is :{auc}')

In [None]:
fn = X_train.columns
cn = 'status_group'
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (8,4), dpi=800)
tree.plot_tree(forest.estimators_[3],
               feature_names = fn, 
               class_names=cn,
               filled = True);

In [None]:
plot_feature_importances(forest)

In [None]:
dt = DecisionTreeClassifier(criterion='gini',
                           max_features=50,
                           max_depth=16,
                           min_samples_split=6,
                           min_samples_leaf=2, 
                           random_state=123)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
print(classification_report(y_test, y_pred, target_names= target_names))

# Check the AUC for predictions
y_score = dt.fit(X_train, y_train).predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_score, multi_class = 'ovo')
print(f'\nAUC is :{auc}')

# XGBoost

In [None]:
import xgboost as xgb

In [None]:
clf = xgb.XGBClassifier()
clf.fit(X_train_minmax, y_train)

y_pred = clf.predict(X_test_minmax)
print(classification_report(y_test, y_pred, target_names= target_names))

# Check the AUC for predictions
y_score = clf.fit(X_train_minmax, y_train).predict_proba(X_test_minmax)
roc_auc = roc_auc_score(y_test, y_score, multi_class = 'ovo')
print(f'\nAUC is :{auc}')

In [None]:
param_grid = {
    "learning_rate": [0.8],
    'max_depth': [15, 20, 30, 40],
}

In [None]:
grid_clf = GridSearchCV(clf, param_grid, scoring='accuracy', cv=None, n_jobs=1)
grid_clf.fit(X_train_minmax, y_train)

grid_clf.best_params_

# K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Instantiate KNeighborsClassifier
knn = KNeighborsClassifier()

# Fit the classifier
knn.fit(X_train_minmax, y_train)

# Predict on the test set
y_pred = knn.predict(X_test_minmax)

In [None]:
print(classification_report(y_test, y_pred, target_names= target_names))

# Check the AUC for predictions
y_score = knn.fit(X_train_minmax, y_train).predict_proba(X_test_minmax)
roc_auc = roc_auc_score(y_test, y_score, multi_class = 'ovo')
print(f'\nAUC is :{auc}')

In [None]:
def find_best_k(X_train, y_train, X_test, y_test, min_k=1, max_k=25):
    best_k = 0
    best_score = 0.0
    for k in range(min_k, max_k+1, 2):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        preds = knn.predict(X_test)
        f1 = f1_score(y_test, preds, average = 'micro')
        if f1 > best_score:
            best_k = k
            best_score = f1
    
    print("Best Value for k: {}".format(best_k))
    print("F1-Score: {}".format(best_score))

In [None]:
from sklearn.metrics import f1_score
find_best_k(X_train_minmax, y_train, X_test_minmax, y_test)