In [None]:
def plot_distribution(df, col, title, bins='auto'):
    
    fig, ax_hist = plt.subplots(figsize=(10, 6))

    sns.histplot(data=df, x=col, ax=ax_hist, color='#037bfc', edgecolor='black')
    plt.suptitle(title)
    plt.tight_layout()
    
plot_distribution(df, 'gps_height', 'Altitude Column Distribution')

### Gradient Boosting 

Boosting Algorithms are also known as weak learners , they work by training a single weak learner, figure out which examples the weak learner got wrong, build another weak learner that focuses on the areas the first weak learner got wrong, continue this process untill a predetermined stopping condition is met, such as until a set number of weak learners have been created, or the the models performance plateaued. In this way, each new weak learner is specifically tuned to focus on the weak points of the previous weak learner(s). 

pipe = Pipeline([('gbc', GradientBoostingClassifier())])

# create a grid parameter

param_grid = {'gbc__learning_rate': [0.075,0.07],
                'gbc__max_depth': [6,7],
                'gbc__min_samples_leaf': [7,8],
                'gbc__max_features': [1.0],
                'gbc__n_estimators':[100, 200]}
                
gbc = GridSearchCV(estimator=pipe,
                    param_grid=param_grid,
                    n_jobs=-1)


# fit training data using grid search

gbc.fit(X_train, y_train)

# predict testing data using grid search

y_pred_1 = gbc.predict(X_test)

In [22]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

In [23]:
data = pd.read_csv(r'C:\Users\user\Documents\Tanzania Water Wells\clean_data.csv')
data.head()

Unnamed: 0,id,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,region,population,public_meeting,construction_year,extraction_type_group,management,payment_type,water_quality,quantity,source_type,waterpoint_type,status_group
0,69572,6000.0,Roman,1390.0,Roman,34.938093,-9.856322,Lake Nyasa,Iringa,109.0,True,1999,gravity,vwc,annually,soft,enough,spring,communal standpipe,functional
1,34310,25.0,Lottery Club,686.0,World vision,37.460664,-3.821329,Pangani,Manyara,250.0,True,2009,gravity,vwc,per bucket,soft,enough,dam,communal standpipe multiple,functional
2,67743,0.0,Unicef,263.0,UNICEF,38.486161,-11.155298,Ruvuma / Southern Coast,Mtwara,58.0,True,1986,submersible,vwc,never pay,soft,dry,borehole,communal standpipe multiple,non functional
3,19728,0.0,Action In A,0.0,Artisan,31.130847,-1.825359,Lake Victoria,Kagera,0.0,True,0,gravity,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,functional
4,9944,20.0,Mkinga Distric Coun,0.0,DWE,39.172796,-4.765587,Pangani,Tanga,1.0,True,2009,submersible,vwc,per bucket,salty,enough,other,communal standpipe multiple,functional


In [24]:
data['status_group'].value_counts()

functional                 28643
non functional             20188
functional needs repair     3729
Name: status_group, dtype: int64

In [25]:
data['status_group'].value_counts().sum()

52560

Pick 6k samples from functional and 4k samples from non functional and add them to functional but needs repair

Make sure the positions are still the same, functional has the most, non functional second and functional needs repair the least .

Total should still be the same 

In [26]:
functional_adjusted = (data['status_group'] == 'functional').sum() - 6000
non_functional_adjusted = (data['status_group'] == 'non functional').sum() - 4000
functional_needs_repair_adjusted = (data['status_group'] == 'functional needs repair').sum() + 10000

In [27]:
replacement_map = {
                    'functional' : 'functional_adjusted',
                    'non functional' : 'non_functional_adjusted',
                    'functional needs repair' : 'functional_needs_repair_adjusted' }

In [28]:
data['status_group'] = data['status_group'].map(replacement_map)

In [29]:
data['status_group'].value_counts()

functional_adjusted                 28643
non_functional_adjusted             20188
functional_needs_repair_adjusted     3729
Name: status_group, dtype: int64

In [30]:
data.loc[data['status_group'] == 'functional', 'status_group'] = 'functional_adjusted'
data.loc[data['status_group'] == 'non functional', 'status_group'] = 'non functional_adjusted'
data.loc[data['status_group'] == 'functional needs repair', 'status_group'] = 'functional needs repair_adjusted'

# Subtract 6000 from the count of 'functional_adjusted'
data.loc[data['status_group'] == 'functional_adjusted', 'status_group'] = 'functional_adjusted_minus_6000'

# Subtract 4000 from the count of 'non functional_adjusted'
data.loc[data['status_group'] == 'non functional_adjusted', 'status_group'] = 'non functional_adjusted_minus_4000'

# Add 10000 to the count of 'functional needs repair_adjusted'
data.loc[data['status_group'] == 'functional needs repair_adjusted', 'status_group'] = 'functional needs repair_adjusted_plus_10000'


In [31]:
data['status_group'].value_counts()

functional_adjusted_minus_6000      28643
non_functional_adjusted             20188
functional_needs_repair_adjusted     3729
Name: status_group, dtype: int64