In [57]:
import pandas as pd
import joblib as jb 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import yaml
from tqdm import tqdm_notebook
from scipy.stats import norm, boxcox
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [58]:
with open('A:/projectwater/config/config.yaml') as file:
    config = yaml.safe_load(file)
config

{'data_raw': 'A:/projectwater/data/raw/water_potability.csv',
 'data_final': 'A:/projectwater/data/process/ori_new.pkl',
 'path_train': ['A:/projectwater/data/process/x_train.pkl',
  'A:/projectwater/data/process/y_train.pkl'],
 'path_valid': ['A:/projectwater/data/process/x_valid.pkl',
  'A:/projectwater/data/process/y_valid.pkl'],
 'path_test': ['A:/projectwater/data/process/x_test.pkl',
  'A:/projectwater/data/process/y_test.pkl'],
 'path_train_feat': ['A:/projectwater/data/process/x_train_feat.pkl',
  'A:/projectwater/data/process/y_train_feat.pkl'],
 'path_valid_feat': ['A:/projectwater/data/process/x_valid_feat.pkl',
  'A:/projectwater/data/process/y_valid_feat.pkl'],
 'path_test_feat': ['A:/projectwater/data/process/x_test_feat.pkl',
  'A:/projectwater/data/process/y_test_feat.pkl'],
 'final_model_path': 'A:/projectwater/models/production_model.pkl',
 'training_log_path': 'A:/projectwater/log/training_log.json',
 'new_cols': ['ph',
  'hardness',
  'solids',
  'chloramines',
  's

In [59]:
dt_water = pd.read_csv('A:/projectwater/data/raw/water_potability.csv')
dt_water

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


In [60]:
dt_water.isnull().sum().sort_values()

Hardness             0
Solids               0
Chloramines          0
Conductivity         0
Organic_carbon       0
Turbidity            0
Potability           0
Trihalomethanes    162
ph                 491
Sulfate            781
dtype: int64

In [61]:
cols = ['Sulfate','ph','Trihalomethanes']
for col in cols: 
    mean = dt_water[col].mean()
    dt_water[col].fillna(mean, inplace=True)

In [62]:
wt_new = dt_water.copy()
wt_new.columns = config['new_cols']
wt_new

Unnamed: 0,ph,hardness,solids,chloramines,sulfate,conductivity,organic_carbon,trihalomethes,turbidity,potability
0,7.080795,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,333.775777,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,333.775777,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,333.775777,392.449580,19.903225,66.396293,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,333.775777,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,333.775777,402.883113,11.168946,77.488213,4.708658,1


LIAT IMBALANCE DATA ATAU LIAT SKEWNESS

In [71]:
X = wt_new.iloc[:,:9].values
y = wt_new['potability'].values

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [73]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [75]:
jb.dump(scaler, 'A:/projectwater/models/scalers.save')

['A:/projectwater/models/scalers.save']

In [74]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

In [76]:
y_pred = model.predict(X_test_scaled)
y_pred

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

In [77]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.69      0.88      0.77       402
           1       0.66      0.37      0.47       254

    accuracy                           0.68       656
   macro avg       0.68      0.62      0.62       656
weighted avg       0.68      0.68      0.66       656



In [70]:
jb.dump(model, 'A:/projectwater/models/modelss.pkl')

['A:/projectwater/models/modelss.pkl']