DATA PROCESSING

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,Wellhead_Temp. (C),Wellhead_Press (psi),MMCFD- gas,BOPD (barrel of oil produced per day),BWPD (barrel of water produced per day),BSW basic solid and water(%),CO2 mol. (%) @ 25C & 1 Atm,Gas Grav,CR-corrosion defect,leak_status
0,51.088486,1364.601753,6.784434,404.673699,293.125995,5.265676,1.038911,0.601146,0.144365,1.0
1,51.096512,1474.635955,11.280674,364.764774,208.47237,5.331766,0.961022,0.618705,0.135026,0.0
2,57.631303,1233.116598,8.786931,414.652564,228.530184,5.69627,1.295514,0.712029,0.154846,1.0
3,55.413455,1564.958133,10.793411,511.357422,161.711604,6.073352,0.801719,0.640946,0.108198,0.0
4,36.821261,1500.806676,12.748623,537.593023,185.303039,4.867634,1.049567,0.644026,0.114479,0.0


In [14]:
print(data.isnull().sum())

Wellhead_Temp. (C)                         0
Wellhead_Press (psi)                       0
MMCFD- gas                                 0
BOPD (barrel of oil produced per day)      0
BWPD (barrel of water produced per day)    0
BSW basic solid and water(%)               0
CO2 mol. (%) @ 25C & 1 Atm                 0
Gas Grav                                   0
CR-corrosion defect                        0
leak_status                                0
dtype: int64


FEATURES AND TARGET

In [15]:
X = data.drop('leak_status', axis=1)
y = data['leak_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print('done')

done


PARAMETER TUNING

In [16]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint


param_dist = {
    'n_estimators': randint(10, 200),
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'bootstrap': [True, False]
}


rf_model = RandomForestClassifier()

random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=100, cv=5, verbose=2, n_jobs=-1, random_state=42)

random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END bootstrap=True, max_depth=7, min_samples_leaf=19, min_samples_split=12, n_estimators=97; total time=   1.5s
[CV] END bootstrap=True, max_depth=7, min_samples_leaf=19, min_samples_split=12, n_estimators=97; total time=   1.7s
[CV] END bootstrap=True, max_depth=7, min_samples_leaf=19, min_samples_split=12, n_estimators=97; total time=   1.7s
[CV] END bootstrap=True, max_depth=7, min_samples_leaf=19, min_samples_split=12, n_estimators=97; total time=   1.6s
[CV] END bootstrap=True, max_depth=7, min_samples_leaf=19, min_samples_split=12, n_estimators=97; total time=   1.6s
[CV] END bootstrap=True, max_depth=4, min_samples_leaf=8, min_samples_split=4, n_estimators=159; total time=   1.8s
[CV] END bootstrap=True, max_depth=15, min_samples_leaf=11, min_samples_split=9, n_estimators=198; total time=   3.8s
[CV] END bootstrap=True, max_depth=15, min_samples_leaf=11, min_samples_split=9, n_estimators=198; total time=   3.9s


In [17]:
best_params = random_search.best_params_
print("Best Parameters:", best_params)
print("Best Score:", random_search.best_score_)

Best Parameters: {'bootstrap': True, 'max_depth': 14, 'min_samples_leaf': 2, 'min_samples_split': 19, 'n_estimators': 44}
Best Score: 0.8348571428571429


MODEL TRAINNING

In [18]:
model = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    bootstrap=best_params['bootstrap']
)
model.fit(X_train, y_train)

SAVE MODEL

In [19]:
import pickle

with open('rf_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print('done')

done


MODEL EVALUATION

In [25]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

with open('rf_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

y_pred = loaded_model.predict(X_test)
y_pred_proba = loaded_model.predict_proba(X_test)

CONFUSION MATRIX

In [21]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[1476   60    0    0]
 [ 137  384   68    0]
 [   4   78  469   42]
 [   0    0   67  215]]


CLASSIFICATION REPORT

In [26]:
class_report = classification_report(y_test, y_pred, target_names=['Normal', 'Minor Leak', 'Moderate Leak', 'Severe Leak'])
print("Classification Report:")
print(class_report)

Classification Report:
               precision    recall  f1-score   support

       Normal       0.91      0.96      0.94      1536
   Minor Leak       0.74      0.65      0.69       589
Moderate Leak       0.78      0.79      0.78       593
  Severe Leak       0.84      0.76      0.80       282

     accuracy                           0.85      3000
    macro avg       0.82      0.79      0.80      3000
 weighted avg       0.84      0.85      0.84      3000



ROC-AUC SCORE

In [23]:

roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
print("ROC-AUC Score:", roc_auc)

ROC-AUC Score: 0.9610537219617605
