# **PREDICTING WIND TURBINE FAILURES**

kaggle dataset: https://www.kaggle.com/datasets/mariyamalshatta/renewind

True positives (TP) are failures correctly predicted by the model.

False negatives (FN) are real failures in a generator where there is no detection by model.

False positives (FP) are failure detections in a generator where there is no failure.

# import libraries & datasets 

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

> RandomizedSearchCV already performs internal cross-validation — it handles the train/validation splitting for you.

In [41]:
df_train = pd.read_csv('Train.csv')
df_test = pd.read_csv('Test.csv')

# data preprocessing

In [42]:
df_train['Target'].value_counts()

0    18890
1     1110
Name: Target, dtype: int64

In [43]:
df_train.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V32,V33,V34,V35,V36,V37,V38,V39,V40,Target
count,19982.0,19982.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,...,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,-0.271996,0.44043,2.484699,-0.083152,-0.053752,-0.995443,-0.879325,-0.548195,-0.016808,-0.012998,...,0.303799,0.049825,-0.462702,2.22962,1.514809,0.011316,-0.344025,0.890653,-0.87563,0.0555
std,3.441625,3.150784,3.388963,3.431595,2.104801,2.04097,1.761626,3.295756,2.160568,2.193201,...,5.5004,3.575285,3.183841,2.937102,3.80086,1.788165,3.948147,1.753054,3.012155,0.228959
min,-11.876451,-12.319951,-10.708139,-15.082052,-8.603361,-10.227147,-7.949681,-15.657561,-8.596313,-9.853957,...,-19.876502,-16.898353,-17.985094,-15.349803,-14.833178,-5.47835,-17.375002,-6.43888,-11.023935,0.0
25%,-2.737146,-1.640674,0.20686,-2.34766,-1.535607,-2.347238,-2.030926,-2.642665,-1.494973,-1.411212,...,-3.420469,-2.242857,-2.136984,0.336191,-0.943809,-1.255819,-2.987638,-0.27225,-2.940193,0.0
50%,-0.747917,0.471536,2.255786,-0.135241,-0.101952,-1.000515,-0.917179,-0.389085,-0.067597,0.100973,...,0.052073,-0.066249,-0.255008,2.098633,1.566526,-0.128435,-0.316849,0.919261,-0.920806,0.0
75%,1.840112,2.543967,4.566165,2.130615,1.34048,0.38033,0.223695,1.722965,1.409203,1.477045,...,3.761722,2.255134,1.436935,4.064358,3.983939,1.175533,2.279399,2.05754,1.119897,0.0
max,15.493002,13.089269,17.090919,13.236381,8.133797,6.975847,8.006091,11.679495,8.13758,8.108472,...,23.633187,16.692486,14.358213,15.291065,19.329576,7.467006,15.289923,7.759877,10.654265,1.0


In [44]:
df_test.isnull().sum()

V1        5
V2        6
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
V29       0
V30       0
V31       0
V32       0
V33       0
V34       0
V35       0
V36       0
V37       0
V38       0
V39       0
V40       0
Target    0
dtype: int64

In [45]:
df_train.isnull().sum()

V1        18
V2        18
V3         0
V4         0
V5         0
V6         0
V7         0
V8         0
V9         0
V10        0
V11        0
V12        0
V13        0
V14        0
V15        0
V16        0
V17        0
V18        0
V19        0
V20        0
V21        0
V22        0
V23        0
V24        0
V25        0
V26        0
V27        0
V28        0
V29        0
V30        0
V31        0
V32        0
V33        0
V34        0
V35        0
V36        0
V37        0
V38        0
V39        0
V40        0
Target     0
dtype: int64

In [46]:
df_train.fillna(df_train['V1'].mean(), inplace=True)
df_train.fillna(df_train['V2'].mean(), inplace=True)

df_test.fillna(df_test['V1'].mean(), inplace=True)
df_test.fillna(df_test['V2'].mean(), inplace=True)

# model implmentation - balanced dataset

In [47]:
os_models ={
    'dtree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'classifier__criterion': ['gini','entropy'],
            'classifier__max_depth': [3,5,10]
        }
    },
    'gaussian': {
        'model': GaussianNB(),
        'params': {
            'classifier__var_smoothing': np.logspace(0,-9,num=100)
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'classifier__n_estimators': [15,50],
            'classifier__max_depth': [None, 5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'classifier__C': [1,5,10]
        }
    },
    'xgb': {
        'model': xgb.XGBClassifier(eval_metric='logloss'),
        'params': {
            'classifier__n_estimators': [50,100],
            'classifier__max_depth': [3,5],
            'classifier__learning_rate': [0.01,0.1]
        }
    }
}

In [48]:
X_train= df_train.drop('Target', axis=1)
y_train= df_train['Target']

X_test= df_test.drop('Target', axis=1)
y_test= df_test['Target']

In [49]:
os_scores =[]
for os_model, os_mp in os_models.items():
    try:
        print(f'Training model: {os_model}')
        pipeline = Pipeline([
            ('smote', SMOTE(random_state=42)), # hence not using random state in above models params
            ('classifier', os_mp['model'])
        ])
        
        rscv = RandomizedSearchCV(
            pipeline,
            param_distributions= os_mp['params'],
            random_state=42,
            cv=3,
            n_jobs=-1,
            scoring= 'f1_macro',
            n_iter=10
        )
    
        rscv.fit(X_train, y_train)
    
        os_scores.append({
            'model': os_model,
            'best_score': rscv.best_score_,
            'bes_param': rscv.best_params_,
            'best_estimator': rscv.best_estimator_
        })
    except Exception as e:
        print(e)


Training model: dtree
Training model: gaussian
Training model: random_forest
Training model: logistic_regression
Training model: xgb


In [50]:
result_os_df= pd.DataFrame(os_scores)
result_os_df

Unnamed: 0,model,best_score,bes_param,best_estimator
0,dtree,0.817477,"{'classifier__max_depth': 10, 'classifier__cri...","(SMOTE(random_state=42), DecisionTreeClassifie..."
1,gaussian,0.647386,{'classifier__var_smoothing': 0.00028480358684...,"(SMOTE(random_state=42), GaussianNB(var_smooth..."
2,random_forest,0.941501,"{'classifier__n_estimators': 50, 'classifier__...","(SMOTE(random_state=42), (DecisionTreeClassifi..."
3,logistic_regression,0.681303,{'classifier__C': 1},"(SMOTE(random_state=42), LogisticRegression(C=..."
4,xgb,0.916922,"{'classifier__n_estimators': 100, 'classifier_...","(SMOTE(random_state=42), XGBClassifier(base_sc..."


In [51]:
os_scores

[{'model': 'dtree',
  'best_score': 0.8174769066682656,
  'bes_param': {'classifier__max_depth': 10,
   'classifier__criterion': 'entropy'},
  'best_estimator': Pipeline(steps=[('smote', SMOTE(random_state=42)),
                  ('classifier',
                   DecisionTreeClassifier(criterion='entropy', max_depth=10))])},
 {'model': 'gaussian',
  'best_score': 0.6473861288007832,
  'bes_param': {'classifier__var_smoothing': 0.0002848035868435802},
  'best_estimator': Pipeline(steps=[('smote', SMOTE(random_state=42)),
                  ('classifier',
                   GaussianNB(var_smoothing=0.0002848035868435802))])},
 {'model': 'random_forest',
  'best_score': 0.941501106196479,
  'bes_param': {'classifier__n_estimators': 50, 'classifier__max_depth': None},
  'best_estimator': Pipeline(steps=[('smote', SMOTE(random_state=42)),
                  ('classifier', RandomForestClassifier(n_estimators=50))])},
 {'model': 'logistic_regression',
  'best_score': 0.6813028505024228,
  'bes_

In [54]:
# predicting on balanced dataset

# getting best estimator (full pipeline) from result_os_df
best_os_model = result_os_df.loc[result_os_df['model'] == 'random_forest','best_estimator'].values[0]

# predict on X_test (no SMOTE here, as the pipeline handles it during training only)
y_pred = best_os_model.predict(X_test)

# performance metrics 

In [55]:
print('Confusion matrix:\n',confusion_matrix(y_test,y_pred))

Confusion matrix:
 [[4706   12]
 [  43  239]]


In [56]:
print('classification report:\n', classification_report(y_test,y_pred))

classification report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      4718
           1       0.95      0.85      0.90       282

    accuracy                           0.99      5000
   macro avg       0.97      0.92      0.95      5000
weighted avg       0.99      0.99      0.99      5000

