In [282]:
# packages
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier








In [283]:
# Load the training and validation splits from CSV.
train = pd.read_csv('train.csv')
val = pd.read_csv('val_df.csv')

# Separate the target column ('fail') from the remaining feature columns.
y_train = train['fail']                 # Target (label)
X_train = train.drop(columns='fail')    # Features (input)

y_val = val['fail']
X_val = val.drop(columns='fail')

In [284]:
# Standardise the features so every column has mean 0 and standard deviation 1.
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_val_scaled = (
    scaler.transform(split) for split in (X_train, X_val)
)

## Baseline using logistic regression

In [285]:
# Baseline: logistic regression with default settings, trained on the
# standardised training features.
baseline = LogisticRegression().fit(X_train_scaled, y_train)

# Predict on the standardised validation features and report F1 and accuracy.
y_pred = baseline.predict(X_val_scaled)
score, acc = f1_score(y_val, y_pred), accuracy_score(y_val, y_pred)
print(f'score {score}', f'acc {acc}', sep='\n')

score 0.8888888888888888
acc 0.8829787234042553


In [286]:
# Look further into the baseline logistic regression.

# Use the absolute value of each coefficient as its importance:
# a coefficient of +3.0 is just as influential as one of -3.0.
coefs = np.abs(baseline.coef_[0])

# One row per feature, ordered from the largest coefficient magnitude down.
feature_importance_df = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Importance': coefs,
    }
).sort_values(by='Importance', ascending=False)

# Print the ranked table.
print(feature_importance_df)

       Feature  Importance
5          VOC    2.420079
3          USS    1.297031
2           AQ    0.825277
4           CS    0.603889
0     footfall    0.266227
6           RP    0.128148
8  Temperature    0.105504
7           IP    0.103049
1     tempMode    0.092369


In [310]:
# Logistic-regression pipeline: scaling, univariate feature selection, then an
# L1-penalised classifier. Because scaling and selection live inside the
# pipeline they are re-fitted on each cross-validation training fold.
pipe = Pipeline([
    # Scale the data.
    ('scaler', StandardScaler()),
    # Keep the 4 features with the highest ANOVA F-score (f_classif) against 'fail'.
    ('selector', SelectKBest(score_func=f_classif, k=4)),
    # L1-penalised logistic regression. The 'saga' solver shuffles the data
    # while fitting, so random_state is pinned; without it the scores below can
    # change from one run to the next.
    ('logreg', LogisticRegression(penalty="l1", C=1.0, max_iter=1000, solver='saga', random_state=42))
])

# Cross-validation strategy: stratified 10-fold, repeated 3 times, with a fixed
# seed so both cross_val_score calls below see the same splits.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Cross-validated F1 score on the training set.
scores = cross_val_score(pipe, X_train, y_train, scoring='f1', cv=cv, n_jobs=-1)
print(f"Mean F1 score: {scores.mean():.4f}")

# Cross-validated accuracy on the training set.
acc_scores = cross_val_score(pipe, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(f"Mean Accuracy: {acc_scores.mean():.4f}")


# Fit on the full training set (cross_val_score works on clones and leaves
# `pipe` itself unfitted).
pipe.fit(X_train, y_train)

# Predict on the hold-out validation set. The raw features are passed in; the
# pipeline applies its own scaler and selector.
y_val_pred = pipe.predict(X_val)

# Evaluate on the validation set.
acc_val = accuracy_score(y_val, y_val_pred)
f1_val = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {acc_val:.4f}")
print(f"Validation F1 score: {f1_val:.4f}")

Mean F1 score: 0.9038
Mean Accuracy: 0.9219
Validation Accuracy: 0.8830
Validation F1 score: 0.8889


## RandomForestClassifier

In [297]:
# Random-forest pipeline: univariate selection of the 9 best-scoring features
# (ANOVA F-score), followed by a seeded random forest. No scaling step is used.
pipe = Pipeline([
    ('selector', SelectKBest(score_func=f_classif, k=9)),
    (
        'randomForest',
        RandomForestClassifier(
            random_state=42,
            n_estimators=100,
            min_samples_split=2,
            max_depth=5,
        ),
    ),
])

# Stratified 10-fold cross-validation repeated 3 times, with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Cross-validated F1 and accuracy on the training set, computed in that order.
scores, acc_scores = (
    cross_val_score(pipe, X_train, y_train, scoring=metric, cv=cv, n_jobs=-1)
    for metric in ('f1', 'accuracy')
)

print(f"Mean F1 score: {scores.mean():.4f}")
print(f"Mean Accuracy: {acc_scores.mean():.4f}")

# Fit on the full training set, then predict on the hold-out validation set.
y_val_pred = pipe.fit(X_train, y_train).predict(X_val)

# Evaluate on the validation set.
acc_val = accuracy_score(y_val, y_val_pred)
f1_val = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {acc_val:.4f}")
print(f"Validation F1 score: {f1_val:.4f}")


Mean F1 score: 0.8913
Mean Accuracy: 0.9108
Validation Accuracy: 0.8723
Validation F1 score: 0.8776
