In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

# Define the remove_outliers function
def remove_outliers(df, columns):
    for col in columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df

# Load the dataset
df = pd.read_csv('/Users/sarithavuppula/Downloads/Summer School/final_version_data.csv')

# Remove outliers
df = remove_outliers(df, ['pin', 'po', 'pdmp'])

# Feature extraction with tqdm to track progress
def extract_features(df):
    features = []
    for _, group in tqdm(df.groupby(['condition', 'mode', 'cycle']), desc="Extracting features"):
        pin_mean = group['pin'].mean()
        po_mean = group['po'].mean()
        pdmp_mean = group['pdmp'].mean()
        pin_var = group['pin'].var()
        po_var = group['po'].var()
        pdmp_var = group['pdmp'].var()
        pin_skew = group['pin'].skew()
        po_skew = group['po'].skew()
        pdmp_skew = group['pdmp'].skew()
        pin_kurt = group['pin'].kurtosis()
        po_kurt = group['po'].kurtosis()
        pdmp_kurt = group['pdmp'].kurtosis()
        features.append([group['cycle'].iloc[0], group['condition'].iloc[0], group['mode'].iloc[0], 
                        pin_mean, po_mean, pdmp_mean, pin_var, po_var, pdmp_var, 
                         pin_skew, po_skew, pdmp_skew, pin_kurt, po_kurt, pdmp_kurt])
    return pd.DataFrame(features, columns=['cycle', 'condition', 'mode', 
                                           'pin_mean', 'po_mean', 'pdmp_mean', 
                                           'pin_var', 'po_var', 'pdmp_var', 
                                           'pin_skew', 'po_skew', 'pdmp_skew',
                                           'pin_kurt', 'po_kurt', 'pdmp_kurt'])

features_df = extract_features(df)

# Normalize features
scaler = StandardScaler()
X = features_df.iloc[:, 3:]
y = features_df['mode']

X_scaled = scaler.fit_transform(X)

# Address class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


Extracting features: 100%|██████████| 34045/34045 [00:12<00:00, 2641.16it/s]


In [None]:
# Save the preprocessed features for later use
df_sample.to_csv('/Users/sarithavuppula/Downloads/Summer School/preprocessed_features.csv', index=False)

In [15]:
# Simplified Hyperparameter tuning for RandomForest with tqdm to track progress
param_grid = {
    'n_estimators': [100, 150],  # Reduced number of estimators
    'max_depth': [10, 15],       # Fewer depth options
    'min_samples_split': [2, 5], # Fewer options for splits
}

# Initialize GridSearchCV with fewer options and tqdm progress tracking
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), 
                           param_grid=param_grid, 
                           cv=3,  # Reduced to 3 folds
                           n_jobs=-1, 
                           verbose=0)

# Fit the model with progress tracking
with tqdm(total=len(param_grid['n_estimators']) * len(param_grid['max_depth']) * 
          len(param_grid['min_samples_split']), desc="Hyperparameter Tuning") as pbar:
    grid_search.fit(X_train, y_train)
    pbar.update()

# Best model from grid search
best_model = grid_search.best_estimator_

# Cross-validation for robustness
cv_scores = cross_val_score(best_model, X_resampled, y_resampled, cv=3)
print(f"Cross-Validation Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# Predict on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Display the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix for Fault Classification')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Print detailed classification report
print(classification_report(y_test, y_pred))

Hyperparameter Tuning:  12%|█▎        | 1/8 [00:40<04:43, 40.55s/it]


Cross-Validation Accuracy: 0.8685 ± 0.0615
Model Accuracy: 0.9354
              precision    recall  f1-score   support

           1       0.85      0.83      0.84       678
           2       1.00      1.00      1.00       687
           3       0.95      0.97      0.96       716
           4       1.00      1.00      1.00       672
           5       0.84      0.86      0.85       722
           6       0.99      1.00      0.99       665
           7       0.91      0.91      0.91       720
           8       0.89      0.87      0.88       715
           9       0.97      0.96      0.96       707
          10       0.90      0.91      0.91       691
          11       1.00      1.00      1.00       710

    accuracy                           0.94      7683
   macro avg       0.94      0.94      0.94      7683
weighted avg       0.94      0.94      0.94      7683



  plt.show()


In [12]:
# Function to preprocess and apply the model to new data
def apply_model_to_new_data(new_data_path, model, scaler):
    # Load new data
    new_df = pd.read_csv(new_data_path)

    # Preprocess the new data (similar to the steps above)
    new_features_df = extract_features(new_df)
    X_new = new_features_df.iloc[:, 3:]  # Use the extracted features
    X_new_scaled = scaler.transform(X_new)  # Normalize the new data

    # Predict using the trained model
    y_new_pred = model.predict(X_new_scaled)

    # Add predictions to the new dataframe
    new_features_df['predicted_mode'] = y_new_pred

    # Save predictions
    new_features_df.to_csv('/Users/sarithavuppula/Downloads/Summer School/new_data_predictions.csv', index=False)
    print("Predictions saved to new_data_predictions.csv")

In [None]:
# Apply the model to new data
new_data_path = '/Users/sarithavuppula/Downloads/Data_Challenge_PHM2022_validation_data 2/data_pdmp7.csv'  # Replace with your actual path 
apply_model_to_new_data(new_data_path, model, scaler)

In [16]:
import streamlit as st

st.title("Simple Streamlit App")
st.write("This is a simple example of a Streamlit app.")