In [2]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, make_scorer
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import xgboost as xgb
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Mount Google Drive at /content/drive so the notebook can read files stored there (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## EDA

In [4]:
# Load the raw device telemetry and preview the first rows.
# The path is absolute so it resolves against the Drive mount point
# ('/content/drive') regardless of the kernel's current working directory.
df = pd.read_csv('/content/drive/MyDrive/device_failure.csv', encoding='latin-1')
df.head()

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9
0,2015-01-01,S1F01085,0,215630672,56,0,52,6,407438,0,0,7
1,2015-01-01,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,2015-01-01,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,2015-01-01,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,2015-01-01,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3


In [5]:
# Inspect the column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124494 entries, 0 to 124493
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        124494 non-null  object
 1   device      124494 non-null  object
 2   failure     124494 non-null  int64 
 3   attribute1  124494 non-null  int64 
 4   attribute2  124494 non-null  int64 
 5   attribute3  124494 non-null  int64 
 6   attribute4  124494 non-null  int64 
 7   attribute5  124494 non-null  int64 
 8   attribute6  124494 non-null  int64 
 9   attribute7  124494 non-null  int64 
 10  attribute8  124494 non-null  int64 
 11  attribute9  124494 non-null  int64 
dtypes: int64(10), object(2)
memory usage: 11.4+ MB


In [6]:
# Summary statistics (count, mean, std, quantiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9
count,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0
mean,0.000851,122388100.0,159.484762,9.940455,1.74112,14.222669,260172.657726,0.292528,0.292528,12.451524
std,0.029167,70459330.0,2179.65773,185.747321,22.908507,15.943028,99151.078547,7.436924,7.436924,191.425623
min,0.0,0.0,0.0,0.0,0.0,1.0,8.0,0.0,0.0,0.0
25%,0.0,61284760.0,0.0,0.0,0.0,8.0,221452.0,0.0,0.0,0.0
50%,0.0,122797400.0,0.0,0.0,0.0,10.0,249799.5,0.0,0.0,0.0
75%,0.0,183309600.0,0.0,0.0,0.0,12.0,310266.0,0.0,0.0,0.0
max,1.0,244140500.0,64968.0,24929.0,1666.0,98.0,689161.0,832.0,832.0,18701.0


In [8]:
from pandas_profiling import ProfileReport

# Build an exploratory profiling report of the raw frame and embed it
# in the notebook output as an iframe.
profile = ProfileReport(
    df,
    title="Pandas Profiling Report",
    explorative=True,
)
profile.to_notebook_iframe()

Output hidden; open in https://colab.research.google.com to view.

Summary
This dataset is clean, with no missing values. All attribute columns are of integer data type.

The dataset is imbalanced: the failure class makes up only about 0.1% of the observations. An oversampling approach is used here to deal with the imbalance.

Attributes 7 and 8 appear to be exactly the same, so we can drop one of them.

Some attributes have a limited number of distinct values and are very sparse, indicating that they are likely to be categorical variables, such as attributes 3, 5, 7 and 9.

Attributes 2, 3, 4, 7 and 9 are highly skewed.

The attributes differ in their magnitudes, so scaling or centering is required.

In [1]:
# Drop attribute8 (its summary statistics are identical to attribute7's) and
# remove fully duplicated rows.
# The column check plus rebinding `df` (instead of inplace=True) keeps this cell
# safe to re-run: a second execution no longer raises KeyError on the missing column.
if 'attribute8' in df.columns:
    df = df.drop(columns='attribute8')
df = df.drop_duplicates()
# Sanity check: number of duplicated rows still present after de-duplication.
df.duplicated().sum()

NameError: name 'df' is not defined

In [None]:
# Parse the 'date' column (loaded as object dtype) into datetimes so the .dt accessor can be used on it.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Days elapsed between each observation and the earliest date in the whole dataset.
# NOTE(review): this is measured from the dataset's first date, not from each
# device's own first record - confirm that is the intended meaning of "active days".
df['active_days'] = (df['date'] - df['date'].min()).dt.days

In [None]:
# Derive calendar features from the observation date: month number and
# weekday index as returned by the pandas .dt accessor.
date_parts = df['date'].dt
df['month'] = date_parts.month
df['weekday'] = date_parts.weekday
df.head()

In [None]:
# Correlation heatmap of the observation-level columns; the device identifier
# and the timestamp are excluded before computing correlations.
plt.figure(figsize=(12, 8))
observation_corr = df.drop(columns=['device', 'date']).corr()
sns.heatmap(observation_corr, annot=True)

In [None]:
# Number of distinct devices reporting in each month, drawn as a line plot.
df.groupby('month')['device'].nunique().plot()

This figure shows that the number of reporting devices keeps decreasing as time goes on.

## Aggregating data
Now we group the observations by device ID and aggregate the values for all attributes. The hypothesis is that aggregated attribute data will give us information about the propensity of devices to break down.

In [None]:
# Collect the complete observation history of every device that has at least
# one row flagged as a failure.
failed_devices = df.loc[df['failure'] == 1]
failed_device_ids = failed_devices['device'].values
failed_devices_history = df.loc[df['device'].isin(failed_device_ids)]
failed_devices_history.head()

In [None]:
# Collapse each failed device's history into a single row:
#   - failure: max over the history (1 if any row is a failure)
#   - attribute1..7 and attribute9: mean over the history, truncated to int below
#   - weekday / active_days: values from the device's latest observation
# String aggregation names replace np.max / np.mean: passing NumPy callables to
# .agg() is deprecated in recent pandas and only worked through an internal alias.
# Sorting by date first makes 'last' independent of the incoming row order.
attribute_columns = ['attribute1', 'attribute2', 'attribute3', 'attribute4',
                     'attribute5', 'attribute6', 'attribute7', 'attribute9']
aggregations = {
    'failure': 'max',
    **{column: 'mean' for column in attribute_columns},
    'weekday': 'last',
    'active_days': 'last',
}
failed_devices_history_agg = (
    failed_devices_history
    .sort_values('date', kind='stable')
    .groupby('device')
    .agg(aggregations)
    .reset_index()
)
# Cast the averaged attributes back to integers (the fractional part is dropped).
failed_devices_history_agg = failed_devices_history_agg.astype(
    {column: 'int' for column in attribute_columns}
)
failed_devices_history_agg.info()

Now we can see that all devices were aggregated correctly, because we have 106 observations, which equals the number of failed devices.

In [None]:
# Same per-device aggregation for devices that have at least one non-failure row.
# Note that a device with both failure and non-failure rows is still selected
# here, because the ids come from the failure == 0 rows.
# String aggregation names replace np.max / np.mean: passing NumPy callables to
# .agg() is deprecated in recent pandas and only worked through an internal alias.
# Sorting by date first makes 'last' independent of the incoming row order.
attribute_columns = ['attribute1', 'attribute2', 'attribute3', 'attribute4',
                     'attribute5', 'attribute6', 'attribute7', 'attribute9']
aggregations = {
    'failure': 'max',
    **{column: 'mean' for column in attribute_columns},
    'weekday': 'last',
    'active_days': 'last',
}
not_failed_devices = df[df.failure == 0]
not_failed_device_ids = not_failed_devices.device.values
not_failed_devices_history = df[df.device.isin(not_failed_device_ids)]
not_failed_devices_history_agg = (
    not_failed_devices_history
    .sort_values('date', kind='stable')
    .groupby('device')
    .agg(aggregations)
    .reset_index()
)
# Cast the averaged attributes back to integers (the fractional part is dropped).
not_failed_devices_history_agg = not_failed_devices_history_agg.astype(
    {column: 'int' for column in attribute_columns}
)
not_failed_devices_history_agg.info()

In [None]:
# Keep only devices that never failed: remove every device that already
# appears in the failed-device aggregate.
appears_in_failed = not_failed_devices_history_agg['device'].isin(failed_devices_history_agg['device'])
not_failed_devices_history_agg = not_failed_devices_history_agg[~appears_in_failed]
not_failed_devices_history_agg.head()

In [None]:
# Stack the failed and never-failed per-device aggregates into one frame;
# ignore_index=True gives the result a fresh 0..n-1 index.
merged_agg = pd.concat([failed_devices_history_agg, not_failed_devices_history_agg], ignore_index=True)
merged_agg.info()

We created a dataset with consolidated data for broken and non-broken devices based on their historical data

In [None]:
# Count device ids that occur more than once in the merged frame.
merged_agg.device.duplicated().sum()

The dataset has no duplicated device IDs. This means that the historical data on broken devices is correctly separated and is not repeated.

In [None]:
# Class balance at device level: number of devices per value of the failure flag.
merged_agg.failure.value_counts()

Number of devices broken and not broken

In [None]:
# Correlation heatmap of the per-device aggregates.
# plt.subplots returns (figure, axes); unpack into fresh names so the pyplot
# module alias `plt` is NOT overwritten - later cells still call
# plt.figure / plt.xlabel / plt.show and would fail on an Axes object.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(merged_agg.drop("device", axis=1).corr(), annot=True, ax=ax)

You can notice that the correlation between the target variable and features has improved

## Training and evaluating model
Let's train a GradientBoostingClassifier model on our data.

In [None]:
#creating features and target data
X = merged_agg.drop(['failure', 'device'], axis=1)
y = merged_agg['failure']

In [None]:
# Split into train and test sets (80/20).
# stratify=y keeps the failure rate the same in both parts; without it the
# minority class share in the test set depends on the random draw.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
#function to train, predict and evaluate model
def base_model(model_name, x_train, x_test, y_train, y_test):
    """Fit an estimator and print its F1 score on the train and test sets.

    Parameters:
    model_name: Estimator instance (not a string, despite the name) providing
        ``fit`` and ``predict``. It is fitted in place.
    x_train, y_train: Features and labels used for fitting.
    x_test, y_test: Features and labels used for the hold-out score.

    Returns:
    None. Both scores are printed.
    """
    model_name.fit(x_train, y_train)
    train_pred = model_name.predict(x_train)
    test_pred = model_name.predict(x_test)
    # The original messages contained a stray ", " before the value.
    print(f'train f1 score: {f1_score(y_train, train_pred)}')
    print(f'test f1 score: {f1_score(y_test, test_pred)}')

# Baseline: gradient boosting with default hyperparameters, scored on the train/test split.
base_model(GradientBoostingClassifier(), X_train, X_test, y_train, y_test)

We can see that our model is overfitted.

In [None]:
def tune_model_with_imbalanced_data(data, target_column, drop_columns, model, param_grid, sampler=SMOTE, metric=f1_score, test_size=0.1, random_state=42, id_column='device'):
    """
    Tune a classifier with grid search on an oversampled training set and
    evaluate the best configuration on a stratified hold-out set.

    Parameters:
    data (pd.DataFrame): DataFrame with the data.
    target_column (str): Name of the target column.
    drop_columns (list): Columns removed from ``data`` to build the feature
        matrix (should include the target column itself).
    model (sklearn estimator): Model to be tuned; used as the 'classifier'
        step of the pipeline.
    param_grid (dict): Grid of hyperparameters; keys must use the
        'classifier__' prefix to reach the model.
    sampler (imblearn sampler class, optional): Resampling class, instantiated
        as ``sampler(random_state=random_state)``. Default is SMOTE.
    metric (callable, optional): Metric ``metric(y_true, y_pred)`` used for
        the grid search, the test score and the final cross-validation.
        Default is f1_score.
    test_size (float, optional): Size of the test set, default is 0.1.
    random_state (int, optional): Random state for reproducibility, default is 42.
    id_column (str or None, optional): Column of ``data`` copied into the
        results frame to identify each test row, default is 'device'. If it is
        None or absent from ``data``, the results carry only the probabilities
        (still indexed like the test rows).

    Returns:
    best_model: Best tuned pipeline (sampler + classifier), refitted on the
        whole training set.
    best_params: Best hyperparameters.
    best_score: Mean cross-validated score of the best hyperparameters on the
        training set (``GridSearchCV.best_score_``).
    test_score: Score of ``metric`` on the test set.
    results (pd.DataFrame): Predicted probability of the positive class for
        each test row, plus ``id_column`` when available.
    """

    # Used only to label the printed scores, so they stay correct when a
    # metric other than F1 is passed in.
    metric_name = getattr(metric, '__name__', 'score')

    # Splitting the data (stratified so both parts keep the class ratio)
    X = data.drop(columns=drop_columns)
    y = data[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size,
                                                        random_state=random_state,
                                                        stratify=y)

    # Creating a pipeline with oversampling. GridSearchCV fits a fresh copy of
    # the whole pipeline on each CV training split, so the sampler is fitted
    # per split rather than once on all training data.
    pipeline = Pipeline([
        ('sampler', sampler(random_state=random_state)),
        ('classifier', model)
    ])

    # Defining the metric
    scorer = make_scorer(metric)

    # Grid Search
    grid_search = GridSearchCV(pipeline, param_grid, scoring=scorer, cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Best parameters and scores
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    best_model = grid_search.best_estimator_

    # Evaluation on the test set
    y_pred = best_model.predict(X_test)
    test_score = metric(y_test, y_pred)

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)

    # Printing results
    print("Best parameters:", best_params)
    print(f"Best cross-validated {metric_name} on training set:", best_score)
    print(f"{metric_name} on test set:", test_score)
    print("Confusion Matrix:")
    print(cm)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Visualizing the Confusion Matrix on an explicit Axes
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm,
                annot=True,
                fmt='d',
                cmap='Blues',
                xticklabels=['Predicted Negative', 'Predicted Positive'],
                yticklabels=['Actual Negative', 'Actual Positive'],
                ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    plt.show()

    # Cross-validation of the best pipeline on the training set, scored with
    # the same metric as the grid search (previously hard-coded to 'f1').
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring=scorer)
    print(f"Average {metric_name} on training set (cross-validation):", cv_scores.mean())

    # Predicting probabilities of failure for each row in the test set
    probabilities = best_model.predict_proba(X_test)[:, 1]

    # Creating a DataFrame to store the results; the identifier column is
    # included only when it actually exists in `data`.
    result_columns = {}
    if id_column is not None and id_column in data.columns:
        result_columns[id_column] = data.loc[X_test.index, id_column]
    result_columns['probability_of_failure'] = probabilities
    results = pd.DataFrame(result_columns, index=X_test.index)

    # Display the first few rows of the results
    print(results.head())

    return best_model, best_params, best_score, test_score, results
# Hyperparameter grid for the XGBoost step; the 'classifier__' prefix routes
# each entry to the pipeline step named 'classifier'.
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[3, 5, 7],
    classifier__learning_rate=[0.01, 0.1, 0.2],
    classifier__scale_pos_weight=[1, 10, 25],  # This parameter helps with imbalanced data
)

# Tune on the per-device aggregates; 'device' and the target are excluded from the features.
best_model, best_params, best_train_score, test_score, results = tune_model_with_imbalanced_data(
    data=merged_agg,
    target_column='failure',
    drop_columns=['device', 'failure'],
    model=XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    param_grid=param_grid,
)

The model shows a strong performance with an overall accuracy of 97%. This high accuracy indicates that the model is able to correctly classify the majority of the samples.
The mean cross-validated F1-score on the training set is 0.8094, while the F1-score on the test set is slightly higher at 0.8293. This indicates that the model is not overfitting and generalizes well to unseen data.
The confusion matrix reveals that the model has a very low number of false positives (5) and false negatives (2), which further supports the model's robustness.
The precision and recall for the negative class (majority class) are both very high, at 0.99 and 0.98 respectively. For the positive class (minority class), the precision is 0.77 and recall is 0.89. The higher recall for the positive class indicates that the model is able to identify the majority of positive samples, although there is a slight trade-off with precision.
The macro average and weighted average F1-scores are both high (0.91 and 0.97 respectively), reflecting the model's strong performance across both classes.