# Predictive Maintenance
Predictive maintenance is a technique that uses data analysis tools and techniques to detect anomalies in your operation and possible defects in equipment and processes so you can fix them before they result in failure.


# Data Description
There are 5 CSV files consisting of:

***Telemetry Time Series Data (PdM_telemetry.csv)*** : It consists of hourly average of voltage, rotation, pressure, vibration collected from 100 machines for the year 2015.

***Error (PdM_errors.csv)***: These are errors encountered by the machines while in operating condition. Since these errors don't shut down the machines, they are not considered failures. The error dates and times are rounded to the closest hour since the telemetry data is collected at an hourly rate.

***Maintenance (PdM_maint.csv)***: If a component of a machine is replaced, that is captured as a record in this table. Components are replaced under two situations:

1. During a regular scheduled visit, the technician replaces the component (Proactive Maintenance).
2. A component breaks down and the technician performs unscheduled maintenance to replace it (Reactive Maintenance). This is considered a failure, and the corresponding data is captured under Failures.

Maintenance data has both 2014 and 2015 records. This data is rounded to the closest hour since the telemetry data is collected at an hourly rate.

***Failures (PdM_failures.csv)***: Each record represents the replacement of a component due to failure. This data is a subset of the Maintenance data. This data is rounded to the closest hour since the telemetry data is collected at an hourly rate.

***Metadata of Machines (PdM_Machines.csv)***: Model type & age of the Machines.

Predictive Maintenance
Predictive maintenance is a technique that uses data analysis tools and techniques to detect anomalies in your operation and possible defects in equipment and processes so you can fix them before they result in failure.

In [None]:
import anai
from anai.preprocessing import Preprocessor
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Load the five source tables from the local DATA/ directory.
telemetry_df = pd.read_csv('DATA/PdM_telemetry.csv')  # hourly sensor readings
errors_df = pd.read_csv('DATA/PdM_errors.csv')  # non-breaking error events
maint_df = pd.read_csv('DATA/PdM_maint.csv')  # component replacement records
failures_df = pd.read_csv('DATA/PdM_failures.csv')  # replacements caused by failure
machines_df = pd.read_csv('DATA/PdM_machines.csv')  # machine metadata (model, age)

In [None]:
# Every time-stamped table gets its "datetime" column parsed from text into
# pandas timestamps, then is sorted in place by (datetime, machineID) with a
# fresh 0..n-1 index. machines_df has no timestamp and is not included.
tables = [telemetry_df, maint_df, failures_df, errors_df]
for df in tables:
    # An explicit format avoids per-value format inference.
    df["datetime"] = pd.to_datetime(df["datetime"], format="%Y-%m-%d %H:%M:%S")
    df.sort_values(["datetime", "machineID"], inplace=True, ignore_index=True)

# Data Insights



## Telemetry Data
> This data consists of hourly averages of voltage, rotation, pressure and vibration collected from 100 machines for the year 2015.

In [None]:
print(f"Shape of the Telemetry : {telemetry_df.shape}")
print("\n")
telemetry_df.head()

In [None]:
print(f"No.Of Machine in the Telemetry : {telemetry_df.machineID.nunique()}")

### Missing Values in the Telemetry data 

In [None]:
telemetry_df.datetime.describe()

In [None]:
print('missing Dates : ' , telemetry_df.datetime.isna().sum())

In [None]:
telemetry_df.isna().sum()

In [None]:
telemetry_df.describe()  ##info on this is Required

## Error Data

## Maintenance Data

## Machine Data

In [None]:
print(f"Shape of the Machines Data: {machines_df.shape}")
print("\n")
machines_df.head()

## Failure Data

# Exploratory Data Analysis

## EDA Functions

In [None]:
def create_date_features(source_df, target_df, feature_name):
    '''
    Decompose a datetime column into calendar features.

    source_df : DataFrame holding the datetime column to decompose.
    target_df : DataFrame that receives the new columns. It is modified in
                place and also returned (it may be the same object as
                source_df).
    feature_name : Name of the datetime column in source_df.

    Returns target_df with these columns added: year, month, quarter,
    weekofyear, hour, day, dayofweek, dayofyear, the six is_*_start/end
    boolean flags, and month_year.
    '''
    stamps = source_df.loc[:, feature_name].dt

    target_df.loc[:, 'year'] = stamps.year.astype('uint16')
    target_df.loc[:, 'month'] = stamps.month.astype('uint8')
    target_df.loc[:, 'quarter'] = stamps.quarter.astype('uint8')
    target_df.loc[:, 'weekofyear'] = stamps.isocalendar().week.astype('uint8')

    target_df.loc[:, 'hour'] = stamps.hour.astype('uint8')

    target_df.loc[:, 'day'] = stamps.day.astype('uint8')
    target_df.loc[:, 'dayofweek'] = stamps.dayofweek.astype('uint8')
    # dayofyear goes up to 365/366, which does not fit in uint8 (max 255) and
    # would silently wrap around, so a 16-bit type is required here.
    target_df.loc[:, 'dayofyear'] = stamps.dayofyear.astype('uint16')
    target_df.loc[:, 'is_month_start'] = stamps.is_month_start
    target_df.loc[:, 'is_month_end'] = stamps.is_month_end
    target_df.loc[:, 'is_quarter_start'] = stamps.is_quarter_start
    target_df.loc[:, 'is_quarter_end'] = stamps.is_quarter_end
    target_df.loc[:, 'is_year_start'] = stamps.is_year_start
    target_df.loc[:, 'is_year_end'] = stamps.is_year_end

    # Monthly pandas Period values (e.g. 2015-12), not plain strings.
    target_df.loc[:, 'month_year'] = stamps.to_period('M')

    return target_df



def plot_histogram(data, x_column, color_column, title, nbins=1000, width=1000, height=600, log_x=False, log_y=False):
    """
    Build a Plotly Express histogram of ``x_column`` coloured by ``color_column``.

    data : DataFrame to plot.
    x_column : Column whose distribution is shown; also used as the x-axis title.
    color_column : Column used to colour the bars.
    title : Figure title.
    nbins, width, height, log_x, log_y : Passed through to ``px.histogram``.

    Returns the figure; the caller decides when to ``show()`` it.
    """
    histogram_options = dict(
        x=x_column,
        color=color_column,
        title=title,
        nbins=nbins,
        width=width,
        height=height,
        log_x=log_x,
        log_y=log_y,
    )
    figure = px.histogram(data, **histogram_options)

    # The y axis always shows row counts.
    figure.update_layout(xaxis_title=x_column, yaxis_title="Count")
    return figure

def plot_boxplot(data, x_column, y_column, title, width=1000, height=900, xaxis_title=None, yaxis_title=None):
    """
    Build a Plotly Express box plot of ``x_column`` against ``y_column``.

    data : DataFrame to plot.
    x_column, y_column : Columns mapped to the x and y axes.
    title : Figure title.
    width, height : Figure size passed to ``px.box``.
    xaxis_title, yaxis_title : Optional axis titles; when empty or None the
        column names are used instead.

    Returns the figure; the caller decides when to ``show()`` it.
    """
    figure = px.box(data, x=x_column, y=y_column, title=title, width=width, height=height)

    # Fall back to the column names when no custom axis titles are given.
    axis_titles = {
        "xaxis_title": xaxis_title or x_column,
        "yaxis_title": yaxis_title or y_column,
    }
    figure.update_layout(**axis_titles)
    return figure

import plotly.express as px

def plot_scatter(df, feature_x, feature_y, title=None, xlabel=None, ylabel=None, width=800, height=600):
    """
    Build a Plotly Express scatter plot of ``feature_x`` against ``feature_y``.

    df : DataFrame to plot.
    feature_x, feature_y : Columns mapped to the x and y axes.
    title : Optional figure title.
    xlabel, ylabel : Optional axis titles; when empty or None the column
        names are used instead.
    width, height : Figure size passed to ``px.scatter``.

    Returns the figure; the caller decides when to ``show()`` it.
    """
    figure = px.scatter(df, x=feature_x, y=feature_y, title=title, width=width, height=height)

    # Fall back to the column names when no custom axis labels are given.
    figure.update_layout(
        xaxis_title=xlabel or feature_x,
        yaxis_title=ylabel or feature_y,
    )
    return figure



## EDA On Telemetry Data

 Vibration of Machine1 for 2015

In [None]:
df_vib_machine_1 = telemetry_df[
    telemetry_df.machineID == 1][["datetime", "vibration"]]


In [None]:
fig = px.line(x = df_vib_machine_1['datetime'].values, y = df_vib_machine_1['vibration'].values ,title="Vibration of Machine 1",template='plotly_dark')
fig.update_layout(xaxis_title='Time', yaxis_title='Vibration')
fig.show() 

Voltage for Machine1 for January Month

In [None]:
plot_df = telemetry_df.loc[(telemetry_df['machineID'] == 1) &
                        (telemetry_df['datetime'] > pd.to_datetime('2015-01-01')) &
                        (telemetry_df['datetime'] < pd.to_datetime('2015-02-01')), ['datetime', 'volt']]


In [None]:
fig = px.line(x=plot_df['datetime'].values, y=plot_df['volt'].values, title='Voltage over time', template='plotly_dark')
fig.update_layout(xaxis_title='Time', yaxis_title='Voltage')
fig.show()


Machine 2 voltage for the first three ISO weeks of 2015

In [None]:
# NOTE(review): despite its name, this frame holds the *voltage* readings of
# machine 2; the variable name is reused from the machine-1 vibration cell.
# Rows are kept when the ISO week number of the timestamp is 1, 2 or 3.
df_vib_machine_1 = telemetry_df[
    (telemetry_df.machineID == 2) & (
        telemetry_df.datetime.dt.isocalendar().week.isin(
            [1, 2, 3]))][["datetime", "volt"]]


In [None]:
fig = px.line(x=df_vib_machine_1['datetime'].values, y=df_vib_machine_1['volt'].values, title='Voltage over time', template='plotly_dark')
fig.update_layout(xaxis_title='Time', yaxis_title='Voltage')
fig.show()


Plot the distribution of voltage across the months. Ideally there should be some amount of seasonality in the data.

In [None]:
telemetry_df = create_date_features(telemetry_df, telemetry_df, "datetime")
telemetry_df.head()

In [None]:
telemetry_df = create_date_features(telemetry_df, telemetry_df, "datetime")
telemetry_df.head()

In [None]:
telemetry_df['month_year'] = telemetry_df['month_year'].astype(str)

fig = plot_boxplot(
    telemetry_df,
    x_column="volt",
    y_column="month_year",
    title="Distribution of volt by month_year"
)
fig.show()

The plot shows that the voltage distribution across machines does not vary from month to month.

We can ignore the entry for 2016 since we only have data for one day in 2016.

In [None]:


fig = px.box(
    telemetry_df[telemetry_df.machineID == 80], 
    x="volt",  # Horizontal axis
    y="month_year",  # Grouping variable
    title="Distribution of volt by month_year",
    width=1000,  # Adjust width (optional)
    height=900   # Adjust height (optional)
)

fig.update_layout(
    xaxis_title="volt", 
    yaxis_title="month_year"
    )

fig.show()

In [None]:
fig = plot_histogram(
    telemetry_df,
    x_column="volt",
    color_column="month_year",
    title="Distribution of volt",
    nbins=1000
)
fig.show()

Analysis of the histogram above:

Observations:

	1.	Overall Distribution (Shape):
	•	The volt values exhibit a bell-shaped curve, which is indicative of a normal distribution. This suggests that most of the volt values are clustered around the mean, with fewer occurrences at the extremes.
	2.	Spread Across month_year:
	•	Each month_year is represented by a different color in the stacked histogram.
	•	There is a consistent distribution across months; no month appears to deviate significantly in terms of the volt distribution’s central tendency or spread.
	•	All months seem to have similar peak counts, with most data points centered around volt values between 160 and 180.
	

Insights:

	1.	Consistency Over Time:
	•	The near-identical distributions across months suggest that the volt readings are stable over time. This could indicate that the monitored system operates consistently, with no drastic changes or anomalies month-to-month.
	


In [None]:

for name in ['rotate', 'pressure', 'vibration']:
    fig  =plot_histogram(telemetry_df, x_column=name, color_column="month_year",  title=f"Distribution of {name}")
    fig.show()

Observations about Telemetry Data
1. This may be synthetically generated data distributed between 1st Jan 2015 and 1st Jan 2016.
2. Each row represents the state of a machine at a particular hour. Voltage, vibration, pressure & rotation of a machine have been averaged hourly.
3. There are 100 unique machines.
4. There are no duplicates or missing values in the dataset.
5. The four parameters voltage, vibration, pressure & rotation appear to be approximately normally distributed.

## EDA on Machine Data 

In [None]:
fig = plot_boxplot(
    machines_df,
    x_column="age",
    y_column="model",
    title="Distribution of age by model",
   
    height = 400
)
fig.show()

The age of the machines is distributed between 0 and 20. The median age is ~12.5. There are no outliers, which is another indication that this is synthetic data.



In [None]:
# Count error, maintenance and failure records per machine and attach the
# counts to the machine metadata. Left joins keep every machine; a machine
# without records of some kind gets NaN for that count.

# Number of error records per machine
erros_across_machine = (
    errors_df.groupby("machineID").size().reset_index(name="num_errors")
)
machines_errors_df = machines_df.merge(erros_across_machine, how='left', on="machineID")

# Number of maintenance records per machine
maint_across_machine = (
    maint_df.groupby("machineID").size().reset_index(name="num_maint")
)
machines_errors_df = machines_errors_df.merge(maint_across_machine, how='left', on="machineID")

# Number of failure records per machine
failure_across_machine = (
    failures_df.groupby("machineID").size().reset_index(name="num_failure")
)
machines_errors_df = machines_errors_df.merge(failure_across_machine, how='left', on="machineID")

machines_errors_df.head()

In [None]:
fig = plot_scatter(
    df=machines_errors_df,
    feature_x="age",
    feature_y="num_errors",
    title="Age vs Number of Errors",
    xlabel="Age",
    ylabel="Number of Errors"
)
fig.show()

In [None]:
fig = plot_scatter(
    df=machines_errors_df,
    feature_x="age",
    feature_y="num_maint",
    title="Age vs Number of Maintainance Records",
    xlabel="Age",
    ylabel="Number of Maintainance"
)
fig.show()

In [None]:
fig = plot_scatter(
    df=machines_errors_df,
    feature_x="age",
    feature_y="num_failure",
    title="Age vs Number of Failure Records",
    xlabel="Age",
    ylabel="Number of Failure"
)
fig.show()

From the above three plots, it appears that only the number of failures is slightly correlated with age.


# Feature Engineering

### Identifying Lag Features from Telemetry Data on a window of 24 hours

In [None]:
fields = ['volt', 'rotate', 'pressure', 'vibration']

# One hourly datetime x machineID table per sensor; both the mean and the
# standard-deviation features are derived from the same pivots.
hourly_pivots = [
    pd.pivot_table(telemetry_df, index='datetime', columns='machineID', values=name)
    for name in fields
]

# Mean of every 3-hour bin (left-closed, labelled with its right edge),
# reshaped to one row per (machineID, datetime).
temp = [
    pivot.resample('3H', closed='left', label='right').mean().unstack()
    for pivot in hourly_pivots
]
telemetry_mean_3h = pd.concat(temp, axis=1)
telemetry_mean_3h.columns = [name + 'mean_3h' for name in fields]
telemetry_mean_3h = telemetry_mean_3h.reset_index()

# Standard deviation over the same 3-hour bins.
temp = [
    pivot.resample('3H', closed='left', label='right').std().unstack()
    for pivot in hourly_pivots
]
telemetry_sd_3h = pd.concat(temp, axis=1)
telemetry_sd_3h.columns = [name + 'sd_3h' for name in fields]
telemetry_sd_3h = telemetry_sd_3h.reset_index()

telemetry_mean_3h.head()

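	•	In the intermediate pivot table, each row represents a 3-hour time interval, each column corresponds to a specific machineID, and the values are the mean of the selected column (col) for that machineID during that time interval.
	•	After unstack(), concat() and reset_index(), the resulting table is in long format: one row per (machineID, 3-hour interval) and one column per feature (for example voltmean_3h).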

In [None]:
fields = ['volt', 'rotate', 'pressure', 'vibration']

# Hourly datetime x machineID table per sensor, built once and shared by the
# mean and standard-deviation features.
hourly_pivots = {
    col: pd.pivot_table(telemetry_df,
                        index='datetime',
                        columns='machineID',
                        values=col)
    for col in fields
}

# The rolling window is applied to the hourly pivot *before* resampling and
# unstacking, for two reasons:
#   * every pivot column is one machine, so a window never mixes readings of
#     different machines (rolling over the unstacked series would);
#   * 24 hourly rows are a true 24-hour window, whereas 24 rows of 3-hour
#     bins would span 72 hours.
# The rolled values are then sampled every 3 hours (first value of each
# left-closed bin, labelled with its right edge) so the rows line up with the
# 3-hour feature tables.
temp = [
    hourly_pivots[col]
    .rolling(window=24, center=False).mean()
    .resample('3H', closed='left', label='right').first()
    .unstack()
    for col in fields
]
telemetry_mean_24h = pd.concat(temp, axis=1)
telemetry_mean_24h.columns = [i + 'mean_24h' for i in fields]
telemetry_mean_24h.reset_index(inplace=True)
# Drop the warm-up rows at the start of each machine's history, where fewer
# than 24 hourly readings are available.
telemetry_mean_24h = telemetry_mean_24h.loc[~telemetry_mean_24h['voltmean_24h'].isnull()]

temp = [
    hourly_pivots[col]
    .rolling(window=24, center=False).std()
    .resample('3H', closed='left', label='right').first()
    .unstack()
    for col in fields
]
telemetry_sd_24h = pd.concat(temp, axis=1)
telemetry_sd_24h.columns = [i + 'sd_24h' for i in fields]
telemetry_sd_24h.reset_index(inplace=True)
telemetry_sd_24h = telemetry_sd_24h.loc[~telemetry_sd_24h['voltsd_24h'].isnull()]

telemetry_mean_24h.head(10)

	A loop iterates over each field in fields. For each field:
	•	pd.pivot_table() reshapes the telemetry data so that datetime is the index and each machineID is a column.
	•	resample('3H') downsamples the data to 3-hour intervals, keeping the first value in each interval (.first()).
	•	rolling(window=24) computes the rolling mean (or standard deviation) over 24 consecutive rows.
	•	unstack() reshapes the result into one row per (machineID, datetime).

In [None]:
# Combine the 3-hour and 24-hour feature tables side by side; concat with
# axis=1 aligns the tables on their row index.
# telemetry_mean_3h is kept whole (it carries the two index columns restored
# by reset_index); iloc[:, 2:6] takes only the four feature columns of each
# other table so those index columns are not duplicated.
# dropna() removes rows that lack any feature, e.g. rows filtered out of the
# 24-hour tables.
telemetry_feat = pd.concat([telemetry_mean_3h,
                            telemetry_sd_3h.iloc[:, 2:6],
                            telemetry_mean_24h.iloc[:, 2:6],
                            telemetry_sd_24h.iloc[:, 2:6]], axis=1).dropna()
telemetry_feat.describe()

In [None]:
telemetry_feat.head()

### Identifying Lag Features from Error Data on a window of 24 hours

In [None]:
error_count = pd.get_dummies(errors_df.set_index('datetime')).reset_index()

In [None]:
error_count.head()

In [None]:
error_count.columns = ['datetime', 'machineID',
                       'error1', 'error2', 'error3', 'error4', 'error5']

In [None]:
error_count.head(13)

In [None]:
# errors_df may contain several rows for the same machine and hour (different
# error types logged together). Collapse them into one row per
# (machineID, datetime) so the merge below does not duplicate telemetry rows
# and the later pivot does not average the indicators down to fractions.
error_count = error_count.groupby(
    ['machineID', 'datetime'], as_index=False).sum()

# Attach the indicators to every telemetry timestamp; hours without any
# error get 0 for all five error types.
error_count = telemetry_df[['datetime', 'machineID']].merge(
    error_count, on=['machineID', 'datetime'], how='left').fillna(0.0)
error_count.describe()

In [None]:
fields = ['error%d' % i for i in range(1, 6)]

# Count the errors of each type over the trailing 24 hours.
# The rolling sum is taken on the hourly datetime x machineID pivot *before*
# resampling and unstacking: every pivot column is a single machine, so the
# window never mixes machines, and 24 hourly rows are a true 24-hour window
# (24 rows of 3-hour bins would span 72 hours).
# The rolled counts are then sampled every 3 hours so the rows line up with
# the telemetry feature table.
temp = [
    pd.pivot_table(error_count,
                   index='datetime',
                   columns='machineID',
                   values=col)
    .rolling(window=24, center=False).sum()
    .resample('3H', closed='left', label='right').first()
    .unstack()
    for col in fields
]
error_count = pd.concat(temp, axis=1)
error_count.columns = [i + 'count' for i in fields]
error_count.reset_index(inplace=True)
# Drop the warm-up rows that have fewer than 24 hours of history.
error_count = error_count.dropna()
error_count.describe()

In [None]:
error_count.head()


### Identifying Days Since Last Replacement using Maintenance on a window of 24 hours

The code calculates the metric “Days Since Last Replacement” from the maintenance data: for every telemetry timestamp of each machine, it determines how many days have passed since each component was last replaced, based on chronological timestamps.

Key Steps and Functions Used:

1.	Sorting the Data:
The dataset is first sorted using functions like sort_values to ensure that timestamps for each machine are in ascending order. Sorting is critical for performing accurate chronological calculations.
2.	Marking Replacement Times:
For each component, rows where the component was replaced receive that row's own timestamp; all other rows are left empty.
3.	Calculating Time Differences:
The most recent replacement timestamp is carried forward onto later rows with a forward fill (ffill). Subtracting it from the current timestamp and dividing by a one-day timedelta gives the number of days since the last replacement.
4.	Handling Missing Values:
After the maintenance records are merged with the telemetry timestamps, rows without a maintenance record are filled with 0 (fillna), so every timestamp has a value for each component indicator before the replacement times are marked.
5.	Updating the Dataset:
The calculated “Days Since Last Replacement” is stored in a new column, enhancing the dataset for further analysis. This is typically done using assign or by directly adding a new column.

In [None]:
# One-hot encode the replaced component of every maintenance record and give
# the resulting columns short names.
comp_rep = pd.get_dummies(maint_df.set_index('datetime')).reset_index()
comp_rep.columns = ['datetime', 'machineID',
                    'comp1', 'comp2', 'comp3', 'comp4']

# Outer-merge with the telemetry timestamps so that maintenance records with
# no matching telemetry row are kept as well. Rows without a maintenance
# record get 0 for every component, and the result is ordered per machine by
# time.
comp_rep = telemetry_df[['datetime', 'machineID']].merge(comp_rep,
                                                      on=['datetime',
                                                          'machineID'],
                                                      how='outer').fillna(0).sort_values(by=['machineID', 'datetime'])

In [None]:
components = ['comp1', 'comp2', 'comp3', 'comp4']
for comp in components:
    # Keep the row's own timestamp where the component was replaced and
    # leave every other row empty (NaT). Building a new datetime column
    # avoids writing timestamps into the numeric indicator column.
    replaced_at = comp_rep['datetime'].where(comp_rep[comp] >= 1)
    # Carry the most recent replacement time forward, one machine at a time,
    # so a machine never inherits the replacement date of the machine that
    # precedes it in the sorted frame.
    comp_rep[comp] = replaced_at.groupby(comp_rep['machineID']).ffill()

# Rows from before 2015 were only needed to seed the forward fill.
comp_rep = comp_rep.loc[comp_rep['datetime'] > pd.to_datetime('2015-01-01')]

In [None]:
# Replace each component column, which holds the time of the last
# replacement, with the elapsed time since then expressed in days
# (timedelta divided by one day gives a float).
for comp in components:
    comp_rep[comp] = (comp_rep["datetime"] - pd.to_datetime(comp_rep[comp])) / np.timedelta64(1, "D") 

comp_rep.describe()

In [None]:
comp_rep.head()

### Machine Features: Descriptive Statistics about the Machine

In [None]:
# Assemble the final feature table. The telemetry features are the base
# table; error counts and days-since-replacement are left-joined on
# (datetime, machineID), and the static machine metadata is left-joined on
# machineID alone.
final_feat = telemetry_feat.merge(
    error_count, on=['datetime', 'machineID'], how='left')
final_feat = final_feat.merge(
    comp_rep, on=['datetime', 'machineID'], how='left')
final_feat = final_feat.merge(machines_df, on=['machineID'], how='left')

final_feat.head()

In [None]:
final_feat.describe()

# Label Construction

In [None]:
# Attach the failure label to the feature row recorded at the failure time.
labeled_features = final_feat.merge(
    failures_df, on=['datetime', 'machineID'], how='left')

# Feature rows are 3 hours apart, so back-filling the label over up to 7
# preceding rows marks the 24 hours leading up to each failure.
# The fill is done per machine and on the label column only:
#   * a failure early in one machine's history must not label the last rows
#     of the machine that precedes it in the frame;
#   * feature columns must not be back-filled or overwritten with 'none'.
labeled_features['failure'] = (
    labeled_features.groupby('machineID')['failure']
    .bfill(limit=7)
    .fillna('none')
)
labeled_features.head()

In [None]:
labeled_features.loc[labeled_features['failure'] == 'comp4'][:16]

# Modelling

In [None]:
# Each pair is (last training timestamp, first test timestamp).
threshold_dates = [
    [pd.to_datetime(train_end), pd.to_datetime(test_start)]
    for train_end, test_start in (
        ('2015-07-31 01:00:00', '2015-08-01 01:00:00'),
        ('2015-08-31 01:00:00', '2015-09-01 01:00:00'),
        ('2015-09-30 01:00:00', '2015-10-01 01:00:00'),
    )
]

test_results, anai_models, train_dfs = [], [], []
for last_train_date, first_test_date in threshold_dates:
    print('Training on %s to %s' % (last_train_date, first_test_date))

    # Everything strictly before the cut-off belongs to this training set.
    history = labeled_features.loc[labeled_features['datetime'] < last_train_date]
    train_y = history['failure']
    # Identifiers and the label are removed; remaining categorical columns
    # are one-hot encoded.
    train_X = pd.get_dummies(
        history.drop(columns=['datetime', 'machineID', 'failure']))

    df = pd.concat([train_X, train_y], axis=1)
    train_dfs.append(df)


In [None]:
# Each pair is the (start, end) of one test window.
threshold_dates = [[pd.to_datetime('2015-10-01 01:00:00'), pd.to_datetime('2015-10-31 01:00:00')],
                   [pd.to_datetime('2015-11-01 01:00:00'),
                    pd.to_datetime('2015-11-30 01:00:00')],
                   [pd.to_datetime('2015-12-01 01:00:00'), pd.to_datetime('2016-01-01 01:00:00')]]


test_dfs = []
for test_start, test_end in threshold_dates:
    print('Testing on %s to %s' % (test_start, test_end))

    # Keep only the rows inside the window [test_start, test_end). Selecting
    # everything *before* the start date would hand the model the very rows
    # it was trained on and make the evaluation meaningless.
    in_window = ((labeled_features['datetime'] >= test_start)
                 & (labeled_features['datetime'] < test_end))
    window = labeled_features.loc[in_window]

    test_y = window['failure']
    # NOTE(review): dummies are built per window; if a categorical value is
    # absent from a window the columns will differ from the training frame.
    test_X = pd.get_dummies(
        window.drop(['datetime', 'machineID', 'failure'], axis=1))
    df = pd.concat([test_X, test_y], axis=1)
    test_dfs.append(df)

# NOTE: after the loop, test_X and test_y hold the LAST window; the
# evaluation cells further down read these two names.


In [None]:
train_dfs[0].head()

In [None]:
ai1 = anai.run(target = 'failure', df = train_dfs[0], predictor = ['xgb', 'cat','lgbm', 'gbc', 'rfc'], ensemble = False)

In [None]:
ai1.explain('shap')

In [None]:
ai1.explain('perm')

In [None]:
test_dfs[0]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Initialize the encoder
label_encoder = LabelEncoder()

# Encode the failure labels of the test set as integers.
# NOTE(review): test_y is whatever the last iteration of the test-split loop
# left behind, not test_dfs[0] (shown in the cell above) - confirm which
# window is meant to be evaluated.
# NOTE(review): the encoder is fitted on the test labels only; comparing the
# result with ai1.predict() presumably requires anai to use the same
# class-to-integer mapping - TODO confirm.
y_test_numeric = label_encoder.fit_transform(test_y)

# Display the mapping
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_mapping)

# Peek at the first few encoded labels.
print(y_test_numeric[:5])

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Make predictions
y_pred = ai1.predict(test_X)  # Predicted class labels

# Classification Report (per-class precision, recall and F1)
print("Classification Report:")
print(classification_report(y_test_numeric, y_pred))

# Accuracy Score (share of rows whose predicted class equals the true class)
print("Accuracy Score:", accuracy_score(y_test_numeric, y_pred))


In [None]:
import numpy as np
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

# Generate predictions
y_pred = ai1.predict(test_X)  # Predicted class labels

# Use every class that occurs in either the true or the predicted labels.
# confusion_matrix sizes its output from that union, so deriving the axis
# labels from y_test alone can leave the heatmap with more rows/columns than
# tick labels when the model predicts a class absent from the test set.
labels = list(unique_labels(y_test_numeric, y_pred))

# Create Confusion Matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test_numeric, y_pred, labels=labels)

# Plot confusion matrix as a heatmap in Plotly
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=labels,  # Predicted labels
    y=labels,  # Actual labels
    colorscale='Blues',
    hoverongaps=False,
    texttemplate='%{z}',
    textfont={"size": 10},
    colorbar=dict(title="Count")
))

fig.update_layout(
    title="Confusion Matrix",
    xaxis=dict(title="Predicted Label"),
    yaxis=dict(title="Actual Label"),
    template="plotly_dark"
)

fig.show()

In [None]:
ai1.result()

In [None]:
# Identify the best model
print("Best Classifier Name:", ai1.classifier_name)  # Name of the best classifier
print("Best Classifier Details:", ai1.best_classifier)  # Details about the best classifier
print("Classifier Predictor:", ai1.predictor)