In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
file_path= '../Dataset/Refined_Dataset.csv'
df = pd.read_csv(file_path)

In [3]:
df.head()

Unnamed: 0,Machine,Component,Parameter,Value,Probability of Failure,Status
0,Excavator_1,Engine,Temparature,104.0,0.1,Low Risk
1,Articulated_Truck_1,Fuel,Water in Fuel,70.0,0.1,Low Risk
2,Articulated_Truck_1,Engine,Temparature,104.0,0.1,Low Risk
3,Backhoe_Loader_1,Drive,Transmission Pressure,231.0,0.155,Low Risk
4,Excavator_1,Engine,Oil Pressure,39.0,0.1,Low Risk


In [4]:
df.isnull().sum()

Machine                   0
Component                 0
Parameter                 0
Value                     0
Probability of Failure    0
Status                    0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Machine                 1000 non-null   object 
 1   Component               1000 non-null   object 
 2   Parameter               1000 non-null   object 
 3   Value                   1000 non-null   float64
 4   Probability of Failure  1000 non-null   float64
 5   Status                  1000 non-null   object 
dtypes: float64(2), object(4)
memory usage: 47.0+ KB


In [6]:
df.duplicated().sum()

187

In [7]:
df['Status'].value_counts()

Status
Low Risk       771
High Risk      159
Medium Risk     70
Name: count, dtype: int64

In [8]:
df.describe()

Unnamed: 0,Value,Probability of Failure
count,1000.0,1000.0
mean,273.115148,0.836531
std,472.142807,1.923236
min,0.0,0.0
25%,12.422067,0.1
50%,61.0,0.1
75%,286.25,0.262667
max,1870.0,9.0


In [9]:
from sklearn.utils import resample

# 1. Separate the classes
low_risk = df[df['Status'] == 'Low Risk']
medium_risk = df[df['Status'] == 'Medium Risk']
high_risk = df[df['Status'] == 'High Risk']

# 2. Downsample the low-risk class to reduce its size slightly
low_risk_downsampled = resample(low_risk, 
                                replace=False,    # don't resample with replacement
                                n_samples=350,    # set the desired number of samples (adjust as needed)
                                random_state=42)  # set a random state for reproducibility

# 3. Upsample the high-risk and medium-risk classes to increase their size
medium_risk_upsampled = resample(medium_risk, 
                                 replace=True,     # resample with replacement
                                 n_samples=280,    # set the desired number of samples
                                 random_state=42)

high_risk_upsampled = resample(high_risk, 
                               replace=True,     # resample with replacement
                               n_samples=200,    # set the desired number of samples
                               random_state=42)

# 4. Combine the resampled dataframes back together
df_resampled = pd.concat([low_risk_downsampled, medium_risk_upsampled, high_risk_upsampled])

# 5. Shuffle the combined dataframe to mix up the order of samples
df_resampled = df_resampled.sample(frac=1, random_state=42).reset_index(drop=True)

# 6. Check the distribution of the 'Status' column after resampling
print(df_resampled['Status'].value_counts())

Status
Low Risk       350
Medium Risk    280
High Risk      200
Name: count, dtype: int64


### Data Preprocessing

In [10]:
#Splitting the Dataset into Dependent Variable and Independent Variable
X= df.iloc[:,:-1]
y=df.iloc[:,-1]

In [11]:
X

Unnamed: 0,Machine,Component,Parameter,Value,Probability of Failure
0,Excavator_1,Engine,Temparature,104.0,0.100
1,Articulated_Truck_1,Fuel,Water in Fuel,70.0,0.100
2,Articulated_Truck_1,Engine,Temparature,104.0,0.100
3,Backhoe_Loader_1,Drive,Transmission Pressure,231.0,0.155
4,Excavator_1,Engine,Oil Pressure,39.0,0.100
...,...,...,...,...,...
995,Asphalt_Paver_1,Misc,Exhaust Gas Temparature,263.0,0.100
996,Asphalt_Paver_1,Fuel,Level,7.0,6.000
997,Asphalt_Paver_1,Misc,Air Filter Pressure,23.0,0.150
998,Asphalt_Paver_1,Misc,Air Filter Pressure,26.0,0.300


In [12]:
y

0       Low Risk
1       Low Risk
2       Low Risk
3       Low Risk
4       Low Risk
         ...    
995     Low Risk
996    High Risk
997     Low Risk
998     Low Risk
999     Low Risk
Name: Status, Length: 1000, dtype: object

In [13]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1, 0, 2, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 2, 0, 0, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,

In [14]:
encoded_X = pd.get_dummies(X[['Machine', 'Component', 'Parameter']])
X = pd.concat([X, encoded_X], axis=1)
X.drop(['Machine', 'Component', 'Parameter'], axis=1, inplace=True)

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
X_train

Unnamed: 0,Value,Probability of Failure,Machine_Articulated_Truck_1,Machine_Asphalt_Paver_1,Machine_Backhoe_Loader_1,Machine_Dozer_1,Machine_Excavator_1,Component_Drive,Component_Engine,Component_Fuel,...,Parameter_Hydraulic Pump Rate,Parameter_Level,Parameter_Oil Pressure,Parameter_Pedal Sensor,Parameter_Pressure,Parameter_Speed,Parameter_System Voltage,Parameter_Temparature,Parameter_Transmission Pressure,Parameter_Water in Fuel
541,3.900000,0.170213,True,False,False,False,False,True,False,False,...,False,False,False,True,False,False,False,False,False,False
440,212.000000,0.100000,False,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,True,False,False
482,4.827449,0.027117,False,False,False,False,True,True,False,False,...,False,False,False,True,False,False,False,False,False,False
422,1737.000000,0.100000,False,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True
778,26.000000,0.300000,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,1760.000000,0.022222,False,False,False,False,True,False,True,False,...,False,False,False,False,False,True,False,False,False,False
270,14.996015,0.000266,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
860,58.000000,0.100000,False,True,False,False,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
435,8.000000,7.000000,False,False,False,False,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [17]:
#Normalizing the data
from sklearn.preprocessing import StandardScaler #sklearn library for standardizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) # Fit and transform the training data

In [18]:
X_train_scaled

array([[-0.56944383, -0.33427305,  2.        , ..., -0.39626354,
        -0.30338702, -0.26521009],
       [-0.1243048 , -0.3719141 , -0.5       , ...,  2.52357307,
        -0.30338702, -0.26521009],
       [-0.56745996, -0.41098677, -0.5       , ..., -0.39626354,
        -0.30338702, -0.26521009],
       ...,
       [-0.45372052, -0.3719141 , -0.5       , ..., -0.39626354,
        -0.30338702, -0.26521009],
       [-0.56067367,  3.32717436, -0.5       , ..., -0.39626354,
        -0.30338702, -0.26521009],
       [-0.52217054, -0.26469414, -0.5       , ..., -0.39626354,
        -0.30338702, -0.26521009]])

In [19]:
# Importing the models from the sklearn library
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [20]:
models={
    'randomforest':RandomForestClassifier(),
    'bayes': GaussianNB(),
    'svc': SVC(),
    'knn': KNeighborsClassifier()
}
def evaluate_model(X_train, y_train, X_test, y_test, models):
    report = {}
    for model_name, model in models.items():
        # Training the model
        model.fit(X_train, y_train)
        # Predicting with the model
        y_pred = model.predict(X_test)
        # Evaluating the model
        test_model_score = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        report[model_name] = f'R2-Score:- {test_model_score}---Classification Report:- {report}'
    return report

In [21]:
evaluate_model(X_train_scaled,y_train,X_test_scaled,y_test,models)

NameError: name 'y_train_temp' is not defined