# MetroPT-3 Dataset

### Making MetroPT3.csv readable in Google Colab from Google Drive

In [1]:
# Give the Google Colab runtime access to MetroPT3.csv stored on Google Drive
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user, then hand the application-default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Look the file up by its Google Drive file id and save its contents locally as
# 'MetroPT3.csv', the path the loading cell reads from.
downloaded = drive.CreateFile({'id':'1hhqV3s6HEzbw8bdC6zZPqKqAJu7p5SMz'})
downloaded.GetContentFile('MetroPT3.csv')



### Importing the necessary libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

### Instructions for making the target from given dataset

In [3]:
# Fail times
# Nr.     Start Time	          End Time	   Failure	      Severity	         Report
#1	4/18/2020 0:00	   4/18/2020 23:59       Air leak	    High stress
#2	5/29/2020 23:30     5/30/2020 6:00	    Air Leak	    High stress	Maintenance on 30Apr at 12:00
#3	6/5/2020 10:00	   6/7/2020 14:30	        Air Leak	    High stress	Maintenance on 8Jun at 16:00

#4	7/15/2020 14:30     7/15/2020 19:00      Air Leak	    High stress	Maintenance on 16Jul at 00:00

### Reading the csv as a dataframe then setting the 'failure' column as 0

In [4]:
# Load the raw sensor log; train_data stays as the untouched contents of the CSV.
train_data = pd.read_csv('MetroPT3.csv')
# Derive the working frame with a default 'failure' label of 0 (no failure); the
# labelling cell flips the rows inside the reported failure windows to 1.
# assign() returns a new frame, so the label column is never written into
# train_data (wrapping it in pd.DataFrame(...) could share storage with it).
df = train_data.assign(failure=0)
df


Unnamed: 0.1,Unnamed: 0,timestamp,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Pressure_switch,Oil_level,Caudal_impulses,failure
0,0,2020-02-01 00:00:00,-0.012,9.358,9.340,-0.024,9.358,53.600,0.0400,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0
1,10,2020-02-01 00:00:10,-0.014,9.348,9.332,-0.022,9.348,53.675,0.0400,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0
2,20,2020-02-01 00:00:19,-0.012,9.338,9.322,-0.022,9.338,53.600,0.0425,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0
3,30,2020-02-01 00:00:29,-0.012,9.328,9.312,-0.022,9.328,53.425,0.0400,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0
4,40,2020-02-01 00:00:39,-0.012,9.318,9.302,-0.022,9.318,53.475,0.0400,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1516943,15169430,2020-09-01 03:59:10,-0.014,8.918,8.906,-0.022,8.918,59.675,0.0425,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0
1516944,15169440,2020-09-01 03:59:20,-0.014,8.904,8.888,-0.020,8.904,59.600,0.0450,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0
1516945,15169450,2020-09-01 03:59:30,-0.014,8.890,8.876,-0.022,8.892,59.600,0.0425,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0
1516946,15169460,2020-09-01 03:59:40,-0.012,8.876,8.864,-0.022,8.878,59.550,0.0450,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0


### Making target - 'failure' column

In [5]:
# Flag every reading whose timestamp falls inside one of the reported failure
# windows by setting its 'failure' value to 1 (both window ends are inclusive).
df['timestamp'] = pd.to_datetime(df['timestamp'])
start_timestamp_1 = pd.to_datetime('2020-04-18 00:00:00')
end_timestamp_1 = pd.to_datetime('2020-04-18 23:59:00')
start_timestamp_2 = pd.to_datetime('2020-05-29 23:30:00')
end_timestamp_2 = pd.to_datetime('2020-05-30 06:00:00')
start_timestamp_3 = pd.to_datetime('2020-06-05 10:00:00')
end_timestamp_3 = pd.to_datetime('2020-06-07 14:30:00')
start_timestamp_4 = pd.to_datetime('2020-07-15 14:30:00')
end_timestamp_4 = pd.to_datetime('2020-07-15 19:00:00')

failure_windows = [
    (start_timestamp_1, end_timestamp_1),
    (start_timestamp_2, end_timestamp_2),
    (start_timestamp_3, end_timestamp_3),
    (start_timestamp_4, end_timestamp_4),
]

# Build one boolean mask over the whole column instead of visiting the rows one
# by one in Python. The mask is aligned on df's own index, so the labelling no
# longer depends on the frame having a default 0..n-1 index.
in_failure_window = pd.Series(False, index=df.index)
for window_start, window_end in failure_windows:
    in_failure_window |= df['timestamp'].between(window_start, window_end)

df.loc[in_failure_window, 'failure'] = 1



### Checking results in a dataframe

In [6]:
# Keep only the rows labelled as failures and preview the first few of them.
is_failure = df['failure'].eq(1)
result_df = df.loc[is_failure]
result_df.head()

Unnamed: 0.1,Unnamed: 0,timestamp,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Pressure_switch,Oil_level,Caudal_impulses,failure
562564,5625640,2020-04-18 00:00:01,-0.018,8.248,8.238,-0.024,8.248,49.45,0.04,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1
562565,5625650,2020-04-18 00:00:13,-0.018,8.248,8.238,-0.024,8.248,49.45,0.04,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1
562566,5625660,2020-04-18 00:00:24,-0.018,8.248,8.238,-0.024,8.248,49.45,0.04,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1
562567,5625670,2020-04-18 00:00:36,-0.018,8.248,8.238,-0.024,8.248,49.45,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
562568,5625680,2020-04-18 00:00:49,-0.018,8.248,8.238,-0.024,8.248,49.45,0.04,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1


### Splitting data into testing and training

In [7]:
# Sensor channels used as model inputs; the timestamp and the CSV's index
# column are deliberately left out.
feature_columns = [
    'TP2', 'TP3', 'H1', 'DV_pressure', 'Reservoirs', 'Oil_temperature',
    'Motor_current', 'COMP', 'DV_eletric', 'Towers', 'MPG', 'LPS',
    'Pressure_switch', 'Oil_level', 'Caudal_impulses',
]
x = df[feature_columns]  # Features
y = df['failure']  # Target

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): this is a random row-level split of time-ordered readings, so
# neighbouring samples from the same failure window can land in both train and
# test - consider a chronological split if the scores look too optimistic.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Display the training feature matrix as a quick visual check of the split.
x_train

Unnamed: 0,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Pressure_switch,Oil_level,Caudal_impulses
65256,-0.008,8.878,8.862,-0.016,8.880,58.250,0.0350,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
1409954,-0.010,9.176,9.162,-0.020,9.176,65.575,0.0400,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
674985,-0.012,9.670,9.656,-0.022,9.668,62.475,3.8200,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
160529,-0.012,8.864,8.848,-0.022,8.864,52.450,0.0400,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
730739,5.792,7.442,-0.008,1.732,7.444,69.125,4.8600,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,8.904,8.474,-0.016,-0.024,8.474,69.650,5.7375,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1414414,-0.010,9.630,9.616,-0.020,9.632,67.450,3.8050,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
131932,-0.014,8.958,8.942,-0.016,8.960,57.075,0.0375,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
671155,-0.016,8.752,8.740,-0.022,8.752,52.900,0.0400,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0


In [9]:
# Display the training labels; they share the row index of the training features.
y_train

65256      0
1409954    0
674985     0
160529     0
730739     0
          ..
259178     0
1414414    0
131932     0
671155     0
121958     0
Name: failure, Length: 1213558, dtype: int64

# Model 1 - sklearn Logistic Regression

### Training

In [10]:
# Logistic-regression baseline: max_iter=1000 caps the solver iterations and
# random_state=23 fixes the seed.
logistic_regression = LogisticRegression(max_iter=1000, random_state=23)
# fit() returns the estimator itself; rebinding the name keeps the cell from
# echoing the estimator as output.
logistic_regression = logistic_regression.fit(x_train, y_train)

### Predicting

In [11]:
# Class predictions on the held-out split, reused by the evaluation cell.
y_pred = logistic_regression.predict(x_test)

# Mean accuracy of the model on the held-out split.
test_accuracy = logistic_regression.score(x_test, y_test)
print(test_accuracy)

0.991061010580441


### Evaluation

In [12]:
# Score the logistic-regression predictions against the held-out labels.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report the scalar metrics one per line, then the confusion matrix.
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(label, value)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.991061010580441
Precision: 0.7590118085767558
Recall: 0.8079722130334105
F1-score: 0.7827271270629708
Confusion Matrix:
 [[295793   1551]
 [  1161   4885]]


# Model-2 XGBoost    &    Model-3 LightGBM

### Training

In [13]:
# XGBoost classifier with default hyperparameters and a fixed seed
model_xgb = xgb.XGBClassifier(random_state=42)
model_xgb.fit(x_train, y_train)

# LightGBM classifier with default hyperparameters and a fixed seed
model_lgb = lgb.LGBMClassifier(random_state=42)
model_lgb.fit(x_train, y_train)

# With a dataset this large, the sklearn model needed an explicit iteration limit;
# XGBoost and LightGBM are used here because they handle large datasets.
[LightGBM] [Info] Number of positive: 23908, number of negative: 1189650
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.218702 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1801
[LightGBM] [Info] Number of data points in the train set: 1213558, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019701 -> initscore=-3.907201
[LightGBM] [Info] Start training from score -3.907201


### Predicting

In [14]:
# Hard class predictions on the held-out split, one array per boosted model
# (XGBoost first, then LightGBM).
y_pred_xgb, y_pred_lgb = (
    booster.predict(x_test) for booster in (model_xgb, model_lgb)
)

### Evaluation

In [15]:
def _score_predictions(y_true, y_hat):
    """Score hard 0/1 predictions against the true labels.

    Returns a tuple ``(accuracy, precision, recall, f1, confusion_matrix)``.
    """
    return (
        accuracy_score(y_true, y_hat),
        precision_score(y_true, y_hat),
        recall_score(y_true, y_hat),
        f1_score(y_true, y_hat),
        confusion_matrix(y_true, y_hat),
    )


# Model 2 - XGBoost
accuracy_xgb, precision_xgb, recall_xgb, f1_xgb, conf_matrix_xgb = _score_predictions(y_test, y_pred_xgb)
# ROC AUC ranks samples by score, so it is computed from the predicted probability
# of the positive class (column 1) rather than from the thresholded 0/1 labels.
roc_auc_xgb = roc_auc_score(y_test, model_xgb.predict_proba(x_test)[:, 1])
print('Model 2')
print("Accuracy:", accuracy_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("F1-score:", f1_xgb)
print("Confusion Matrix (XGBoost):\n",conf_matrix_xgb)
print('\n')

# Model 3 - LightGBM
accuracy_lgb, precision_lgb, recall_lgb, f1_lgb, conf_matrix_lgb = _score_predictions(y_test, y_pred_lgb)
# Same as above: score-based ROC AUC from the positive-class probability.
roc_auc_lgb = roc_auc_score(y_test, model_lgb.predict_proba(x_test)[:, 1])
print('model 3')
print("Accuracy:", accuracy_lgb)
print("Precision:", precision_lgb)
print("Recall:", recall_lgb)
print("F1-score:", f1_lgb)
print("Confusion Matrix (LightGBM):\n",conf_matrix_lgb)

Model 2
Accuracy: 0.9993308942285507
Precision: 0.9812222039202767
Recall: 0.9852795236520013
F1-score: 0.9832466782206817
Confusion Matrix (XGBoost):
 [[297230    114]
 [    89   5957]]


model 3
Accuracy: 0.9994001120669765
Precision: 0.9841479524438573
Recall: 0.9857757194839564
F1-score: 0.9849611634440589
Confusion Matrix (LightGBM):
 [[297248     96]
 [    86   5960]]


# Model 4 - SVM

### Scaling the training and test data

In [16]:
# Learn the per-feature mean and standard deviation from the training split only,
# then apply that same transformation to both splits.
scaler = StandardScaler().fit(x_train)
x_train_scaled, x_test_scaled = scaler.transform(x_train), scaler.transform(x_test)


### Training

In [17]:
# Linear SVM trained on the standardized features. The seed is fixed like the
# other models in this notebook so the fit can be reproduced across runs
# (random_state only has an effect when the dual solver is used).
linear_svm_model = LinearSVC(random_state=42)
linear_svm_model.fit(x_train_scaled, y_train)



### Prediction

In [18]:
# Predict on the scaled test features - the model was fitted on scaled inputs,
# so the unscaled x_test must not be used here.
y_pred_linear_svm = linear_svm_model.predict(x_test_scaled)

### Evaluation

In [19]:
# Score the linear-SVM predictions against the held-out labels.
y_true_svm, y_hat_svm = y_test, y_pred_linear_svm

accuracy_linear_svm = accuracy_score(y_true_svm, y_hat_svm)
precision_linear_svm = precision_score(y_true_svm, y_hat_svm)
recall_linear_svm = recall_score(y_true_svm, y_hat_svm)
f1_linear_svm = f1_score(y_true_svm, y_hat_svm)
confusion_matrix_linear_svm = confusion_matrix(y_true_svm, y_hat_svm)

# Print each metric under its heading, in the same order as the other models.
svm_report = (
    ("Accuracy:", accuracy_linear_svm),
    ("Precision:", precision_linear_svm),
    ("Recall :", recall_linear_svm),
    ("F1 Score:", f1_linear_svm),
    ("Confusion Matrix:\n", confusion_matrix_linear_svm),
)
for heading, value in svm_report:
    print(heading, value)

Accuracy: 0.9902040278189789
Precision: 0.7219815135759676
Recall : 0.8268276546477009
F1 Score: 0.7708558211256745
Confusion Matrix:
 [[295419   1925]
 [  1047   4999]]
