# Ai4i2020 - Predictive maintenance

####  Problem Statement : To predict the Air temperature when there is machine failure in industry due to following failure mode
    - tool wear failure (TWF)
    - heat dissipation failure (HDF)
    - power failure (PWF)
    - verstrain failure (OSF)
    - random failures (RNF)


## Import reqired libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge,Lasso,LassoCV, RidgeCV, LarsCV,ElasticNet,ElasticNetCV
import statsmodels.api as sm
from datetime import datetime
import logging
import pickle
import os
os.chdir(os.getcwd())
logging.basicConfig(filename='Ai4i_2020.log',level = logging.INFO,force = True)
logging.info("Ai4i 2020 Predictive Maintenance")

## EDA

In [2]:
df = pd.read_csv("ai4i2020.csv")

In [3]:
df

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0


In [4]:
pf = pp.ProfileReport(df)
pd_f = pf.to_widgets()
pf.to_file("AI4I_Profiling.html")

Summarize dataset:   0%|          | 0/27 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Data Preprocessing

In [5]:
df = df.drop(['UDI','Product ID','Type','Machine failure'], axis = 1)
df.columns = df.columns.str.replace('[',"",regex=False)
df.columns = df.columns.str.replace(']',"",regex=False)
df.columns = df.columns.str.replace(' ',"",regex=False)

In [6]:
df

Unnamed: 0,AirtemperatureK,ProcesstemperatureK,Rotationalspeedrpm,TorqueNm,Toolwearmin,TWF,HDF,PWF,OSF,RNF
0,298.1,308.6,1551,42.8,0,0,0,0,0,0
1,298.2,308.7,1408,46.3,3,0,0,0,0,0
2,298.1,308.5,1498,49.4,5,0,0,0,0,0
3,298.2,308.6,1433,39.5,7,0,0,0,0,0
4,298.2,308.7,1408,40.0,9,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
9995,298.8,308.4,1604,29.5,14,0,0,0,0,0
9996,298.9,308.4,1632,31.8,17,0,0,0,0,0
9997,299.0,308.6,1645,33.4,22,0,0,0,0,0
9998,299.0,308.7,1408,48.5,25,0,0,0,0,0


In [72]:
df.to_csv("Cleaned_ai4i.csv")

## Separate independent and target variable

In [7]:
X = df.drop("AirtemperatureK", axis=1)

In [8]:
X

Unnamed: 0,ProcesstemperatureK,Rotationalspeedrpm,TorqueNm,Toolwearmin,TWF,HDF,PWF,OSF,RNF
0,308.6,1551,42.8,0,0,0,0,0,0
1,308.7,1408,46.3,3,0,0,0,0,0
2,308.5,1498,49.4,5,0,0,0,0,0
3,308.6,1433,39.5,7,0,0,0,0,0
4,308.7,1408,40.0,9,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
9995,308.4,1604,29.5,14,0,0,0,0,0
9996,308.4,1632,31.8,17,0,0,0,0,0
9997,308.6,1645,33.4,22,0,0,0,0,0
9998,308.7,1408,48.5,25,0,0,0,0,0


In [9]:
y = df['AirtemperatureK']

In [10]:
y

0       298.1
1       298.2
2       298.1
3       298.2
4       298.2
        ...  
9995    298.8
9996    298.9
9997    299.0
9998    299.0
9999    299.0
Name: AirtemperatureK, Length: 10000, dtype: float64

## Data PreProcessing - Standardization

In [11]:
scaler = StandardScaler()
arr = scaler.fit_transform(X)

In [12]:
arr

array([[-0.94735989,  0.06818514,  0.28219976, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [-0.879959  , -0.72947151,  0.63330802, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [-1.01476077, -0.22744984,  0.94428963, ..., -0.09793424,
        -0.09948362, -0.04363046],
       ...,
       [-0.94735989,  0.59251888, -0.66077672, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [-0.879959  , -0.72947151,  0.85400464, ..., -0.09793424,
        -0.09948362, -0.04363046],
       [-0.879959  , -0.2162938 ,  0.02137647, ..., -0.09793424,
        -0.09948362, -0.04363046]])

In [13]:
df1 = pd.DataFrame(arr)

In [14]:
df1.columns = X.columns
df1

Unnamed: 0,ProcesstemperatureK,Rotationalspeedrpm,TorqueNm,Toolwearmin,TWF,HDF,PWF,OSF,RNF
0,-0.947360,0.068185,0.282200,-1.695984,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
1,-0.879959,-0.729472,0.633308,-1.648852,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
2,-1.014761,-0.227450,0.944290,-1.617430,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
3,-0.947360,-0.590021,-0.048845,-1.586009,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
4,-0.879959,-0.729472,0.001313,-1.554588,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
...,...,...,...,...,...,...,...,...,...
9995,-1.082162,0.363820,-1.052012,-1.476034,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
9996,-1.082162,0.520005,-0.821283,-1.428902,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
9997,-0.947360,0.592519,-0.660777,-1.350349,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
9998,-0.879959,-0.729472,0.854005,-1.303217,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363


In [15]:
df1.describe()

Unnamed: 0,ProcesstemperatureK,Rotationalspeedrpm,TorqueNm,Toolwearmin,TWF,HDF,PWF,OSF,RNF
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,3.841483e-15,-2.554568e-16,5.259244e-16,4.147793e-17,1.447314e-16,3.394757e-16,2.058548e-16,4.610007e-16,1.07535e-15
std,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005,1.00005
min,-2.901986,-2.068196,-3.630149,-1.695984,-0.06797983,-0.10786,-0.09793424,-0.09948362,-0.04363046
25%,-0.8125581,-0.6458012,-0.6808401,-0.8633176,-0.06797983,-0.10786,-0.09793424,-0.09948362,-0.04363046
50%,0.0636534,-0.1995597,0.01134481,0.0007698234,-0.06797983,-0.10786,-0.09793424,-0.09948362,-0.04363046
75%,0.7376623,0.4084443,0.6834663,0.8491466,-0.06797983,-0.10786,-0.09793424,-0.09948362,-0.04363046
max,2.557486,7.51484,3.672902,2.278819,14.71024,9.271274,10.21093,10.05191,22.91977


## Check Multicolliarity

In [16]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [17]:
vif = pd.DataFrame()
vif['vif_factor'] =  [variance_inflation_factor(df1.values, i) for i in range(df1.shape[1])]
vif['vif_predictors'] = X.columns
vif

Unnamed: 0,vif_factor,vif_predictors
0,1.004777,ProcesstemperatureK
1,5.154221,Rotationalspeedrpm
2,5.222899,TorqueNm
3,1.039904,Toolwearmin
4,1.015584,TWF
5,1.024976,HDF
6,1.212152,PWF
7,1.082597,OSF
8,1.002015,RNF


- we found multicollinearity in MAchinefailure column VIF > 10, dropping feature "Machinefailure"

## Splitting the data

In [18]:
x_train, x_test, y_train, y_test = train_test_split(df1,y,test_size=0.2,random_state=100)

In [19]:
x_train

Unnamed: 0,ProcesstemperatureK,Rotationalspeedrpm,TorqueNm,Toolwearmin,TWF,HDF,PWF,OSF,RNF
8369,-0.205950,2.310549,-1.944830,-1.303217,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
9722,-0.138549,2.076272,-1.844513,0.283562,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
6950,0.872464,-0.919124,1.315461,-0.659079,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
1919,-1.351765,0.665033,-1.001853,-1.648852,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
5713,1.276869,0.319196,-0.199320,0.754883,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
...,...,...,...,...,...,...,...,...,...
350,-1.149563,-0.774096,0.432675,-1.020424,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
79,-0.745157,-0.762940,-0.209352,-1.664562,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
8039,1.276869,0.118387,-0.821283,-1.366059,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
6936,0.737662,-0.378056,0.071535,-1.287506,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363


In [20]:
x_test

Unnamed: 0,ProcesstemperatureK,Rotationalspeedrpm,TorqueNm,Toolwearmin,TWF,HDF,PWF,OSF,RNF
8018,1.276869,1.953555,-1.653912,1.414731,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
9225,-0.610355,1.568672,-1.643880,0.189298,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
3854,0.670261,-1.119933,2.770053,1.351888,-0.06798,-0.10786,10.210933,10.051906,-0.04363
2029,-1.014761,0.720813,-0.480207,-0.486261,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
3539,0.670261,0.821218,-0.751062,-1.303217,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
...,...,...,...,...,...,...,...,...,...
6923,1.142068,-0.590021,0.934258,1.540417,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
1207,-1.284364,0.190902,-0.510302,-0.234890,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
7960,1.479072,-0.266496,0.262136,-0.879028,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363
2339,-1.216963,-0.405946,0.131725,-1.570298,-0.06798,-0.10786,-0.097934,-0.099484,-0.04363


In [21]:
y_train

8369    298.8
9722    298.8
6950    300.7
1919    298.2
5713    302.2
        ...  
350     297.6
79      298.8
8039    300.8
6936    300.6
5640    302.6
Name: AirtemperatureK, Length: 8000, dtype: float64

In [22]:
y_test

8018    301.0
9225    298.0
3854    302.4
2029    298.7
3539    302.0
        ...  
6923    301.2
1207    297.0
7960    301.1
2339    299.2
6637    301.5
Name: AirtemperatureK, Length: 2000, dtype: float64

## Training the model

In [23]:
lr = LinearRegression()
lr.fit(x_train,y_train)

LinearRegression()

In [24]:
lr.intercept_

300.0071494966973

In [25]:
lr.coef_

array([ 1.73515458,  0.01990572, -0.00698736,  0.00813639,  0.00951657,
        0.17966539,  0.01333701, -0.01425019, -0.00286809])

## Save linear model

In [26]:
pickle.dump(lr,open('Ai4i_predictive_Maintenance.pickle','wb'))

In [27]:
lm = pickle.load(open('Ai4i_predictive_Maintenance.pickle','rb'))

In [28]:
lr_score = lm.score(x_test,y_test)
lr_score

0.7965460964724072

In [29]:
df

Unnamed: 0,AirtemperatureK,ProcesstemperatureK,Rotationalspeedrpm,TorqueNm,Toolwearmin,TWF,HDF,PWF,OSF,RNF
0,298.1,308.6,1551,42.8,0,0,0,0,0,0
1,298.2,308.7,1408,46.3,3,0,0,0,0,0
2,298.1,308.5,1498,49.4,5,0,0,0,0,0
3,298.2,308.6,1433,39.5,7,0,0,0,0,0
4,298.2,308.7,1408,40.0,9,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
9995,298.8,308.4,1604,29.5,14,0,0,0,0,0
9996,298.9,308.4,1632,31.8,17,0,0,0,0,0
9997,299.0,308.6,1645,33.4,22,0,0,0,0,0
9998,299.0,308.7,1408,48.5,25,0,0,0,0,0


In [30]:
test1 = scaler.transform([[308.6,1551,42.8,0,0,0,0,0,0]])
test1

array([[-0.94735989,  0.06818514,  0.28219976, -1.69598374, -0.06797983,
        -0.10786004, -0.09793424, -0.09948362, -0.04363046]])

In [31]:
lr.predict(test1)

array([298.3291309])

In [32]:
test2 = scaler.transform([[308.7,1500,40.2,30,0,0,0,0,0]])
test2

array([[-0.879959  , -0.2162938 ,  0.02137647, -1.22466331, -0.06797983,
        -0.10786004, -0.09793424, -0.09948362, -0.04363046]])

In [33]:
lm.predict(test2)

array([298.44607641])

## Checking for Adj - r2

In [34]:
def adj_r2(x,y,lr):
    r2 = lr.score(x,y)
    n = x.shape[0]
    p = x.shape[1]
    adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adj_r2

adj_r2_lr = adj_r2(x_test,y_test,lr)

In [35]:
adj_r2_lr

0.7956259531901215

## Lasso Regularization

In [36]:
lassocv = LarsCV(cv=10,max_iter=10000,normalize=True)
lassocv.fit(x_train,y_train)

LarsCV(cv=10, max_iter=10000)

In [37]:
lassocv.alpha_

0.0001204521111487555

In [38]:
lasso = Lasso(alpha=lassocv.alpha_)
lasso.fit(x_train,y_train)

Lasso(alpha=0.0001204521111487555)

In [39]:
lasso.score(x_test,y_test)

0.7965500827069432

In [40]:
adj_r2_lasso = adj_r2(x_test,y_test,lasso)

In [41]:
adj_r2_lasso

0.795629957452854

## Ridge regularization

In [42]:
ridgecv = RidgeCV(cv=5,normalize=True)
ridgecv.fit(x_train,y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=5, normalize=True)

In [43]:
ridgecv.alpha_

0.1

In [44]:
ridge_lr = Ridge(alpha=ridgecv.alpha_)
ridge_lr.fit(x_train,y_train)

Ridge(alpha=0.1)

In [45]:
ridge_lr.score(x_test,y_test)

0.7965457602683094

In [46]:
adj_r2_ridge = adj_r2(x_test,y_test,ridge_lr)

In [47]:
adj_r2_ridge

0.7956256154655027

## Elastic net 

In [48]:
elastic = ElasticNetCV(alphas=None,cv=10)
elastic.fit(x_train,y_train)

ElasticNetCV(cv=10)

In [49]:
elastic.alpha_

0.0074166085296927305

In [50]:
elastic.l1_ratio_

0.5

In [51]:
elastic_lr = ElasticNet(alpha=elastic.alpha_,l1_ratio=elastic.l1_ratio_)
elastic_lr.fit(x_train,y_train)

ElasticNet(alpha=0.0074166085296927305)

In [52]:
elastic_lr.score(x_test,y_test)

0.7964254616226731

In [54]:
adj_r2_elastic = adj_r2(x_test,y_test,elastic_lr)

In [55]:
adj_r2_elastic

0.79550477275564

In [65]:
score_df = {"LR" : adj_r2_lr,"Lasso" : adj_r2_lasso,"Ridge" : adj_r2_ridge,"Elastic" :adj_r2_elastic}
score_df

{'LR': 0.7956259531901215,
 'Lasso': 0.795629957452854,
 'Ridge': 0.79550477275564,
 'Elastic': 0.79550477275564}

In [68]:
pickle.dump(lasso,open('Ai4i_lasso_final.pickle','wb'))

In [69]:
pickle.dump(scaler,open('scaler_std.pickle','wb'))

In [74]:
df.head(100)

Unnamed: 0,AirtemperatureK,ProcesstemperatureK,Rotationalspeedrpm,TorqueNm,Toolwearmin,TWF,HDF,PWF,OSF,RNF
0,298.1,308.6,1551,42.8,0,0,0,0,0,0
1,298.2,308.7,1408,46.3,3,0,0,0,0,0
2,298.1,308.5,1498,49.4,5,0,0,0,0,0
3,298.2,308.6,1433,39.5,7,0,0,0,0,0
4,298.2,308.7,1408,40.0,9,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
95,299.0,309.0,1351,52.2,44,0,0,0,0,0
96,299.0,309.0,1575,35.3,47,0,0,0,0,0
97,298.9,308.9,1750,29.9,50,0,0,0,0,0
98,298.9,308.8,1529,32.7,53,0,0,0,0,0


In [79]:
df.loc[df['AirtemperatureK'] > 300.2]

Unnamed: 0,AirtemperatureK,ProcesstemperatureK,Rotationalspeedrpm,TorqueNm,Toolwearmin,TWF,HDF,PWF,OSF,RNF
2787,300.3,309.6,1524,38.4,54,0,0,0,0,0
2788,300.3,309.6,1326,52.4,57,0,0,0,0,0
2789,300.3,309.5,1438,44.0,59,0,0,0,0,0
2790,300.4,309.6,1356,45.9,64,0,0,0,0,0
2791,300.4,309.6,1416,43.3,66,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8115,300.4,311.8,1367,57.2,10,0,0,0,0,0
8116,300.3,311.8,2004,19.6,12,0,0,0,0,0
8117,300.3,311.7,1406,47.5,14,0,0,0,0,0
8118,300.3,311.8,1548,33.0,16,0,0,0,0,0
