<a href="https://colab.research.google.com/github/a-singh03/BDDA/blob/main/Missing_values_pipelining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Last amended: 1st June, 2023
# Ref: https://www.kaggle.com/code/alexisbcook/missing-values
# Data Source: https://www.kaggle.com/competitions/tabular-playground-series-aug-2022/data


This [data represents](https://www.kaggle.com/competitions/tabular-playground-series-aug-2022/data) the results of a large product testing study. For each product_code you are given a number of product attributes (fixed for the code) as well as a number of measurement values for each individual product, representing various lab testing methods. Each product is used in a simulated real-world environment experiment, and absorbs a certain amount of fluid (loading) to see whether or not it fails.

Your task is to use the data to predict individual product failures of new codes with their individual lab test results.

# Simple Data pipelining

In [None]:
# 1.0

import pandas as pd

# 1.0.1
from sklearn.model_selection import train_test_split
from sklearn.ensemble import  RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# 1.0.2
from pathlib import Path


In [None]:
# 1.1 Mount Google Drive at /gdrive so the data folder used below is readable.
#     google.colab is imported here, so this cell is meant to run inside Colab.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# 1.2 Display from a cell outputs of multiple commands:
#     with "all", every bare expression in a cell is echoed, not only the last one.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# 1.2.1 Where is our data: folder on the mounted Google Drive.
#       Adjust this if your Drive layout differs.

pathToFolder = "/gdrive/MyDrive/BDDA1"


In [None]:
# 1.2.2 And our file? Full path to the zip archive inside that folder.

path = Path(pathToFolder) / "producttesting.zip"

In [None]:
# 2.1 Load the data
#     read_csv is handed the .zip path directly; presumably pandas infers the
#     compression from the extension and the archive holds a single CSV -- verify.
data = pd.read_csv(path)

In [None]:
# 2.2 Dimensions and first rows of the raw frame
data.shape  #(26570, 26)
data.head()


(26570, 26)

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


In [None]:
# Column dtypes and non-null counts (reveals which columns have missing values)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26570 entries, 0 to 26569
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              26570 non-null  int64  
 1   product_code    26570 non-null  object 
 2   loading         26320 non-null  float64
 3   attribute_0     26570 non-null  object 
 4   attribute_1     26570 non-null  object 
 5   attribute_2     26570 non-null  int64  
 6   attribute_3     26570 non-null  int64  
 7   measurement_0   26570 non-null  int64  
 8   measurement_1   26570 non-null  int64  
 9   measurement_2   26570 non-null  int64  
 10  measurement_3   26189 non-null  float64
 11  measurement_4   26032 non-null  float64
 12  measurement_5   25894 non-null  float64
 13  measurement_6   25774 non-null  float64
 14  measurement_7   25633 non-null  float64
 15  measurement_8   25522 non-null  float64
 16  measurement_9   25343 non-null  float64
 17  measurement_10  25270 non-null 

In [None]:
# Number of distinct values in each column
data.nunique()

id                26570
product_code          5
loading           11950
attribute_0           2
attribute_1           3
attribute_2           4
attribute_3           4
measurement_0        29
measurement_1        30
measurement_2        25
measurement_3      4721
measurement_4      4692
measurement_5      4671
measurement_6      4704
measurement_7      4734
measurement_8      4713
measurement_9      4708
measurement_10     6177
measurement_11     6526
measurement_12     6392
measurement_13     5271
measurement_14     6389
measurement_15     6577
measurement_16     7035
measurement_17    23612
failure               2
dtype: int64

In [None]:
# 2.2.1 Get target
#       pop() removes 'failure' from `data` in place, so the columns that remain
#       are predictors only. Re-running this cell without reloading `data`
#       raises KeyError because the column is already gone.
y = data.pop('failure')

In [None]:
# Inspect the target series
y

0        0
1        0
2        0
3        0
4        0
        ..
26565    0
26566    0
26567    0
26568    0
26569    0
Name: failure, Length: 26570, dtype: int64

In [None]:
# Keep only the numeric predictors and discard the row identifier `id`.
# drop() returns a new frame instead of mutating X in place, so the cell is
# idempotent and no longer echoes the whole popped `id` Series as output.
X = data.select_dtypes(exclude=['object']).drop(columns=['id'])

0            0
1            1
2            2
3            3
4            4
         ...  
26565    26565
26566    26566
26567    26567
26568    26568
26569    26569
Name: id, Length: 26570, dtype: int64

In [None]:
# Remember the feature names; they are needed to rebuild a DataFrame after imputation
column_name = X.columns.tolist()

In [None]:
# List of saved feature names
column_name

['loading',
 'attribute_2',
 'attribute_3',
 'measurement_0',
 'measurement_1',
 'measurement_2',
 'measurement_3',
 'measurement_4',
 'measurement_5',
 'measurement_6',
 'measurement_7',
 'measurement_8',
 'measurement_9',
 'measurement_10',
 'measurement_11',
 'measurement_12',
 'measurement_13',
 'measurement_14',
 'measurement_15',
 'measurement_16',
 'measurement_17']

In [None]:
# Missing-value count for every feature
X.isna().sum()

loading            250
attribute_2          0
attribute_3          0
measurement_0        0
measurement_1        0
measurement_2        0
measurement_3      381
measurement_4      538
measurement_5      676
measurement_6      796
measurement_7      937
measurement_8     1048
measurement_9     1227
measurement_10    1300
measurement_11    1468
measurement_12    1601
measurement_13    1774
measurement_14    1874
measurement_15    2009
measurement_16    2110
measurement_17    2284
dtype: int64

In [None]:
# Imputer configured to fill missing entries using the "median" strategy
si = SimpleImputer(strategy = "median")

In [None]:
# Fit the imputer. Here it is fit on the full X; in a proper train/test
# workflow it would be fit on X_train only (hence the trailing note).
si.fit(X) #X_train

In [None]:
# Fill the missing values with the fitted imputer. X is overwritten with the
# transformed result; the next cell re-attaches the column names.
X = si.transform(X) #X_test

In [None]:
# Re-wrap the imputed values as a DataFrame using the saved feature names
X = pd.DataFrame(X, columns=column_name)

In [None]:
# Inspect the imputed feature frame
X

Unnamed: 0,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,80.10,9.0,5.0,7.0,8.0,4.0,18.040,12.518,15.748,19.292,...,20.155,10.672,15.859,17.5940,15.193,15.029,16.040,13.034,14.684,764.100
1,84.89,9.0,5.0,14.0,3.0,3.0,18.213,11.540,17.717,17.893,...,17.889,12.448,17.947,17.9150,11.755,14.732,15.425,14.395,15.631,682.057
2,82.43,9.0,5.0,12.0,1.0,5.0,18.057,11.652,16.738,18.240,...,18.288,12.715,15.607,19.2115,13.798,16.711,18.631,14.094,17.946,663.376
3,101.07,9.0,5.0,13.0,2.0,6.0,17.295,11.188,18.576,18.339,...,19.060,12.471,16.346,18.3770,10.020,15.250,15.562,16.154,17.172,826.282
4,188.06,9.0,5.0,9.0,2.0,8.0,19.346,12.950,16.990,15.746,...,18.093,10.337,17.082,19.9320,12.428,16.182,12.760,13.153,16.412,579.885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26565,158.95,6.0,9.0,6.0,16.0,4.0,16.301,13.259,18.068,15.505,...,19.354,11.430,12.177,17.9420,10.112,15.795,18.572,16.144,16.436,729.131
26566,146.02,6.0,9.0,10.0,12.0,8.0,17.543,11.733,17.984,19.078,...,19.563,11.242,14.179,20.5640,10.234,14.450,14.322,13.146,16.471,853.924
26567,115.62,6.0,9.0,1.0,10.0,1.0,15.670,11.535,16.778,18.385,...,19.279,11.407,16.437,17.4760,8.668,15.069,16.599,15.590,14.065,750.364
26568,106.38,6.0,9.0,2.0,9.0,4.0,18.059,11.733,16.918,18.101,...,19.358,11.392,17.064,17.8140,14.928,16.273,15.485,13.624,12.865,730.156


In [None]:
# Total number of missing cells left in X
X.isna().sum().sum()

0

In [None]:
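# Note: pre-processing steps (imputation, scaling, PCA) should be fit on X_train only
# and then applied to X_test; wrapping them in a pipeline does this automatically.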

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split,
# and therefore the reported accuracy, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Chain the steps so that each one is fit only on the data passed to pipe.fit():
#   1. median imputation
#   2. standardisation
#   3. PCA keeping enough components to explain 95% of the variance
#   4. random forest, seeded so repeated runs give the same model and score
pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    PCA(n_components=0.95),
    RandomForestClassifier(random_state=0),
)

In [None]:
# Display the pipeline and its steps
pipe

In [None]:
# Fit every pipeline step, ending with the classifier, on the training split
pipe.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows
pred= pipe.predict(X_test)

In [None]:
# Share of test rows whose predicted label matches the true label
accu = accuracy_score(y_true=y_test, y_pred=pred)
accu

0.7845852777359626

In [None]:
# Same accuracy computed by hand: number of matches divided by number of test rows
n_correct = (pred == y_test).sum()
n_correct / len(y_test)

0.7845852777359626

In [None]:
# Stand-alone random forest (no pipeline); seeded so the fit is reproducible
rf = RandomForestClassifier(random_state=0)

In [None]:
# Train on the entire data set (no hold-out)
rf.fit(X,y)

In [None]:
# Predict on the same rows the forest was trained on
s = rf.predict(X)

In [None]:
# Accuracy measured on the very rows the forest was trained on: this is
# training accuracy and says nothing about performance on unseen data.
(s == y).sum()/X.shape[0]

1.0

In [None]:
# Simpler baseline that rebinds `pipe`: median imputation followed directly by a
# random forest (no scaling or PCA). Seeded so repeated runs give the same score.
pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    RandomForestClassifier(random_state=0),
)

In [None]:
# Fresh 75/25 split for the baseline pipeline; random_state pins the shuffle
# so the accuracy reported below can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
# Fit the imputer + random-forest pipeline on the training split
pipe.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out test rows
pred = pipe.predict(X_test)

In [None]:
# Held-out accuracy of the imputer + random-forest pipeline
accu = accuracy_score(y_test,pred)
accu   # ~78% in the recorded run

0.7844347433388529

In [None]:
# 3.0 Divide data into training and validation subsets (80/20, seeded)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Refit the current pipeline on the 80% training subset
pipe.fit(X_train,y_train)

In [None]:
# Predict labels for the validation subset
pred = pipe.predict(X_valid)

In [None]:
# Validation accuracy of the refitted pipeline
accu = accuracy_score(y_true=y_valid, y_pred=pred)
accu   # 79%

0.7933759879563418

# Your Turn

a. Add Standard Scaler and PCA to it  
b. Add cross validation to it

In [None]:
### ----------------------- Done -----------------