## Processing Interim Data For Modelling:

In [1]:
#imports
import os
import pandas as pd

In [2]:
# Load the interim Iris data, using the 'Id' column as the row index.
# The path is built relative to the parent of the current working directory.
path_parts = (os.pardir, 'data', 'interim', 'interim_iris.csv')
interim_data_file = os.path.join(*path_parts)

interim_df = pd.read_csv(interim_data_file, index_col='Id')
interim_df.head()

Unnamed: 0_level_0,SL,SW,PL,PW,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [3]:
from sklearn.preprocessing import LabelEncoder

# Replace the species names with integer codes learned by a LabelEncoder.
# NOTE: the Species column of interim_df is overwritten in place, so reload
# interim_df before re-running this cell; otherwise the encoder would be
# fitted on the integer codes instead of the original species names.
encoder = LabelEncoder()
species_codes = encoder.fit_transform(interim_df["Species"])
interim_df["Species"] = species_codes
interim_df.head()


Unnamed: 0_level_0,SL,SW,PL,PW,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,0
2,4.9,3.0,1.4,0.2,0
3,4.7,3.2,1.3,0.2,0
4,4.6,3.1,1.5,0.2,0
5,5.0,3.6,1.4,0.2,0


In [4]:
# Map each original species name to the integer code the fitted encoder assigns to it.
dict_actual_labels = {
    species: code
    for species, code in zip(encoder.classes_, encoder.transform(encoder.classes_))
}
dict_actual_labels

{'setosa': 0, 'versicolor': 1, 'virginica': 2}

In [5]:
from sklearn.model_selection import train_test_split

# Features are every column except the last; the last column (Species) is the target.
X, y = interim_df.iloc[:, :-1], interim_df.iloc[:, -1]
print(X.shape, y.shape)

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

(150, 4) (150,)


In [6]:
# Inspect the training features produced by the split above
# (the notebook display truncates the middle rows of a long frame).
X_train

Unnamed: 0_level_0,SL,SW,PL,PW
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
23,4.6,3.6,1.0,0.2
16,5.7,4.4,1.5,0.4
66,6.7,3.1,4.4,1.4
12,4.8,3.4,1.6,0.2
43,4.4,3.2,1.3,0.2
...,...,...,...,...
72,6.1,2.8,4.0,1.3
107,4.9,2.5,4.5,1.7
15,5.8,4.0,1.2,0.2
93,5.8,2.6,4.0,1.2


Next, we preprocess the data and fit the model in a single step using a scikit-learn `Pipeline`:
- Pipeline:
    - StandardScaler ---> Logistic Regression Model

In [7]:
# Imports needed to build and evaluate the modelling pipeline:
from sklearn import set_config
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Show estimators as diagrams when they are displayed in the notebook.
set_config(display="diagram")

In [8]:
# Chain feature standardisation and the logistic-regression classifier so that
# both are fitted together on the training data only.
scaler_step = StandardScaler()
model_step = LogisticRegression()
pipe = make_pipeline(scaler_step, model_step)
pipe.fit(X_train, y_train)

In [11]:
# Predict the encoded species for the held-out test features using the fitted pipeline.
pred =pipe.predict(X_test)

In [19]:
# Fraction of test rows whose predicted label matches the true label.
# The test set is only 20% of the 150 rows, so the score is a small-sample estimate.
accuracy_score(y_true=y_test, y_pred=pred)

1.0