## Creating Pipelines using Scikit-Learn(Machine Learning) 

In [None]:
#### Load Packages
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [92]:
 #load Data
iris  = load_iris()

In [93]:
# print(dir(iris))
# print(iris.data)
# print(iris.feature_names)
# print(iris.target)
# print(iris.target_names)'''

#### Converted into DataFrame

In [32]:
## Create IRIS DataFrame
df = pd.DataFrame(data=iris.data, 
                  columns=['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'])
df['Type'] = iris.target

In [45]:
#Split dataset into training and testing
def split_data(x, y, test_size=0.30, random_state=0, **kwargs):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=random_state, **kwargs)
    return x_train, x_test, y_train, y_test

In [89]:
x_train, x_test, y_train, y_test = split_data(iris.data, iris.target)
print(len(x_train)/len(iris.data)*100)
print(len(y_test)/len(iris.data)*100)

70.0
30.0


### Pipline cretation 
##### 1 - Data Preprocessing using standred scalar
##### 2 - Reduce Dimension using PCA(Principle Component Analysis)
##### 3 - Apply classifier(LR, DT, RF)

In [75]:
best_accuracy=0
best_pipline=0
best_classifier=""

#### Pipeline for LR, DT, RF

In [76]:
pipline_lr = Pipeline([('scalar1', StandardScaler()),
                      ('pca1', PCA(n_components = 2)),
                      ('lr_classifier', LogisticRegression(random_state=0))])

In [77]:
pipline_dt = Pipeline([('scalar2', StandardScaler()),
                      ('pca2', PCA(n_components=2)),
                      ('dt_classifier', DecisionTreeClassifier())])

In [78]:
pipline_rf = Pipeline([('scalar3', StandardScaler()),
                      ('pca2', PCA(n_components=2)),
                      ('rf_classifier', RandomForestClassifier())])

In [79]:
piplines = [pipline_lr, pipline_dt, pipline_rf] #create list of pipelines

In [80]:
pip_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'Random Forest'} #create dictionary
for pipe in piplines:
    pipe.fit(x_train, y_train)

#### Accuracy

In [90]:
# Print Test Accuracy
for i, model in enumerate(piplines):
    print("{} Test Accuracy : {}".format(pip_dict[i], model.score(x_test, y_test)))

Logistic Regression Test Accuracy : 0.8666666666666667
Decision Tree Test Accuracy : 0.9111111111111111
Random Forest Test Accuracy : 0.8888888888888888


In [91]:
for i, model in enumerate(piplines):
    if model.score(x_test, y_test) > best_accuracy:
        best_accuracy=model.score(x_test, y_test)
        best_pipline = model
        best_classifier=i
print("Best classifier is : {}".format(pip_dict[best_classifier]))

Best classifier is : Decision Tree


In [None]:
################################ END ####################################