# Module 3 Pipeline

In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Load the bundled iris data set; `iris` keeps the full returned object.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Hold out 40% of the samples for testing. The fixed seed makes the split
# identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

  return f(*args, **kwds)


In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Standardize by hand: fit the scaler on the training split only, then apply
# that same fitted transformation to both splits so no test-set statistics
# influence preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Seed the tree. scikit-learn permutes the features at every split, so an
# unseeded tree can report a different accuracy each time the cell is run.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train_std, y_train)
clf.score(X_test_std, y_test)

0.95

## Standardizing Data

In [3]:
from sklearn.preprocessing import StandardScaler
# Toy matrix: four samples (rows) with two features (columns).
data = [
    [1, 0],
    [2, 0],
    [3, 1],
    [4, 1],
]

# fit() returns the fitted scaler, so it can be chained onto construction.
scaler = StandardScaler().fit(data)

# Last expression of the cell: shows the standardized matrix.
scaler.transform(data)

array([[-1.34164079, -1.        ],
       [-0.4472136 , -1.        ],
       [ 0.4472136 ,  1.        ],
       [ 1.34164079,  1.        ]])

## Pipeline

In [8]:
from sklearn.pipeline import make_pipeline
# make_pipeline chains the scaler and the classifier into a single estimator,
# so fit/score take the raw (unscaled) splits directly.
# The tree is seeded so the reported accuracy is reproducible across runs.
clf = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=0))
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9333333333333333

In [7]:
from sklearn.pipeline import Pipeline
# Same scaler -> tree chain as make_pipeline, but with explicitly named steps.
# The tree is seeded so the reported accuracy is reproducible across runs.
clf = Pipeline([
    ('scaler', StandardScaler()),
    ('tree', DecisionTreeClassifier(random_state=0))])
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9333333333333333

## Chain Multiple Pipelines

In [9]:
from sklearn.impute import SimpleImputer

# Preprocessing sub-pipeline: fill missing values with the per-feature median,
# then standardize. It is reused as a single step in later pipelines.
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# A pipeline can itself be a step of another pipeline.
# The tree is seeded so the reported accuracy is reproducible across runs.
clf = Pipeline([('preprocessor', preprocessor),
                ('tree', DecisionTreeClassifier(random_state=0))])
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9333333333333333

## Applications of Pipeline

### Evaluate Multiple Classifiers

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Candidate models. Every estimator whose training involves randomness gets a
# fixed seed; without it the comparison below changes from run to run and the
# ranking of the models cannot be reproduced.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf"),
    LogisticRegression(),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    AdaBoostClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
]

# Identical preprocessing for every candidate; only the final step is swapped.
for classifier in classifiers:
    pipe = Pipeline([('preprocessor', preprocessor),
                     ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    print(type(classifier).__name__)
    print("model score: %.3f" % pipe.score(X_test, y_test))


KNeighborsClassifier
model score: 0.933
SVC
model score: 0.933
LogisticRegression
model score: 0.800
DecisionTreeClassifier
model score: 0.950
RandomForestClassifier
model score: 0.950
AdaBoostClassifier
model score: 0.883




GradientBoostingClassifier
model score: 0.950


### Sentiment Analysis

In [15]:
import pandas as pd
# Tiny hand-labelled corpus of movie-review snippets, built column by column.
# label 1.0 marks the reviews with positive wording, 0.0 the negative ones.
training = pd.DataFrame(
    {
        "id": [0, 1, 2, 3, 4, 5, 6],
        "text": [
            "This movie is nice",
            "The plot is bad",
            "The actors are excellent",
            "The acting is lousy",
            "The plot is good",
            "plot is good",
            "The actors are bad",
        ],
        "label": [1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0],
    }
)


In [16]:
# Display the whole frame; it only has seven rows, so no .head() is needed.
training

Unnamed: 0,id,text,label
0,0,This movie is nice,1.0
1,1,The plot is bad,0.0
2,2,The actors are excellent,1.0
3,3,The acting is lousy,0.0
4,4,The plot is good,1.0
5,5,plot is good,1.0
6,6,The actors are bad,0.0


In [17]:
# Pull the raw review strings and their labels out of the frame.
# Note: this rebinds X_train / y_train, which held the iris split earlier in
# the notebook.
X_train = training["text"].values
y_train = training["label"].values

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Text pipeline: TF-IDF turns each review string into a feature vector, which
# is then fed to logistic regression.
pipe = Pipeline(
    steps=[
        ('tv', TfidfVectorizer()),
        ('clf', LogisticRegression()),
    ]
)

# fit() is the last expression, so the cell displays the fitted pipeline.
pipe.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('tv',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,

In [19]:
# Classify one new review. The pipeline takes raw strings, so the text is
# passed as-is inside a one-element list.
# Note: this rebinds X_test, which held the iris test split earlier.
X_test = ['the actors are good']
pipe.predict(X_test)

array([1.])

### LDA with CV

In [None]:
# This whole cell is commented out. If re-enabled, read_csv fetches the CSV
# from the URL below, so running it needs network access.
#
# # Create a pipeline that standardizes the data then creates a model
# from pandas import read_csv
# from sklearn.model_selection import KFold
# from sklearn.model_selection import cross_val_score
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# # load data
# url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
# names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# dataframe = read_csv(url, names=names)
# array = dataframe.values
# X = array[:,0:8]
# Y = array[:,8]

# # create pipeline
# estimators = []
# estimators.append(('standardize', StandardScaler()))
# estimators.append(('lda', LinearDiscriminantAnalysis()))
# model = Pipeline(estimators)

# # evaluate pipeline
# # NOTE(review): shuffle=True added — recent scikit-learn releases reject a
# # random_state on KFold unless shuffling is enabled; confirm against the
# # installed version before re-enabling this cell.
# kfold = KFold(n_splits=10, shuffle=True, random_state=7)
# results = cross_val_score(model, X, Y, cv=kfold)
# print(results.mean())