<a href="https://colab.research.google.com/github/amitranjan02/Berkeley-433.1/blob/master/Example_14_Increase_Model_Capacity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Real world data set examples
#!pip install pandas==0.23.4

print(__doc__)


# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris, load_diabetes, load_digits, load_wine, load_breast_cancer 
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import pandas as pd
h = .02  # step size in the mesh

names = ["Linear SVM", "RBF SVM",
         "Decision Tree", "Random Forest", "Neural Net",
         "2-layer Neural Net", "3-layer Neural Net", "AdaBoost"]

classifiers = [
    LinearSVC(C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10),
    MLPClassifier(alpha=1),
    MLPClassifier(hidden_layer_sizes=(100, 100), alpha=1),
    MLPClassifier(hidden_layer_sizes=(100, 100, 100), alpha=1),
    AdaBoostClassifier()]


datasets = [load_iris(),
            load_digits(),
            load_wine(),
            load_breast_cancer()]
print(pd.__version__)
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    results = {}
    # preprocess dataset, split into training and test part
    X = ds['data']
    y = ds['target']
    print('data size (%d, %d)' % (len(X), len(X[0])))
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=42)

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        import time
        cur_time = time.time()
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_train)

        from sklearn.metrics import confusion_matrix
  
        training_time = time.time() - cur_time
        cur_time = time.time()
        testing_time = time.time() - cur_time
        score = clf.score(X_test, y_test)
        baseline_accuracy = max([float(list(y_train).count(label)) for label in set(y_train)]) / len(y_train)
        train_accuracy = clf.score(X_train, y_train)
        results[name] = [training_time, testing_time, train_accuracy,
                         score, baseline_accuracy, len(y_train)]

        df = pd.DataFrame.from_dict(
            results, orient='index',
            columns=['Train Time', 'Test Time', 'Train Acc', 'Test Acc',
                     'Baseline Train Acc', 'Num Examples'])
    display(df)



Automatically created module for IPython interactive environment
0.24.2
data size (150, 4)




Unnamed: 0,Train Time,Test Time,Train Acc,Test Acc,Baseline Train Acc,Num Examples
Linear SVM,0.005965,2.384186e-07,0.833333,0.85,0.355556,90
RBF SVM,0.001427,2.384186e-07,0.977778,0.983333,0.355556,90
Decision Tree,0.000652,0.0,0.988889,0.983333,0.355556,90
Random Forest,0.010327,0.0,0.977778,0.983333,0.355556,90
Neural Net,0.215851,0.0,0.944444,0.983333,0.355556,90
2-layer Neural Net,0.383531,2.384186e-07,0.988889,0.983333,0.355556,90
3-layer Neural Net,0.591968,4.768372e-07,0.988889,0.983333,0.355556,90
AdaBoost,0.082428,2.384186e-07,0.955556,0.933333,0.355556,90


data size (1797, 64)




Unnamed: 0,Train Time,Test Time,Train Acc,Test Acc,Baseline Train Acc,Num Examples
Linear SVM,0.064162,2.384186e-07,0.974026,0.958275,0.103896,1078
RBF SVM,0.447536,2.384186e-07,1.0,0.119611,0.103896,1078
Decision Tree,0.008888,2.384186e-07,0.687384,0.67872,0.103896,1078
Random Forest,0.019671,2.384186e-07,0.939703,0.884562,0.103896,1078
Neural Net,2.083563,2.384186e-07,1.0,0.977747,0.103896,1078
2-layer Neural Net,3.614911,2.384186e-07,1.0,0.980529,0.103896,1078
3-layer Neural Net,5.268327,2.384186e-07,1.0,0.973574,0.103896,1078
AdaBoost,0.237685,2.384186e-07,0.30705,0.301808,0.103896,1078


data size (178, 13)




Unnamed: 0,Train Time,Test Time,Train Acc,Test Acc,Baseline Train Acc,Num Examples
Linear SVM,0.003947,0.0,1.0,0.972222,0.415094,106
RBF SVM,0.002242,0.0,1.0,0.375,0.415094,106
Decision Tree,0.000744,2.384186e-07,1.0,0.930556,0.415094,106
Random Forest,0.010046,0.0,1.0,0.958333,0.415094,106
Neural Net,0.165486,0.0,1.0,0.972222,0.415094,106
2-layer Neural Net,0.445406,2.384186e-07,1.0,1.0,0.415094,106
3-layer Neural Net,0.633792,2.384186e-07,1.0,0.986111,0.415094,106
AdaBoost,0.091396,2.384186e-07,0.962264,0.944444,0.415094,106


data size (569, 30)




Unnamed: 0,Train Time,Test Time,Train Acc,Test Acc,Baseline Train Acc,Num Examples
Linear SVM,0.012167,2.384186e-07,0.982405,0.991228,0.612903,341
RBF SVM,0.027668,2.384186e-07,1.0,0.649123,0.612903,341
Decision Tree,0.006837,2.384186e-07,0.991202,0.938596,0.612903,341
Random Forest,0.02063,2.384186e-07,0.991202,0.95614,0.612903,341
Neural Net,0.514139,4.768372e-07,0.98827,0.97807,0.612903,341
2-layer Neural Net,1.005696,2.384186e-07,1.0,0.982456,0.612903,341
3-layer Neural Net,1.47559,2.384186e-07,1.0,0.969298,0.612903,341
AdaBoost,0.144332,2.384186e-07,1.0,0.95614,0.612903,341


In [0]:
# Increase capacity of Linear SVM
# Real world data set examples
#!pip install pandas==0.23.4

print(__doc__)


# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris, load_diabetes, load_digits, load_wine, load_breast_cancer 
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
#poly = PolynomialFeatures(2)
#poly.fit_transform(X)
import pandas as pd
h = .02  # step size in the mesh

names = ["Linear SVM",
         "Linear SVM C=1",
         "Linear SVM C=100",
         "Linear SVM C=10000",
         "Linear SVM deg 2",
         "Linear SVM deg 3",
         "Linear SVM deg 4",
         "Linear SVM deg 1000"]

classifiers = [
    LinearSVC(C=0.025),
    LinearSVC(C=1),
    LinearSVC(C=100),
    LinearSVC(C=10000),
    make_pipeline(PolynomialFeatures(2), LinearSVC(C=0.025)),
    make_pipeline(PolynomialFeatures(3), LinearSVC(C=0.025)),
    make_pipeline(PolynomialFeatures(4), LinearSVC(C=0.025)),
    make_pipeline(PolynomialFeatures(1000), LinearSVC(C=0.025)),
]


datasets = [load_iris()]
print(pd.__version__)
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    results = {}
    # preprocess dataset, split into training and test part
    X = ds['data']
    y = ds['target']
    print('data size (%d, %d)' % (len(X), len(X[0])))
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=42)

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        import time
        cur_time = time.time()
        clf.fit(X_train, y_train)

        
        # Show misclassified samples:
        predicted = clf.predict(X_train)

        from sklearn.metrics import confusion_matrix

        training_time = time.time() - cur_time
        cur_time = time.time()
        testing_time = time.time() - cur_time
        score = clf.score(X_test, y_test)
        baseline_accuracy = max([float(list(y_train).count(label)) for label in set(y_train)]) / len(y_train)
        train_accuracy = clf.score(X_train, y_train)
        results[name] = [training_time, testing_time, train_accuracy,
                         score, baseline_accuracy, len(y_train)]

        df = pd.DataFrame.from_dict(
            results, orient='index',
            columns=['Train Time', 'Test Time', 'Train Acc', '*Test Acc*',
                     'Baseline Train Acc', 'Num Examples'])
    display(df)

# *Faster and simpler configuration but less flexibility.
# Grid search of linear SVM
from sklearn.model_selection import GridSearchCV
parameters = {'C': [2**x for x in range(0, 10)]}
svc = LinearSVC()
cur = time.time()
clf = GridSearchCV(svc, parameters, n_jobs=10, cv=5)
cv_time = time.time() - cur
print("CV time:", cv_time)
print(clf)
clf.fit(X_train, y_train)
print("Best Linear SVM Test Acc:", clf.score(X_test, y_test))

# Grid search of linear SVM with poly-k feature mapping.
from sklearn.pipeline import Pipeline
estimators = [('polyk', PolynomialFeatures()),
              ('linear_svm', LinearSVC())]
polyk_linear_svm = Pipeline(estimators)
parameters = {'polyk__degree': range(1, 10), 'linear_svm__C': [2**x for x in range(0, 10)]}
cur = time.time()
clf = GridSearchCV(polyk_linear_svm, parameters, n_jobs=10, cv=5)
cv_time = time.time() - cur
print("CV time:", cv_time)
print(clf)
clf.fit(X_train, y_train)
print("Best Poly SVM Test Acc:", clf.score(X_test, y_test))


Automatically created module for IPython interactive environment
0.24.2
data size (150, 4)




In [12]:
# Increase capacity of Decision Tree and Adaboost
# Real world data set examples
#!pip install pandas==0.23.4

print(__doc__)


# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris, load_diabetes, load_digits, load_wine, load_breast_cancer 
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
#poly = PolynomialFeatures(2)
#poly.fit_transform(X)
import pandas as pd
h = .02  # step size in the mesh

names = [
    "DT depth=5",
    "DT depth=50",
    "DT depth=500",
    "Adaboost",
    "Adaboost DT depth=2",
    "Adaboost DT depth=5",
    "Adaboost DT depth=10",]

classifiers = [
    DecisionTreeClassifier(max_depth=5),
    DecisionTreeClassifier(max_depth=50),
    DecisionTreeClassifier(max_depth=500),
    AdaBoostClassifier(),
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=2)),
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=5)),
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=8)),
]


datasets = [load_digits()]
print(pd.__version__)
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    results = {}
    # preprocess dataset, split into training and test part
    X = ds['data']
    y = ds['target']
    print('data size (%d, %d)' % (len(X), len(X[0])))
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=42)

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        import time
        cur_time = time.time()
        clf.fit(X_train, y_train)

        
        # Show misclassified samples:
        predicted = clf.predict(X_train)

        from sklearn.metrics import confusion_matrix

        training_time = time.time() - cur_time
        cur_time = time.time()
        testing_time = time.time() - cur_time
        score = clf.score(X_test, y_test)
        baseline_accuracy = max([float(list(y_train).count(label)) for label in set(y_train)]) / len(y_train)
        train_accuracy = clf.score(X_train, y_train)
        results[name] = [training_time, testing_time, train_accuracy,
                         score, baseline_accuracy, len(y_train)]

        df = pd.DataFrame.from_dict(
            results, orient='index',
            columns=['Train Time', 'Test Time', 'Train Acc', 'Test Acc',
                     'Baseline Train Acc', 'Num Examples'])
    display(df)



Automatically created module for IPython interactive environment
0.24.2
data size (1797, 64)


Unnamed: 0,Train Time,Test Time,Train Acc,Test Acc,Baseline Train Acc,Num Examples
DT depth=5,0.008355,2.384186e-07,0.687384,0.682893,0.103896,1078
DT depth=50,0.013109,0.0,1.0,0.842837,0.103896,1078
DT depth=500,0.013422,2.384186e-07,1.0,0.837274,0.103896,1078
Adaboost,0.197836,2.384186e-07,0.30705,0.301808,0.103896,1078
Adaboost DT depth=2,0.264472,2.384186e-07,0.616883,0.628651,0.103896,1078
Adaboost DT depth=5,0.485944,0.0,1.0,0.947149,0.103896,1078
Adaboost DT depth=10,0.673263,2.384186e-07,1.0,0.974965,0.103896,1078
