In [4]:
# Run some setup code for this notebook.
import random
import numpy as np
import pandas as pd
from sklearn import datasets
# Estimators Transformers Pipeline, Evaluation Model

In [5]:
#from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [7]:
# Load the Boston house-prices dataset bundled with scikit-learn.
# NOTE(review): the warning captured in this cell's output says the scikit-learn
# maintainers strongly discourage this dataset because of an ethical problem and
# suggest the California or Ames housing datasets instead. `load_boston` is also
# believed to be removed in newer scikit-learn releases -- confirm before re-running.
boston = datasets.load_boston()
# Print the description text that ships with the dataset.
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [8]:
# Quick look at the Boston bunch: available keys, feature names, a random
# sample of 20 target values, and the shape of the feature matrix.
print(boston.keys())
print("Header : ", boston.feature_names)
# Draw the sample indices over the full length of the target vector (not a
# fixed 150) so that every row of the dataset can be picked.
print("Target : ", boston.target[np.random.choice(len(boston.target), 20)])
print(boston.data.shape)

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename', 'data_module'])
Header :  ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
Target :  [18.4 23.4 25.3 12.7 11.8 35.4 26.6 22.9 20.2 14.5 20.4 13.5 38.7 14.4
 18.4 18.8 24.2 18.9 18.7 14.6]
(506, 13)


In [9]:
# Wrap the feature matrix in a DataFrame whose columns carry the feature names,
# then preview the first five rows.
boston_df = pd.DataFrame(data=boston.data, columns=boston.feature_names)
boston_df.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [6]:
# Load the Iris dataset bundled with scikit-learn and print its description text.
iris = datasets.load_iris()
print(iris["DESCR"])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [4]:
# Quick look at the Iris bunch: available keys, feature names, a random sample
# of 20 class labels, and the shape of the feature matrix.
print(iris.keys())
print("Header : ", iris.feature_names)
# Size the index range from the target vector itself rather than a literal
# sample count, so the cell does not depend on a hard-coded dataset size.
print("Target : ", iris.target[np.random.choice(len(iris.target), 20)])
print(iris.data.shape)

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
Header :  ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target :  [1 1 1 0 2 1 1 1 1 2 1 0 2 0 0 0 1 1 2 0]
(150, 4)


In [5]:
# Wrap the Iris feature matrix in a DataFrame labelled with the feature names,
# then preview the first five rows.
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
from sklearn.model_selection import train_test_split

# Features and class labels of the Iris dataset.
X, y = iris.data, iris.target

# Hold out 40% of the samples for testing; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=4,
)

In [15]:
#Logistic Regression
kn = KNeighborsClassifier(3)

kn.fit(X_train,y_train)

# predict the response for new observations
y_pred=kn.predict(X_test)
print(f"Accuracy for KNeighborsClassifier(3): {metrics.accuracy_score(y_test, y_pred)}")

Accuracy for KNeighborsClassifier(3): 0.9666666666666667


In [16]:
from sklearn import svm

# Support vector classifier with a linear kernel and C=0.025, fitted on the
# training split.
svc_model = svm.SVC(kernel='linear', C=0.025).fit(X_train, y_train)

# Predict the held-out test split and report the accuracy.
y_pred = svc_model.predict(X_test)
print(f"Accuracy for SVC(kernel='linear', C=0.025): {metrics.accuracy_score(y_test, y_pred)}")

Accuracy for SVC(kernel='linear', C=0.025): 0.95


In [17]:
from sklearn import svm

# Support vector classifier with gamma=2 and C=1 (kernel left at the library
# default), fitted on the training split.
svc_model = svm.SVC(C=1, gamma=2).fit(X_train, y_train)

# Predict the held-out test split and report the accuracy.
y_pred = svc_model.predict(X_test)
print(f"Accuracy for SVC(gamma=2,C=1): {metrics.accuracy_score(y_test, y_pred)}")

Accuracy for SVC(gamma=2,C=1): 0.9833333333333333


In [18]:
from sklearn import gaussian_process

# Gaussian process classifier whose kernel is an RBF (length scale 1.0)
# scaled by the constant 1.0.
g_model = gaussian_process.GaussianProcessClassifier(
    kernel=1.0 * gaussian_process.kernels.RBF(length_scale=1.0)
)

# Fit on the training split, then predict the held-out test split and report
# the accuracy.
g_model.fit(X_train, y_train)
y_pred = g_model.predict(X_test)
print(f"Accuracy for GaussianProcessClassifier(1.0 * RBF(1.0)): {metrics.accuracy_score(y_test, y_pred)}")

Accuracy for GaussianProcessClassifier(1.0 * RBF(1.0)): 0.9666666666666667


In [19]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree limited to a depth of 5. random_state is pinned (same value as
# the train/test split) so that the fitted tree, and therefore the reported
# accuracy, is repeatable from run to run.
decision_tree_model = DecisionTreeClassifier(max_depth=5, random_state=4)

# Fit on the training split, then predict the held-out test split and report
# the accuracy.
decision_tree_model.fit(X_train, y_train)
y_pred = decision_tree_model.predict(X_test)
print(f"Accuracy for DecisionTreeClassifier(max_depth=5): {metrics.accuracy_score(y_test, y_pred)}")

Accuracy for DecisionTreeClassifier(max_depth=5): 0.9666666666666667
