In [1]:
# Ref https://www.journaldev.com/18341/python-scikit-learn-tutorial

In [2]:
# Check the installed versions of the scientific stack used in this notebook.
# NOTE(review): `!` hands the command to the system shell, so this lists the packages
# of whichever `pip` is first on PATH — presumably the kernel's environment; verify if in doubt.
! pip freeze | grep -E 'pandas|numpy|scipy|learn'

imbalanced-learn==0.3.3
numpy==1.14.5
pandas==0.23.4
scikit-learn==0.19.2
scipy==1.1.0
sklearn==0.0


In [3]:
import sklearn

In [4]:
# import datasets
from sklearn import datasets

In [5]:
# Load the bundled iris dataset. The returned object exposes the attributes
# used further down: .data (features), .target (class ids) and .target_names.
iris = datasets.load_iris()

In [6]:
# Inspect the container type returned by load_iris().
type(iris)

sklearn.utils.Bunch

In [7]:
# Shape of the feature matrix as (rows, columns): one row per sample,
# one column per measured feature.
iris.data.shape

(150, 4)

In [10]:
# Peek at the first ten samples. Plain slicing already returns a view of the
# array, so the explicit .view() call and the trailing-comma index are unnecessary.
iris.data[:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [11]:
from sklearn import svm

In [12]:
# Linear support-vector classifier with all default hyper-parameters.
# NOTE(review): random_state is left unset, so the fitted coefficients may differ
# slightly between runs — consider pinning a seed if exact reproducibility matters.
clf = svm.LinearSVC()

In [13]:
# Display the estimator to see the default hyper-parameters it was created with.
clf

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [21]:
# Labels around the boundary between the first two classes (rows 45-54).
# Direct slicing is enough; no .view() or trailing-comma index is needed.
iris.target[45:55]

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

In [22]:
# Train the classifier on the full dataset: features in iris.data, class ids in iris.target.
# fit() returns the estimator itself, which is why its repr is echoed as the cell output.
clf.fit(iris.data, iris.target)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [26]:
# Predict the class of a hand-made sample (values close to the class-0 rows shown earlier).
# predict() expects a 2-D input — a list of samples — hence the double brackets.
clf.predict([[5.0, 3.6, 1.3, 0.25]])

array([0])

In [28]:
# Inspect the learned weights: one row per class (3) and one column per feature (4).
clf.coef_

array([[ 0.18423206,  0.45122667, -0.80794583, -0.45071058],
       [ 0.05169712, -0.89501188,  0.40591639, -0.93859936],
       [-0.85080513, -0.98677323,  1.38090563,  1.86537822]])

In [30]:
# `clf.coef_` holds the weights assigned to the features (the coefficients of the primal problem).
# For kernelised SVMs this attribute is only available with a linear kernel; LinearSVC always exposes it.

### linear regression

In [31]:
from sklearn import linear_model

In [32]:
# Ordinary least-squares linear regression with default settings.
reg = linear_model.LinearRegression()

In [33]:
# Fit the model on three samples with two features each, X = [[0,0],[1,1],[2,2]],
# and one target per sample, y = [0,1,2]. Both features are identical in every sample.
reg.fit([[0,0],[1,1],[2,2]], [0,1,2])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [34]:
# Learned weight per feature; the two identical features end up sharing the slope equally.
reg.coef_

array([0.5, 0.5])

In [35]:
# Predict for a single new sample that lies outside the training range (0-2).
reg.predict([[4,4]])

array([4.])

In [38]:
reg.predict([[12,12]])  # makes sense: the estimator corresponds to a linear function (0.5*x1 + 0.5*x2 here)

array([12.])

### k-nearest neighbour classifier

In [39]:
# Feature rows 45-54, straddling the boundary between the first two classes.
# Direct slicing is enough; no .view() or trailing-comma index is needed.
iris.data[45:55]

array([[4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.6, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [7. , 3.2, 4.7, 1.4],
       [6.4, 3.2, 4.5, 1.5],
       [6.9, 3.1, 4.9, 1.5],
       [5.5, 2.3, 4. , 1.3],
       [6.5, 2.8, 4.6, 1.5]])

In [40]:
# Class ids for the same rows 45-54 (direct slicing; .view() is redundant).
iris.target[45:55]

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

In [41]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee that
# `sklearn.neighbors` is available as an attribute — it only works when some other
# import happened to load it first.
from sklearn import neighbors

# k-nearest-neighbours classifier with default settings, reusing the iris dataset loaded earlier.
knn = neighbors.KNeighborsClassifier()

In [42]:
# Fit on the full dataset; fit() returns the estimator, whose repr is shown as the cell output.
knn.fit(iris.data, iris.target)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [44]:
# Classify two known samples one call at a time: row 47 (class 0) and row 53 (class 1)
# from the slice displayed above.
(
    knn.predict([[4.6, 3.2, 1.4, 0.2]]),
    knn.predict([[5.5, 2.3, 4.0, 1.3]]),
)

(array([0]), array([1]))

In [45]:
# Same two samples, classified in a single call: predict() accepts a batch of rows
# and returns one class id per row.
knn.predict(
    [
        [4.6, 3.2, 1.4, 0.2],
        [5.5, 2.3, 4.0, 1.3],
    ]
)

array([0, 1])

In [47]:
# Per-class probability estimates for the class-0 sample: one column per class (3 columns).
knn.predict_proba([[4.6, 3.2, 1.4, 0.2]])

array([[1., 0., 0.]])

In [53]:
knn.predict_proba([[5.5, 2.3, 4. , 1.3]])  # returns one probability per class (3 classes), in class-id order

array([[0., 1., 0.]])

In [52]:
# Human-readable names for the three classes.
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [50]:
# All 150 class ids: the samples are stored grouped by class (0, then 1, then 2).
# Displaying the array directly is equivalent; the .view() call added nothing.
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### k-means clustering

In [2]:
from sklearn import cluster

In [3]:
# Ask for three clusters, matching the number of iris classes.
# NOTE(review): no random_state is passed, so cluster numbering may change between
# runs — consider pinning a seed for reproducible output.
k = 3
k_means = cluster.KMeans(n_clusters=k)

In [5]:
import sklearn

In [7]:
from sklearn import datasets  # re-imported here: execution counts restart in this section, so it was presumably run in a fresh kernel

In [8]:
# Load the iris dataset again for the clustering example.
iris = datasets.load_iris()

In [9]:
# k-means is unsupervised: only the features are passed, never iris.target.
k_means.fit(iris.data)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [10]:
# Cluster id assigned to each of the 150 samples. The ids are arbitrary labels chosen
# by the algorithm, so they need not match the numbering used in iris.target.
k_means.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0], dtype=int32)

In [11]:
# Ground-truth classes, for comparison with the cluster ids above. Compare the grouping,
# not the numbers: e.g. the first 50 samples are class 0 here but cluster 1 above.
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])