# Playing with Scikit-learn 

## Understanding classes in Scikit-learn

## Defining applications for data science

### Example Estimator class

In [32]:
# import the Boston dataset loader that Scikit-learn provides
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2 — confirm the installed version still provides it.
from sklearn.datasets import load_boston
# keep the loaded dataset object in a variable
boston = load_boston()
# split it into the predictors (x) and the target (y)
x, y = boston.data, boston.target
# check the shape of x and y
print(f"X:{x.shape} y:{y.shape}")
# the output shows both arrays have the same number of
# rows and x has 13 features/variables

X:(506, 13) y:(506,)


In [33]:
# import the LinearRegression class from linear_model
from sklearn.linear_model import LinearRegression
# instantiate the estimator with normalization enabled
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2 — confirm the installed version accepts it.
hypothesis = LinearRegression(normalize=True)
# fit the estimator to the predictors and the outcome
hypothesis.fit(x, y)
# print the fitted coefficients, one per feature in x (13 in total)
print(hypothesis.coef_)

[-1.08011358e-01  4.64204584e-02  2.05586264e-02  2.68673382e+00
 -1.77666112e+01  3.80986521e+00  6.92224640e-04 -1.47556685e+00
  3.06049479e-01 -1.23345939e-02 -9.52747232e-01  9.31168327e-03
 -5.24758378e-01]


### Example of Predictor class

In [24]:
import numpy as np
# build one observation holding the 13 feature values
# that the hypothesis fitted earlier expects
new_observation = np.array([1, 0, 1, 0, 0.5, 7, 59,
                             6, 3, 200, 20, 350, 4],
                        # dtype=float stores every value as a float (0.5 stays 0.5)
                        # reshape(1, -1) turns the flat array into a 2-D array
                        # with a single row, as the printed output shows
                            dtype=float).reshape(1, -1)
print(new_observation)
# predict the outcome for new_observation
print(hypothesis.predict(new_observation))

[[  1.    0.    1.    0.    0.5   7.   59.    6.    3.  200.   20.  350.
    4. ]]
[25.90156732]


### Quality of the fit (R^2)

In [25]:
# R^2 of the hypothesis, scored on the same x and y it was fitted on
hypothesis.score(x, y)

0.7406426641094095

### Example of Transform class

In [29]:
# LinearRegression does not provide a transformation, so
# MinMaxScaler is imported to act as the transformer
from sklearn.preprocessing import MinMaxScaler
# create the scaler and keep it in the variable `scaler`;
# feature_range is the parameter that sets the min and max values
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(x)
# transform new_observation with the fitted scaler
print(scaler.transform(new_observation))

[[0.01116872 0.         0.01979472 0.         0.23662551 0.65893849
  0.57775489 0.44288845 0.08695652 0.02480916 0.78723404 0.88173887
  0.06263797]]


# Performing the Hashing Trick

## Using hash functions

## Demonstrating the hashing trick

### Testing how Python's built-in hash function hashes a string

In [33]:
# NOTE(review): str hashes are salted per interpreter process unless
# PYTHONHASHSEED is fixed, so this value presumably differs between sessions
print(hash("Python"))

8083329979669929672


### Return an index in a specific positive range

In [43]:
# abs() drops the sign and % 20 folds the hash into the range 0-19
print(abs(hash("Python")) % 20)

3


### trying one-hot encoding using Scikit-learn

In [45]:
# import only the class this cell uses instead of a wildcard import,
# so it stays clear where CountVectorizer comes from
from sklearn.feature_extraction.text import CountVectorizer
# create the encoder that will hold the vocabulary of the fitted texts
oh_encoder = CountVectorizer()
# fit the encoder and transform the texts in one step
oh_encoded = oh_encoder.fit_transform(['Python for data science',
                                       'Python for machine learning'])
# display each word and the index assigned to it
print(oh_encoder.vocabulary_)

{'python': 4, 'for': 1, 'data': 0, 'science': 5, 'machine': 3, 'learning': 2}


### Define a simple hashing trick

In [16]:
# the two input strings
string_1 = 'Python for data science'
string_2 = 'Python for machine learning'


def hashing_trick(input_string, vector_size=20):
    """Encode a string as a binary feature vector with the hashing trick.

    Args:
        input_string: text to encode; it is split on single spaces.
        vector_size: length of the returned vector (20 if not specified).

    Returns:
        A list of ``vector_size`` integers in which position
        ``abs(hash(word)) % vector_size`` is 1 for every word of the input
        and every other position is 0. Words that hash to the same position
        share it.

    Note:
        The built-in hash() of a str is salted per interpreter process
        unless PYTHONHASHSEED is fixed, so the positions that are set can
        differ between sessions.
    """
    # size the vector from the parameter so that any vector_size works
    feature_vector = [0] * vector_size
    # check every single word of the input
    for word in input_string.split(' '):
        # hash() can return a negative number, so abs() is necessary;
        # the modulo keeps the index inside the vector
        index = abs(hash(word)) % vector_size
        # mark the position designated by the index
        feature_vector[index] = 1
    return feature_vector

### Testing both strings

In [18]:
print(hashing_trick(string_1))
print(hashing_trick(string_2))

[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0]
[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]


## Working with deterministic selection

### using sparse matrix

In [22]:
from scipy.sparse import csc_matrix
# build a vector with the hashing_trick function defined earlier;
# a dedicated name is used so that the notebook-level `x` holding the
# predictors fitted earlier is not overwritten by this list
hashed_vector = hashing_trick(string_1)
# the sparse matrix lists only the non-zero values of the vector
print(csc_matrix(hashed_vector))


  (0, 0)	1
  (0, 3)	1
  (0, 12)	1
  (0, 18)	1


### using HashingVectorizer

In [15]:
import sklearn.feature_extraction.text as txt

string_1 = 'Python for data science'
string_2 = 'Python for machine learning'
# create the HashingVectorizer
# n_features sets how many columns the sparse matrix has
# binary controls whether the matrix holds binary values
# norm controls whether the matrix is normalized (None here)
h_trick = txt.HashingVectorizer(n_features=20,
                                binary=True, norm=None)
# transform the input with the HashingVectorizer
hashed_text = h_trick.transform([string_1, string_2])
hashed_text
# use print(hashed_text) instead to list the stored entries

<2x20 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

### testing HashingVectorizer

In [16]:
# define a new string/input to be inputted
string_3 = 'New text has arrived'
# using one-hot encoding from previous try
# todense func. used to return matrix
oh_encoder.transform([string_3]).todense()
# the output is all zero vector because the 
# one-hot encoding func. hasn't updated yet

NameError: name 'oh_encoder' is not defined

In [17]:
# HashingVectorizer is used here without any fitting, so the words of
# the new input are hashed straight into the same 20 columns
h_trick.transform([string_3]).todense()

matrix([[1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 1.]])

# Considering Timing and Performance

## Benchmarking with timeit

### testing %timeit and %%timeit

In [51]:
# %timeit example: build a list of the integers from 0 to 10**6 - 1
# %timeit measures only the statement on its own line
%timeit l = [k for k in range(10**6)]
# a variable assigned inside %timeit cannot be used afterwards

87.6 ms ± 3.91 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [52]:
# -n 20 -r 5: 20 loops per run and 5 runs, as the output reports
%timeit -n 20 -r 5 l = [k for k in range(10**6)]

87.2 ms ± 1.56 ms per loop (mean ± std. dev. of 5 runs, 20 loops each)


In [92]:
%%timeit 
# %%timeit measures the running time of the entire cell
l = []
for k in range(10**6):
    l.append(k)


135 ms ± 3.81 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### testing %timeit on a different text encoding strategies

In [94]:
# time the one-hot encoder (CountVectorizer): fit and transform together
%timeit oh_encoded = oh_encoder.fit_transform([string_1, string_2])

351 µs ± 16.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [96]:
# time the HashingVectorizer: transform only, since it is used without fitting
%timeit hashing = h_trick.transform([string_1, string_2])

140 µs ± 4.11 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Working with the memory profiler

### Installing memory_profiler package

In [12]:
import sys
# install memory_profiler through pip, run with the interpreter of this kernel
!{sys.executable} -m pip install memory_profiler



### testing memory_profiler

In [26]:
# load this extension in every session you want to monitor
%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [18]:
hashing = h_trick.transform([string_1, string_2])
# %memit tracks the memory consumption of the statement on its line
%memit dense_hashing = hashing.toarray()

peak memory: 86.10 MiB, increment: 0.05 MiB


### Obtaining a complete overview of memory consumption

In [40]:
%%file example_code.py
# creating a file named example_code.py
# function of 2 alternative text encoding strategies
def comparison_test(text):
    """Encode ``text`` with two alternative text encoding strategies.

    Args:
        text: the documents to encode, passed unchanged to both encoders.

    Returns:
        A tuple ``(oh_encoded, hashing)``: the CountVectorizer
        ``fit_transform`` result followed by the HashingVectorizer
        ``transform`` result.
    """
    import sklearn.feature_extraction.text as txt

    # build both encoders first, then run them one statement per line so
    # each step stays individually visible to a profiler
    hashing_encoder = txt.HashingVectorizer(
        n_features=20,
        binary=True,
        norm=None,
    )
    count_encoder = txt.CountVectorizer()
    count_result = count_encoder.fit_transform(text)
    hashing_result = hashing_encoder.transform(text)
    return count_result, hashing_result



Overwriting example_code.py


In [46]:
# import the function written to example_code.py earlier
from example_code import comparison_test
text = ['Python for data science',
        'Python for machine learning']
# show the memory overview of comparison_test for this call
# in a new window
%mprun -f comparison_test comparison_test(text)




# Running in Parallel on Multiple Cores

## Performing multicore parallelism

## Demonstrating multiprocessing

### using multiprocessing by Support Vector Classifier (SVC) and cross-validation

#### single-core

In [93]:
from sklearn.datasets import load_digits
# keep the loaded digits dataset in a variable
digits = load_digits()
# assign digits.data to x
# assign digits.target (the classification target) to y
x, y = digits.data, digits.target
# import SVC, the algorithm used to fit the data
from sklearn.svm import SVC
# cross_val_score evaluates a score by cross-validation
from sklearn.model_selection import cross_val_score
# time the cross-validation of the SVC algorithm
# 1st parameter: the algorithm/object used to fit the data
# 2nd parameter: the data to fit
# 3rd parameter: the target variable to try to predict
# cv: how many folds the cross-validation strategy uses
# n_jobs: how many cores are used
%timeit single_core = cross_val_score(SVC(), x, y, \
                                      cv=20, n_jobs=1)

2.45 s ± 33.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### multicore

In [95]:
# n_jobs=-1 means all of the available cores are used
%timeit multi_core = cross_val_score(SVC(), x, y, \
                                     cv=20, n_jobs=-1)

1.02 s ± 48.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Avoiding errors caused by the internal operations of a multicore task (they happen a lot when running from a console or an IDE)

In [102]:
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
# run the multicore cross-validation only when this code is executed
# as the main program
if __name__ == '__main__':
    digits = load_digits()
    x, y = digits.data, digits.target
    # n_jobs=-1 uses all of the available cores
    multi_core = cross_val_score(SVC(), x, y,
                                 cv=20, n_jobs=-1)