# Intel® Extension for Scikit-learn Linear Regression for YearPredictionMSD dataset

In [1]:
from timeit import default_timer as timer
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os
import requests
import warnings
from IPython.display import HTML
warnings.filterwarnings('ignore')

### Download the data

In [2]:
dataset_dir = 'data'
dataset_name = 'year_prediction_msd'
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip'

os.makedirs(dataset_dir, exist_ok=True)
local_url = os.path.join(dataset_dir, os.path.basename(url))

if not os.path.isfile(local_url):
    response = requests.get(url, stream=True)
    with open(local_url, 'wb+') as file:
        for data in response.iter_content(8192):
            file.write(data)
    
year = pd.read_csv(local_url, header=None)
x = year.iloc[:, 1:].to_numpy(dtype=np.float32)
y = year.iloc[:, 0].to_numpy(dtype=np.float32)

In [14]:
year.to_csv('file1.csv')

Split the data into train and test sets

In [3]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((463810, 90), (51535, 90), (463810,), (51535,))

### Normalize the data

In [4]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler_x = MinMaxScaler()
scaler_y = StandardScaler()

In [5]:
scaler_x.fit(x_train)
x_train = scaler_x.transform(x_train)
x_test = scaler_x.transform(x_test)

scaler_y.fit(y_train.reshape(-1, 1))
y_train = scaler_y.transform(y_train.reshape(-1, 1)).ravel()
y_test = scaler_y.transform(y_test.reshape(-1, 1)).ravel()

### Patch original Scikit-learn with Intel® Extension for Scikit-learn
Intel® Extension for Scikit-learn (previously known as daal4py) contains drop-in replacement functionality for the stock Scikit-learn package. You can take advantage of the performance optimizations of Intel® Extension for Scikit-learn by adding just two lines of code before the usual Scikit-learn imports:

In [6]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Intel® Extension for Scikit-learn patching affects performance of specific Scikit-learn functionality. Refer to the [list of supported algorithms and parameters](https://intel.github.io/scikit-learn-intelex/algorithms.html) for details. In cases when unsupported parameters are used, the package fallbacks into original Scikit-learn. If the patching does not cover your scenarios, [submit an issue on GitHub](https://github.com/intel/scikit-learn-intelex/issues).

Training of the Linear Regression algorithm with Intel® Extension for Scikit-learn for YearPredictionMSD dataset

In [7]:
from sklearn.linear_model import LinearRegression

params = {
    "n_jobs": -1,
    "copy_X": False
}
start = timer()
model = LinearRegression(**params).fit(x_train, y_train)
train_patched = timer() - start
f"Intel® extension for Scikit-learn time: {train_patched:.2f} s"

'Intel® extension for Scikit-learn time: 2.02 s'

In [8]:
from sklearn.linear_model import LinearRegression

params = {
    "n_jobs": None,
    "copy_X": False
}
start = timer()
model = LinearRegression(**params).fit(x_train, y_train)
train_patched = timer() - start
f"Intel® extension for Scikit-learn time: {train_patched:.2f} s"

'Intel® extension for Scikit-learn time: 0.18 s'

Predict and get a result of the Linear Regression algorithm with Intel® Extension for Scikit-learn

In [9]:
y_predict = model.predict(x_test)
mse_metric_opt = metrics.mean_squared_error(y_test, y_predict)
f'Patched Scikit-learn MSE: {mse_metric_opt}'

'Patched Scikit-learn MSE: 0.7718408107757568'

### Train the same algorithm with original Scikit-learn
In order to cancel optimizations, we use *unpatch_sklearn* and reimport the class LinearRegression

In [10]:
from sklearnex import unpatch_sklearn
unpatch_sklearn()

Training of the Linear Regression algorithm with original Scikit-learn library for YearPredictionMSD dataset

In [11]:
from sklearn.linear_model import LinearRegression

start = timer()
model = LinearRegression(**params).fit(x_train, y_train)
train_unpatched = timer() - start
f"Original Scikit-learn time: {train_unpatched:.2f} s"

'Original Scikit-learn time: 2.23 s'

Predict and get a result of the Linear Regression algorithm with original Scikit-learn

In [12]:
y_predict = model.predict(x_test)
mse_metric_original = metrics.mean_squared_error(y_test, y_predict)
f'Original Scikit-learn MSE: {mse_metric_original}'

'Original Scikit-learn MSE: 0.7716856598854065'

In [13]:
HTML(f"<h3>Compare MSE metric of patched Scikit-learn and original</h3>"
     f"MSE metric of patched Scikit-learn: {mse_metric_opt} <br>"
     f"MSE metric of unpatched Scikit-learn: {mse_metric_original} <br>"
     f"Metrics ratio: {mse_metric_opt/mse_metric_original} <br>"
     f"<h3>With Scikit-learn-intelex patching you can:</h3>"
     f"<ul>"
     f"<li>Use your Scikit-learn code for training and prediction with minimal changes (a couple of lines of code);</li>"
     f"<li>Fast execution training and prediction of Scikit-learn models;</li>"
     f"<li>Get the similar quality</li>"
     f"<li>Get speedup in <strong>{(train_unpatched/train_patched):.1f}</strong> times.</li>"
     f"</ul>")

In [19]:
new_predict = model.predict([[51.85726,59.11655,26.39436,-5.4603,-20.69012,-19.95528,-6.72771,2.2959,10.31018,6.26597,-1.788,-6.19786,20.166,598.45275,1140.69539,721.49244,272.84841,564.0669,199.41547,189.04637,217.32042,137.1339,150.34608,98.21589,48.12644,-601.59295,10.58466,-83.35368,96.86756,69.40708,8.06033,-26.01693,-2.93173,26.18398,-12.2466,-14.52391,-121.61676,119.15632,-229.55722,38.53305,-9.97062,-39.45568,-32.91144,32.32695,-14.418,-37.88987,-85.41669,222.02357,-186.58755,-6.08163,1.78072,64.75548,24.55866,-1.12509,-13.58287,-99.66038,-124.73875,67.0263,33.05618,60.25818,28.00288,10.62425,-8.86772,78.13543,-181.10013,74.69489,57.45083,114.08816,-9.91322,7.53612,97.06395,233.17754,-100.68441,27.67012,-37.33008,-0.34676,-207.78766,116.75005,-91.82912,8.3502,-11.50511,-69.18291,60.58456,28.64599,-4.3962,-64.56491,-45.61012,-5.51512,32.35602,12.17352]])
print(new_predict)

[3853.6063806]
