[Link](https://runawayhorse001.github.io/LearningApacheSpark/regression.html)

# 5.1 Linear Regression

## Set up spark context and SparkSession


In [2]:
try:
    sc.stop
except:
    pass

from pyspark import SparkContext , SparkConf
from pyspark.sql import SparkSession
sc = SparkContext.getOrCreate()
spark = SparkSession(sparkContext = sc)

##  Load dataset


In [3]:
df = spark.read.format('com.databricks.spark.csv').\
                       options(header='true', \
                       inferschema='true').\
            load("./data/Advertising.csv",header=True);
df.show(5,True)
df.printSchema()
df.describe().show()


+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows

root
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)

+-------+-----------------+------------------+------------------+------------------+
|summary|               TV|             Radio|         Newspaper|             Sales|
+-------+-----------------+------------------+------------------+------------------+
|  count|              200|               200|               200|               200|
|   mean|         147.0425|23.264000000000024|30.553999999999995|14.022500000000003|
| stddev|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|
|    min|              0.7|             

##  Convert the data to dense vector (features and label)

In [4]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# I provide two ways to build the features and labels

# method 1 (good for small feature):
#def transData(row):
#    return Row(label=row["Sales"],
#               features=Vectors.dense([row["TV"],
#                                       row["Radio"],
#                                       row["Newspaper"]]))

# Method 2 (good for large features):
def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])

##  Transform the dataset to DataFrame


In [5]:
transformed= transData(df)
transformed.show(5)

[Stage 5:>                                                          (0 + 1) / 1]                                                                                

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
+-----------------+-----+
only showing top 5 rows



##  Deal With Categorical Variables


In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.

featureIndexer = VectorIndexer(inputCol="features", \
                               outputCol="indexedFeatures",\
                               maxCategories=4).fit(transformed)

data = featureIndexer.transform(transformed)
data.show(5,True)


+-----------------+-----+-----------------+
|         features|label|  indexedFeatures|
+-----------------+-----+-----------------+
|[230.1,37.8,69.2]| 22.1|[230.1,37.8,69.2]|
| [44.5,39.3,45.1]| 10.4| [44.5,39.3,45.1]|
| [17.2,45.9,69.3]|  9.3| [17.2,45.9,69.3]|
|[151.5,41.3,58.5]| 18.5|[151.5,41.3,58.5]|
|[180.8,10.8,58.4]| 12.9|[180.8,10.8,58.4]|
+-----------------+-----+-----------------+
only showing top 5 rows



## Split the data into training and test sets (40% held out for testing)

In [7]:
# Split the data into training and test sets (40% held out for testing)
(trainingData, testData) = transformed.randomSplit([0.6, 0.4])
trainingData.show(5)
testData.show(5)

+----------------+-----+
|        features|label|
+----------------+-----+
|  [0.7,39.6,8.7]|  1.6|
|  [4.1,11.6,5.7]|  3.2|
| [7.3,28.1,41.4]|  5.5|
|   [8.6,2.1,1.0]|  4.8|
|[11.7,36.9,45.2]|  7.3|
+----------------+-----+
only showing top 5 rows

+---------------+-----+
|       features|label|
+---------------+-----+
| [5.4,29.9,9.4]|  5.3|
|[7.8,38.9,50.6]|  6.6|
| [8.4,27.2,2.1]|  5.7|
|[8.7,48.9,75.0]|  7.2|
|[13.1,0.4,25.6]|  5.3|
+---------------+-----+
only showing top 5 rows



## LinearRegression

In [8]:
# Import LinearRegression class
from pyspark.ml.regression import LinearRegression
# Define LinearRegression algorithm
lr = LinearRegression()
# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, lr])
model = pipeline.fit(trainingData)



22/10/14 14:54:01 WARN Instrumentation: [441de1d4] regParam is zero, which might cause numerical instability and overfitting.
22/10/14 14:54:01 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/10/14 14:54:01 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/10/14 14:54:01 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
22/10/14 14:54:01 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


## Summary of the Model
Spark has a poor summary function for data and model. I wrote a summary function which has similar format as R output for the linear regression in PySpark.

In [9]:
def modelsummary(model):
    import numpy as np
    print ("Note: the last rows are the information for Intercept")
    print ("##","-------------------------------------------------")
    print ("##","  Estimate   |   Std.Error | t Values  |  P-value")
    coef = np.append(list(model.coefficients),model.intercept)
    Summary=model.summary

    for i in range(len(Summary.pValues)):
        print ("##",'{:10.6f}'.format(coef[i]),\
        '{:10.6f}'.format(Summary.coefficientStandardErrors[i]),\
        '{:8.3f}'.format(Summary.tValues[i]),\
        '{:10.6f}'.format(Summary.pValues[i]))

    print ("##",'---')
    print ("##","Mean squared error: % .6f" \
           % Summary.meanSquaredError, ", RMSE: % .6f" \
           % Summary.rootMeanSquaredError )
    print ("##","Multiple R-squared: %f" % Summary.r2, ", \
            Total iterations: %i"% Summary.totalIterations)
modelsummary(model.stages[-1])

Note: the last rows are the information for Intercept
## -------------------------------------------------
##   Estimate   |   Std.Error | t Values  |  P-value
##   0.045336   0.001873   24.205   0.000000
##   0.193150   0.010676   18.092   0.000000
##   0.002284   0.007470    0.306   0.760343
##   2.846897   0.395254    7.203   0.000000
## ---
## Mean squared error:  2.679128 , RMSE:  1.636804
## Multiple R-squared: 0.894188 ,             Total iterations: 0


##  Make predictions


In [10]:
# Make predictions.
predictions = model.transform(testData)
predictions

DataFrame[features: vector, label: double, indexedFeatures: vector, prediction: double]

In [11]:
predictions.select("features","label","indexedFeatures").show()

+----------------+-----+----------------+
|        features|label| indexedFeatures|
+----------------+-----+----------------+
|  [5.4,29.9,9.4]|  5.3|  [5.4,29.9,9.4]|
| [7.8,38.9,50.6]|  6.6| [7.8,38.9,50.6]|
|  [8.4,27.2,2.1]|  5.7|  [8.4,27.2,2.1]|
| [8.7,48.9,75.0]|  7.2| [8.7,48.9,75.0]|
| [13.1,0.4,25.6]|  5.3| [13.1,0.4,25.6]|
|[13.2,15.9,49.6]|  5.6|[13.2,15.9,49.6]|
|[16.9,43.7,89.4]|  8.7|[16.9,43.7,89.4]|
|[23.8,35.1,65.9]|  9.2|[23.8,35.1,65.9]|
|[25.0,11.0,29.7]|  7.2|[25.0,11.0,29.7]|
| [25.6,39.0,9.3]|  9.5| [25.6,39.0,9.3]|
| [27.5,1.6,20.7]|  6.9| [27.5,1.6,20.7]|
|[38.0,40.3,11.9]| 10.9|[38.0,40.3,11.9]|
| [38.2,3.7,13.8]|  7.6| [38.2,3.7,13.8]|
|[43.1,26.7,35.1]| 10.1|[43.1,26.7,35.1]|
|[44.5,39.3,45.1]| 10.4|[44.5,39.3,45.1]|
|[44.7,25.8,20.6]| 10.1|[44.7,25.8,20.6]|
|[50.0,11.6,18.4]|  8.4|[50.0,11.6,18.4]|
| [53.5,2.0,21.4]|  8.1| [53.5,2.0,21.4]|
|[57.5,32.8,23.5]| 11.8|[57.5,32.8,23.5]|
|[68.4,44.5,35.6]| 13.6|[68.4,44.5,35.6]|
+----------------+-----+----------

##  Evaluation

In [12]:
from pyspark.ml.evaluation import RegressionEvaluator
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 1.73338


In [13]:
y_true = predictions.select("label").toPandas()
y_pred = predictions.select("prediction").toPandas()

import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r^2 score: {0}'.format(r2_score))

r^2 score: 0.8997768705853216


# 5.2 Generalized Linear Regression

[Link](https://runawayhorse001.github.io/LearningApacheSpark/regression.html)

## GeneralizedLinearRegression

In [14]:
# Import LinearRegression class
from pyspark.ml.regression import GeneralizedLinearRegression

# Define LinearRegression algorithm
glr = GeneralizedLinearRegression(family="gaussian", link="identity",\
                                  maxIter=10, regParam=0.3)

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, glr])

model = pipeline.fit(trainingData)

#  Summary of the Model
Spark has a poor summary function for data and model. I wrote a summary function which has similar format as R output for the linear regression in PySpark.

In [17]:
def modelsummary(model):
    import numpy as np
    print ("Note: the last rows are the information for Intercept")
    print ("##","-------------------------------------------------")
    print ("##","  Estimate   |   Std.Error | t Values  |  P-value")
    coef = np.append(list(model.coefficients),model.intercept)
    Summary=model.summary

    for i in range(len(Summary.pValues)):
        print ("##",'{:10.6f}'.format(coef[i]),\
        '{:10.6f}'.format(Summary.coefficientStandardErrors[i]),\
        '{:8.3f}'.format(Summary.tValues[i]),\
        '{:10.6f}'.format(Summary.pValues[i]))

    print ("##",'---')
    print ("##","Mean squared error: % .6f" \
           % Summary.meanSquaredError, ", RMSE: % .6f" \
           % Summary.rootMeanSquaredError )
    print ("##","Multiple R-squared: %f" % Summary.r2, ", \
            Total iterations: %i"% Summary.totalIterations)
modelsummary(model.stages[-1])

Note: the last rows are the information for Intercept
## -------------------------------------------------
##   Estimate   |   Std.Error | t Values  |  P-value
##   0.042842   0.001842   23.260   0.000000
##   0.181492   0.010423   17.413   0.000000
##   0.005500   0.007292    0.754   0.452130
##   3.379777   0.392207    8.617   0.000000
## ---


AttributeError: 'GeneralizedLinearRegressionTrainingSummary' object has no attribute 'meanSquaredError'

# Make predictions


In [18]:
# Make predictions.
predictions = model.transform(testData)
predictions

DataFrame[features: vector, label: double, indexedFeatures: vector, prediction: double]

In [19]:
predictions.select("features","label","indexedFeatures").show()

+----------------+-----+----------------+
|        features|label| indexedFeatures|
+----------------+-----+----------------+
|  [5.4,29.9,9.4]|  5.3|  [5.4,29.9,9.4]|
| [7.8,38.9,50.6]|  6.6| [7.8,38.9,50.6]|
|  [8.4,27.2,2.1]|  5.7|  [8.4,27.2,2.1]|
| [8.7,48.9,75.0]|  7.2| [8.7,48.9,75.0]|
| [13.1,0.4,25.6]|  5.3| [13.1,0.4,25.6]|
|[13.2,15.9,49.6]|  5.6|[13.2,15.9,49.6]|
|[16.9,43.7,89.4]|  8.7|[16.9,43.7,89.4]|
|[23.8,35.1,65.9]|  9.2|[23.8,35.1,65.9]|
|[25.0,11.0,29.7]|  7.2|[25.0,11.0,29.7]|
| [25.6,39.0,9.3]|  9.5| [25.6,39.0,9.3]|
| [27.5,1.6,20.7]|  6.9| [27.5,1.6,20.7]|
|[38.0,40.3,11.9]| 10.9|[38.0,40.3,11.9]|
| [38.2,3.7,13.8]|  7.6| [38.2,3.7,13.8]|
|[43.1,26.7,35.1]| 10.1|[43.1,26.7,35.1]|
|[44.5,39.3,45.1]| 10.4|[44.5,39.3,45.1]|
|[44.7,25.8,20.6]| 10.1|[44.7,25.8,20.6]|
|[50.0,11.6,18.4]|  8.4|[50.0,11.6,18.4]|
| [53.5,2.0,21.4]|  8.1| [53.5,2.0,21.4]|
|[57.5,32.8,23.5]| 11.8|[57.5,32.8,23.5]|
|[68.4,44.5,35.6]| 13.6|[68.4,44.5,35.6]|
+----------------+-----+----------

# Evaluation

In [20]:
from pyspark.ml.evaluation import RegressionEvaluator
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 1.75231


In [21]:
y_true = predictions.select("label").toPandas()
y_pred = predictions.select("prediction").toPandas()

import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))

r2_score: 0.897576528173309


# 5.3 Decision Tree Regressor

[Link](https://runawayhorse001.github.io/LearningApacheSpark/regression.html)

# DecisionTreeRegressor

In [22]:
from pyspark.ml.regression import DecisionTreeRegressor

# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, dt])

model = pipeline.fit(trainingData)

#  Make predictions


In [23]:
# Make predictions.
predictions = model.transform(testData)
predictions

DataFrame[features: vector, label: double, indexedFeatures: vector, prediction: double]

In [24]:
predictions.select("features","label","indexedFeatures").show()

+----------------+-----+----------------+
|        features|label| indexedFeatures|
+----------------+-----+----------------+
|  [5.4,29.9,9.4]|  5.3|  [5.4,29.9,9.4]|
| [7.8,38.9,50.6]|  6.6| [7.8,38.9,50.6]|
|  [8.4,27.2,2.1]|  5.7|  [8.4,27.2,2.1]|
| [8.7,48.9,75.0]|  7.2| [8.7,48.9,75.0]|
| [13.1,0.4,25.6]|  5.3| [13.1,0.4,25.6]|
|[13.2,15.9,49.6]|  5.6|[13.2,15.9,49.6]|
|[16.9,43.7,89.4]|  8.7|[16.9,43.7,89.4]|
|[23.8,35.1,65.9]|  9.2|[23.8,35.1,65.9]|
|[25.0,11.0,29.7]|  7.2|[25.0,11.0,29.7]|
| [25.6,39.0,9.3]|  9.5| [25.6,39.0,9.3]|
| [27.5,1.6,20.7]|  6.9| [27.5,1.6,20.7]|
|[38.0,40.3,11.9]| 10.9|[38.0,40.3,11.9]|
| [38.2,3.7,13.8]|  7.6| [38.2,3.7,13.8]|
|[43.1,26.7,35.1]| 10.1|[43.1,26.7,35.1]|
|[44.5,39.3,45.1]| 10.4|[44.5,39.3,45.1]|
|[44.7,25.8,20.6]| 10.1|[44.7,25.8,20.6]|
|[50.0,11.6,18.4]|  8.4|[50.0,11.6,18.4]|
| [53.5,2.0,21.4]|  8.1| [53.5,2.0,21.4]|
|[57.5,32.8,23.5]| 11.8|[57.5,32.8,23.5]|
|[68.4,44.5,35.6]| 13.6|[68.4,44.5,35.6]|
+----------------+-----+----------

#  Evaluation

In [25]:
from pyspark.ml.evaluation import RegressionEvaluator
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 1.62201


In [26]:
y_true = predictions.select("label").toPandas()
y_pred = predictions.select("prediction").toPandas()

import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))

r2_score: 0.9122424724118161


# 5.4 Random Forest Regressor

[Link](https://runawayhorse001.github.io/LearningApacheSpark/regression.html)

# RandomForestRegressor

In [27]:
# Import LinearRegression class
from pyspark.ml.regression import RandomForestRegressor

# Define LinearRegression algorithm
rf = RandomForestRegressor() # featuresCol="indexedFeatures",numTrees=2, maxDepth=2, seed=42

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])
model = pipeline.fit(trainingData)

#  Make predictions


In [28]:
# Make predictions.
predictions = model.transform(testData)
predictions

DataFrame[features: vector, label: double, indexedFeatures: vector, prediction: double]

In [29]:
predictions.select("features","label","indexedFeatures").show()

+----------------+-----+----------------+
|        features|label| indexedFeatures|
+----------------+-----+----------------+
|  [5.4,29.9,9.4]|  5.3|  [5.4,29.9,9.4]|
| [7.8,38.9,50.6]|  6.6| [7.8,38.9,50.6]|
|  [8.4,27.2,2.1]|  5.7|  [8.4,27.2,2.1]|
| [8.7,48.9,75.0]|  7.2| [8.7,48.9,75.0]|
| [13.1,0.4,25.6]|  5.3| [13.1,0.4,25.6]|
|[13.2,15.9,49.6]|  5.6|[13.2,15.9,49.6]|
|[16.9,43.7,89.4]|  8.7|[16.9,43.7,89.4]|
|[23.8,35.1,65.9]|  9.2|[23.8,35.1,65.9]|
|[25.0,11.0,29.7]|  7.2|[25.0,11.0,29.7]|
| [25.6,39.0,9.3]|  9.5| [25.6,39.0,9.3]|
| [27.5,1.6,20.7]|  6.9| [27.5,1.6,20.7]|
|[38.0,40.3,11.9]| 10.9|[38.0,40.3,11.9]|
| [38.2,3.7,13.8]|  7.6| [38.2,3.7,13.8]|
|[43.1,26.7,35.1]| 10.1|[43.1,26.7,35.1]|
|[44.5,39.3,45.1]| 10.4|[44.5,39.3,45.1]|
|[44.7,25.8,20.6]| 10.1|[44.7,25.8,20.6]|
|[50.0,11.6,18.4]|  8.4|[50.0,11.6,18.4]|
| [53.5,2.0,21.4]|  8.1| [53.5,2.0,21.4]|
|[57.5,32.8,23.5]| 11.8|[57.5,32.8,23.5]|
|[68.4,44.5,35.6]| 13.6|[68.4,44.5,35.6]|
+----------------+-----+----------

# 11. Evaluation

In [30]:
from pyspark.ml.evaluation import RegressionEvaluator
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 2.38239


In [31]:
y_true = predictions.select("label").toPandas()
y_pred = predictions.select("prediction").toPandas()

import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))

r2_score: 0.8106775745795071


# 5.5 GBT Regressor

[Link](https://runawayhorse001.github.io/LearningApacheSpark/regression.html)

# GBTRegressor

In [32]:
# Import LinearRegression class
from pyspark.ml.regression import GBTRegressor

# Define LinearRegression algorithm
rf = GBTRegressor() #numTrees=2, maxDepth=2, seed=42

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])
model = pipeline.fit(trainingData)

# Make predictions


In [33]:
# Make predictions.
predictions = model.transform(testData)
predictions

DataFrame[features: vector, label: double, indexedFeatures: vector, prediction: double]

In [34]:
predictions.select("features","label","indexedFeatures").show()

+----------------+-----+----------------+
|        features|label| indexedFeatures|
+----------------+-----+----------------+
|  [5.4,29.9,9.4]|  5.3|  [5.4,29.9,9.4]|
| [7.8,38.9,50.6]|  6.6| [7.8,38.9,50.6]|
|  [8.4,27.2,2.1]|  5.7|  [8.4,27.2,2.1]|
| [8.7,48.9,75.0]|  7.2| [8.7,48.9,75.0]|
| [13.1,0.4,25.6]|  5.3| [13.1,0.4,25.6]|
|[13.2,15.9,49.6]|  5.6|[13.2,15.9,49.6]|
|[16.9,43.7,89.4]|  8.7|[16.9,43.7,89.4]|
|[23.8,35.1,65.9]|  9.2|[23.8,35.1,65.9]|
|[25.0,11.0,29.7]|  7.2|[25.0,11.0,29.7]|
| [25.6,39.0,9.3]|  9.5| [25.6,39.0,9.3]|
| [27.5,1.6,20.7]|  6.9| [27.5,1.6,20.7]|
|[38.0,40.3,11.9]| 10.9|[38.0,40.3,11.9]|
| [38.2,3.7,13.8]|  7.6| [38.2,3.7,13.8]|
|[43.1,26.7,35.1]| 10.1|[43.1,26.7,35.1]|
|[44.5,39.3,45.1]| 10.4|[44.5,39.3,45.1]|
|[44.7,25.8,20.6]| 10.1|[44.7,25.8,20.6]|
|[50.0,11.6,18.4]|  8.4|[50.0,11.6,18.4]|
| [53.5,2.0,21.4]|  8.1| [53.5,2.0,21.4]|
|[57.5,32.8,23.5]| 11.8|[57.5,32.8,23.5]|
|[68.4,44.5,35.6]| 13.6|[68.4,44.5,35.6]|
+----------------+-----+----------

# Evaluation

In [35]:
from pyspark.ml.evaluation import RegressionEvaluator
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 1.47746


In [36]:
y_true = predictions.select("label").toPandas()
y_pred = predictions.select("prediction").toPandas()

import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))

r2_score: 0.9271866710374933
