<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Read-the-data" data-toc-modified-id="Read-the-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Read the data</a></span></li><li><span><a href="#train-test-split" data-toc-modified-id="train-test-split-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>train test split</a></span></li><li><span><a href="#Linear-Regression-Simple-Example" data-toc-modified-id="Linear-Regression-Simple-Example-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Linear Regression Simple Example</a></span></li><li><span><a href="#Linear-Regression-Example-2" data-toc-modified-id="Linear-Regression-Example-2-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Linear Regression Example 2</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

print([(x.__name__,x.__version__) for x in [np, pd, pyspark]])

spark = pyspark.sql.SparkSession.builder.appName('lrex').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
sc.setLogLevel("INFO")

[('numpy', '1.17.1'), ('pandas', '0.25.1'), ('pyspark', '2.4.4')]


In [2]:
from pyspark.ml.regression import LinearRegression

In [3]:
def show_method_attributes(method, ncols=2):
    """ Show all the attributes of a given method.
    Example:
    ========
    show_method_attributes(list)
     """
    x = [I for I in dir(method) if I[0].islower()]
    x = [I for I in x if I not in 'os np pd sys time psycopg2'.split()]

    return pd.DataFrame(np.array_split(x,ncols)).T.fillna('')

# Read the data

In [4]:
!head -2 ../data/sample_linear_regression_data.txt

-9.490009878824548 1:0.4551273600657362 2:0.36644694351969087 3:-0.38256108933468047 4:-0.4458430198517267 5:0.33109790358914726 6:0.8067445293443565 7:-0.2624341731773887 8:-0.44850386111659524 9:-0.07269284838169332 10:0.5658035575800715
0.2577820163584905 1:0.8386555657374337 2:-0.1270180511534269 3:0.499812362510895 4:-0.22686625128130267 5:-0.6452430441812433 6:0.18869982177936828 7:-0.5804648622673358 8:0.651931743775642 9:-0.6555641246242951 10:0.17485476357259122


In [5]:
all_data = spark.read.format('libsvm').load('../data/sample_linear_regression_data.txt')
all_data.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [6]:
all_data.count()

501

In [7]:
all_data.columns

['label', 'features']

# train test split

In [8]:
train_data, test_data = all_data.randomSplit([0.7, 0.3])
train_data.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|-28.571478869743427|(10,[0,1,2,3,4,5,...|
|-28.046018037776633|(10,[0,1,2,3,4,5,...|
| -23.51088409032297|(10,[0,1,2,3,4,5,...|
|-22.837460416919342|(10,[0,1,2,3,4,5,...|
|-21.432387764165806|(10,[0,1,2,3,4,5,...|
|-20.212077258958672|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-19.884560774273424|(10,[0,1,2,3,4,5,...|
|-19.872991038068406|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -19.66731861537172|(10,[0,1,2,3,4,5,...|
|-19.402336030214553|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|-18.845922472898582|(10,[0,1,2,3,4,5,...|
| -18.27521356600463|(10,[0,1,2,3,4,5,...|
|-17.803626188664516|(10,[0,1,2,3,4,5,...|
|-17.494200356883344|(10,[0,1,2,3,4,5,...|
|-17.428674570939506|(10,[0,1,2,3,4,5,...|
| -17.32672073267595|(10,[0,1,2,3,4,5,...|
|-17.026492264209548|(10,[0,1,2,3,4,5,...|
+----------

In [9]:
train_data.count(), test_data.count()

(345, 156)

In [10]:
train_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                345|
|   mean| 0.5674012185594746|
| stddev| 10.700402505809006|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+



In [11]:
test_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                156|
|   mean|-0.4298212851495383|
| stddev|  9.413447162254785|
|    min|-26.805483428483072|
|    max| 24.290551295953957|
+-------+-------------------+



# Linear Regression Simple Example

In [12]:
lr = LinearRegression(featuresCol='features',labelCol='label',
                     predictionCol='prediction')

In [13]:
model = lr.fit(train_data)

In [14]:
train_results = model.summary
show_method_attributes(train_results)

Unnamed: 0,0,1
0,coefficientStandardErrors,pValues
1,degreesOfFreedom,predictionCol
2,devianceResiduals,predictions
3,explainedVariance,r2
4,featuresCol,r2adj
5,labelCol,residuals
6,meanAbsoluteError,rootMeanSquaredError
7,meanSquaredError,tValues
8,numInstances,totalIterations
9,objectiveHistory,


In [15]:
train_results.rootMeanSquaredError

10.512462928415083

In [16]:
test_results = model.evaluate(test_data)

In [17]:
show_method_attributes(test_results)

Unnamed: 0,0,1
0,coefficientStandardErrors,pValues
1,degreesOfFreedom,predictionCol
2,devianceResiduals,predictions
3,explainedVariance,r2
4,featuresCol,r2adj
5,labelCol,residuals
6,meanAbsoluteError,rootMeanSquaredError
7,meanSquaredError,tValues
8,numInstances,


In [18]:
test_results.r2

-0.03714395271981985

In [21]:
unlabeled_data = test_data.select('features')
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [22]:
predictions = model.transform(unlabeled_data)

In [23]:
predictions.show()

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...|  3.0106184608255853|
|(10,[0,1,2,3,4,5,...|  -3.665798783171582|
|(10,[0,1,2,3,4,5,...| -1.1217030507136105|
|(10,[0,1,2,3,4,5,...|   4.558366999880909|
|(10,[0,1,2,3,4,5,...|  2.2534829273191965|
|(10,[0,1,2,3,4,5,...|   1.434309714726609|
|(10,[0,1,2,3,4,5,...| -2.3900749360807416|
|(10,[0,1,2,3,4,5,...| -1.7095803389935507|
|(10,[0,1,2,3,4,5,...|  1.9697853546168298|
|(10,[0,1,2,3,4,5,...|  2.6064631488094805|
|(10,[0,1,2,3,4,5,...| -1.5652293896012985|
|(10,[0,1,2,3,4,5,...|   4.058423368844071|
|(10,[0,1,2,3,4,5,...|  -2.052139072789524|
|(10,[0,1,2,3,4,5,...|-0.16927446308127186|
|(10,[0,1,2,3,4,5,...| -1.8664706064505296|
|(10,[0,1,2,3,4,5,...| -0.8652059924977407|
|(10,[0,1,2,3,4,5,...| -0.7117727780519789|
|(10,[0,1,2,3,4,5,...| -3.3680932592817476|
|(10,[0,1,2,3,4,5,...| 0.46160414050568543|
|(10,[0,1,2,3,4,5,...|   3.88215

In [24]:
predictions.limit(5).toPandas()

Unnamed: 0,features,prediction
0,"(0.4572552704218824, -0.576096954000229, -0.20...",3.010618
1,"(-0.47083104147202404, 0.28748860067800597, 0....",-3.665799
2,"(-0.5195354431261132, 0.8080357948412571, 0.84...",-1.121703
3,"(0.4797855980916854, 0.01997502546020402, -0.8...",4.558367
4,"(-0.01772446594568744, 0.563282914714494, 0.14...",2.253483


# Linear Regression Example 2

In [25]:
!head -2 ../data/Ecommerce-Customers.csv

Email,Address,Avatar,Avg Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
mstephenson@fernandez.com,"835 Frank TunnelWrightmouth, MI 82180-9605",Violet,34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615,587.9510539684005


In [29]:
data = spark.read.csv('../data/Ecommerce_Customers.csv', header=True, inferSchema=True)
print(data.count())
data.limit(5).toPandas()

Unnamed: 0,Email,Address,Avatar,Avg Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
0,mstephenson@fernandez.com,"835 Frank TunnelWrightmouth, MI 82180-9605",Violet,34.497268,12.655651,39.577668,4.082621,587.951054
1,hduke@hotmail.com,"4547 Archer CommonDiazchester, CA 06566-8576",DarkGreen,31.926272,11.109461,37.268959,2.664034,392.204933
2,pallen@yahoo.com,"24645 Valerie Unions Suite 582Cobbborough, DC ...",Bisque,33.000915,11.330278,37.110597,4.104543,487.547505
3,riverarebecca@gmail.com,"1414 David ThroughwayPort Jason, OH 22070-1220",SaddleBrown,34.305557,13.717514,36.721283,3.120179,581.852344
4,mstephens@davidson-herman.com,"14023 Rodriguez PassagePort Jacobville, PR 372...",MediumAquaMarine,33.330673,12.795189,37.536653,4.446308,599.406092


In [30]:
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [31]:
from pyspark.ml.feature import VectorAssembler

In [33]:
print(data.columns)

['Email', 'Address', 'Avatar', 'Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership', 'Yearly Amount Spent']


In [34]:
inputCols = ['Avg Session Length', 'Time on App','Time on Website',
             'Length of Membership', 'Yearly Amount Spent']
assembler = VectorAssembler(inputCols=inputCols, outputCol='features')

In [35]:
output = assembler.transform(data)

In [36]:
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [37]:
output.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826, 587.9511]))]

In [39]:
output.limit(1).toPandas()

Unnamed: 0,Email,Address,Avatar,Avg Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent,features
0,mstephenson@fernandez.com,"835 Frank TunnelWrightmouth, MI 82180-9605",Violet,34.497268,12.655651,39.577668,4.082621,587.951054,"[34.49726772511229, 12.65565114916675, 39.5776..."


In [40]:
final_data = output.select('features', 'Yearly Amount Spent')
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

In [42]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                358|
|   mean|  498.3030800811718|
| stddev|  82.68661094963576|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [43]:
lr = LinearRegression(labelCol='Yearly Amount Spent')

In [44]:
lr_model = lr.fit(train_data)

In [45]:
test_results = lr_model.evaluate(test_data)

In [46]:
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|3.410605131648481...|
|-1.70530256582424...|
|-2.27373675443232...|
|-5.11590769747272...|
|4.547473508864641...|
|                 0.0|
|1.136868377216160...|
|6.821210263296962...|
|-2.27373675443232...|
|-3.41060513164848...|
|1.705302565824240...|
|6.821210263296962...|
|-1.13686837721616...|
|-5.68434188608080...|
|-3.41060513164848...|
|-5.68434188608080...|
|-2.27373675443232...|
|3.410605131648481...|
|-3.41060513164848...|
|4.547473508864641...|
+--------------------+
only showing top 20 rows



In [47]:
test_results.rootMeanSquaredError

3.5798707070214937e-13

In [48]:
test_results.r2

1.0

In [49]:
unlabeled_data = test_data.select('features')
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[29.5324289670579...|
|[30.8364326747734...|
|[31.0613251567161...|
|[31.0662181616375...|
|[31.1695067987115...|
|[31.2606468698795...|
|[31.3091926408918...|
|[31.3123495994443...|
|[31.5147378578019...|
|[31.5261978982398...|
|[31.5316044825729...|
|[31.6098395733896...|
|[31.6253601348306...|
|[31.6610498227460...|
|[31.7207699002873...|
|[31.7216523605090...|
|[31.7242025238451...|
|[31.8512531286083...|
|[31.8854062999117...|
|[31.9096268275227...|
+--------------------+
only showing top 20 rows



In [50]:
preds = lr_model.transform(unlabeled_data)
preds.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...|408.64035107262714|
|[30.8364326747734...| 467.5019004269898|
|[31.0613251567161...|487.55545805790183|
|[31.0662181616375...|448.93329320767486|
|[31.1695067987115...|427.35653080229235|
|[31.2606468698795...| 421.3266312569514|
|[31.3091926408918...| 432.7207178399335|
|[31.3123495994443...|463.59141802793994|
|[31.5147378578019...|489.81248799646164|
|[31.5261978982398...|409.09452619233815|
|[31.5316044825729...| 436.5156057293624|
|[31.6098395733896...| 444.5455496511075|
|[31.6253601348306...| 376.3369007569243|
|[31.6610498227460...| 416.3583535799009|
|[31.7207699002873...| 538.7749334780233|
|[31.7216523605090...| 347.7769266318727|
|[31.7242025238451...| 503.3878872879607|
|[31.8512531286083...|472.99224666679805|
|[31.8854062999117...|390.10327297247585|
|[31.9096268275227...| 563.4460356732387|
+--------------------+------------