# Curso Big Data #7 - Linear Regression in PySpark 1

In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise create it under this app name.
session_builder = SparkSession.builder.appName('Simple_Linear_Regression')
spark = session_builder.getOrCreate()

#### 1. Importamos el CSV

In [2]:

import os

# Location of the e-commerce customers CSV. Set the ECOMMERCE_CSV environment
# variable to point at another copy; the fallback is the original local path.
CSV_PATH = os.environ.get('ECOMMERCE_CSV', 'C:/Users/pc/pruebas/Ecommerce_Customers.csv')

# header=True: the first row holds the column names.
# inferSchema=True: let Spark infer each column's type from the data.
df = spark.read.csv(CSV_PATH, inferSchema=True, header=True)

#### 2. Exploramos los datos cargados

In [5]:
# Print the column names and the types read for each column.
df.printSchema()

# Preview the first 5 rows.
df.show(5)

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|

In [6]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
# Select the columns from index 3 up to (but not including) the last one.
df.columns[3:-1]

['Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership']

In [7]:
# Columns from index 3 up to (but excluding) the last one are packed into a
# single vector column named 'features'.
feature_columns = df.columns[3:-1]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [8]:
# Add the assembled 'features' column, then keep only it and the label column.
assembled_df = assembler.transform(df)
final_df = assembled_df.select('features', 'Yearly Amount Spent')

In [9]:
# Preview the assembled feature vectors next to the label column.
final_df.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

#### 4. Separar los datos en entrenamiento y prueba
`randomSplit` divide aleatoriamente este DataFrame según los pesos proporcionados. Parámetros:
`weights`: lista de números de punto flotante (doubles) que indican el peso de cada partición. Los pesos se normalizan si no suman 1.0.

In [11]:
# Fix the seed so the 70/30 split, and every metric computed from it, can be
# reproduced on a fresh run. Outputs recorded from an unseeded run will differ.
SPLIT_SEED = 42
train_data, test_data = final_df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [12]:
# Summary statistics (count, mean, stddev, min, max) for the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                350|
|   mean| 501.09543622925236|
| stddev|  74.55397434549616|
|    min|  275.9184206503857|
|    max|  725.5848140556806|
+-------+-------------------+



In [13]:
# Summary statistics (count, mean, stddev, min, max) for the test split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                150|
|   mean| 495.15744299371414|
| stddev|  89.57690219886886|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



#### 5. Creamos el objeto para la regresión lineal y lo entrenamos

In [14]:
from pyspark.ml.regression import LinearRegression

# Configure the estimator, then fit it on the training split.
# `lr` ends up bound to the fitted model, which the later cells use.
lr_estimator = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
    predictionCol='prediction',
)
lr = lr_estimator.fit(train_data)

#### 6. Testeo de resultados

In [15]:
# Evaluate the fitted model on the held-out test split.
test_results = lr.evaluate(test_data)

In [16]:
# Root mean squared error of the predictions on the test split.
test_results.rootMeanSquaredError

10.579107982747972

In [17]:
# R² score on the test split.
test_results.r2

0.9859585805767769

In [18]:
# Summary statistics of the label over the full (unsplit) dataset.
final_df.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [19]:
# Keep only the feature vectors from the test split, dropping the label column.
unlabeled_data = test_data.select('features')

In [20]:
# Preview the label-free feature vectors.
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[30.4925366965402...|
|[30.8162006488763...|
|[30.9716756438877...|
|[31.0613251567161...|
|[31.0662181616375...|
|[31.1695067987115...|
|[31.2606468698795...|
|[31.3091926408918...|
|[31.3123495994443...|
|[31.3584771924370...|
|[31.3662121671876...|
|[31.4474464941278...|
|[31.5316044825729...|
|[31.6098395733896...|
|[31.6253601348306...|
|[31.6739155032749...|
|[31.7207699002873...|
|[31.8124825597242...|
|[31.9120759292006...|
|[31.9262720263601...|
+--------------------+
only showing top 20 rows



In [21]:
# Apply the fitted model to the unlabeled features; the result gains a 'prediction' column.
predictions = lr.transform(unlabeled_data)

In [22]:
# Show the test split with its true label values.
test_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[30.4925366965402...|  282.4712457199145|
|[30.8162006488763...|   266.086340948469|
|[30.9716756438877...|  494.6386097568927|
|[31.0613251567161...|  487.5554580579016|
|[31.0662181616375...| 448.93329320767435|
|[31.1695067987115...|  427.3565308022928|
|[31.2606468698795...|  421.3266312569514|
|[31.3091926408918...|  432.7207178399336|
|[31.3123495994443...|  463.5914180279406|
|[31.3584771924370...|  495.1759504494754|
|[31.3662121671876...|  430.5888825564849|
|[31.4474464941278...|   418.602742095224|
|[31.5316044825729...| 436.51560572936256|
|[31.6098395733896...| 444.54554965110816|
|[31.6253601348306...|  376.3369007569242|
|[31.6739155032749...|  475.7250679098812|
|[31.7207699002873...|   538.774933478023|
|[31.8124825597242...|  392.8103449837972|
|[31.9120759292006...|  387.5347163057077|
|[31.9262720263601...|  392.2049334443264|
+----------

In [23]:
# Show the model's predictions for the same feature vectors.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.4925366965402...|285.37331487902384|
|[30.8162006488763...| 281.1993172152738|
|[30.9716756438877...|486.59688129302117|
|[31.0613251567161...| 492.1278175699513|
|[31.0662181616375...|  460.780621905345|
|[31.1695067987115...| 414.5608446228325|
|[31.2606468698795...|420.15586796742514|
|[31.3091926408918...|428.38983046476255|
|[31.3123495994443...|444.60855001593677|
|[31.3584771924370...|489.65914959821157|
|[31.3662121671876...| 426.3506469716142|
|[31.4474464941278...|426.95761228334436|
|[31.5316044825729...|431.70195655949465|
|[31.6098395733896...|   425.74748962224|
|[31.6253601348306...| 379.0852229106076|
|[31.6739155032749...| 501.3913120026534|
|[31.7207699002873...| 546.2406380737009|
|[31.8124825597242...|395.36463368572004|
|[31.9120759292006...|388.40057971721603|
|[31.9262720263601...|380.14005155659675|
+--------------------+------------