In [1]:
from pyspark.sql import SparkSession

# Start a Spark session named 'LR', or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('LR')
    .getOrCreate()
)

In [4]:
# CSV files carry no schema information: the first line is read as the header
# and the column types are inferred from the data.
df = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [7]:
# Print the inferred column types. Spark has both float and double; double is
# double-precision floating point (twice the storage of a float, more precise).
df.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [10]:
# A multi-terabyte DataFrame cannot be converted to pandas, but the describe()
# summary is only a handful of rows, so toPandas() is safe here.
df.describe().toPandas()

Unnamed: 0,summary,Email,Address,Avatar,Avg Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
0,count,500,500,500,500.0,500.0,500.0,500.0,500.0
1,mean,,,,33.05319351819619,12.052487937166134,37.06044542094859,3.533461555915055,499.3140382585909
2,stddev,,,,0.9925631110845354,0.9942156084725424,1.0104889067564031,0.9992775024112583,79.3147815497068
3,min,aaron04@yahoo.com,"0001 Mack MillNorth Jennifer, NE 42021-5936",AliceBlue,29.532428967057943,8.508152176032603,33.91384724758464,0.2699010899842742,256.67058229005585
4,max,zscott@wright.com,Unit 7502 Box 8345DPO AE 53747,YellowGreen,36.13966248879052,15.126994288792469,40.005181638101895,6.922689335035808,765.5184619388373


Avatar, dirección y email no sirven para predecir si un cliente va a gastar más o menos.  
Objetivo: predecir el gasto anual (`Yearly Amount Spent`).

In [11]:
from pyspark.ml.feature import VectorAssembler 

Todos los algoritmos de ML en Spark trabajan con una columna de entrada (características) y una de salida (predicción).
Si queremos usar varias características, hay que combinarlas en una sola columna vectorial con `VectorAssembler`.

In [12]:
# VectorAssembler takes several input columns and returns a single output
# column holding them as one vector; that column is what the regression
# algorithm will use as its features.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)
output = assembler.transform(df)

In [15]:
# Peek at the first assembled row: one vector holding the four feature values.
output.select('features').head(1)

[Row(features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [16]:
# Keep only what the model needs: the assembled feature vector and the label.
final_data = output.select('features', 'Yearly Amount Spent')

In [17]:
# Hold out roughly 30% of the rows for validation and train on the other 70%.
# The weights are probabilities, so the split sizes are approximate.
# A fixed seed makes the split, and therefore every metric computed from it,
# repeatable across "Restart & Run All" (as long as the input file and its
# partitioning do not change). The outputs recorded in this notebook came from
# an unseeded run, so the exact counts and metrics will differ after re-running.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [18]:
# Compare the label distribution of the training split, the test split and the
# full data set, in that order. Only the numeric label is summarised; the
# vector `features` column does not appear in the describe() output.
for split_df in (train_data, test_data, final_data):
    split_df.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                354|
|   mean| 499.11172129771825|
| stddev|  82.25682312782251|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                146|
|   mean|  499.8045876020772|
| stddev|  71.94890622966321|
|    min|  319.9288698031936|
|    max|  708.9351848669818|
+-------+-------------------+

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [20]:
# Linear regression model
from pyspark.ml.regression import LinearRegression

In [21]:
# Configure the algorithm that will produce the model: it reads the assembled
# `features` column (the default, spelled out here) and is told which column
# holds the label to predict.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [22]:
# Fit the linear model on the training split only.
lr_model = lr.fit(train_data)

In [23]:
# Evaluate the fitted model on the test split.
test_results = lr_model.evaluate(test_data)

In [26]:
# Residuals: the per-row errors, i.e. the difference between the actual value
# and the prediction.
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-11.385311839158135|
| -5.537323494646046|
| 22.937872288864583|
| 3.4248212158030356|
| -8.327915253944525|
|  3.459465289418006|
|-13.755133930570764|
|-1.6311747176306426|
| 17.735403816355586|
|-1.6046097389952934|
|-16.883659605909145|
|-1.6691148252779726|
|    8.9683140558937|
| -2.297744296775136|
|  -9.16869543383308|
| 11.932387802805692|
|-1.4869914686441348|
| 11.734145884356451|
|-13.696005446861136|
|  8.089016122781572|
+-------------------+
only showing top 20 rows



In [28]:
# RMSE on the test split; acceptable relative to the scale of the spend values
# (mean ≈ 499, stddev ≈ 79 in the describe() output).
test_results.rootMeanSquaredError

10.74694909738539

In [29]:
# R²: the share of the label's variance that the model explains; ~98% is very good.
test_results.r2

0.9775349781700026

In [None]:
# How to apply the model in practice:
# start from data that has no labels.

In [30]:
# Simulate unlabeled data by keeping only the feature vectors of the test split.
unlabeled_data = test_data.select('features')

In [33]:
# Preview the unlabeled feature vectors (show() prints the first 20 rows).
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[30.3931845423455...|
|[31.0613251567161...|
|[31.2834474760581...|
|[31.3091926408918...|
|[31.4474464941278...|
|[31.5316044825729...|
|[31.5741380228732...|
|[31.5761319713222...|
|[31.6098395733896...|
|[31.7216523605090...|
|[31.8164283341993...|
|[31.8186165667690...|
|[31.8512531286083...|
|[31.8530748017465...|
|[31.8854062999117...|
|[31.9096268275227...|
|[31.9120759292006...|
|[31.9262720263601...|
|[31.9365486184489...|
|[31.9549038566348...|
+--------------------+
only showing top 20 rows



In [34]:
# Score the unlabeled rows. Like the assembler's transform(), this returns the
# input DataFrame with an extra column, here `prediction`.
predictions = lr_model.transform(unlabeled_data)

In [35]:
# Show each feature vector next to its predicted yearly spend.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.3931845423455...|331.31418164235174|
|[31.0613251567161...|493.09278155254765|
|[31.2834474760581...| 568.8432171368029|
|[31.3091926408918...| 429.2958966241306|
|[31.4474464941278...|426.93065734916854|
|[31.5316044825729...|433.05614043994456|
|[31.5741380228732...| 558.1644060911576|
|[31.5761319713222...|  542.857758706959|
|[31.6098395733896...| 426.8101458347526|
|[31.7216523605090...|349.38153637086793|
|[31.8164283341993...| 518.0061511095655|
|[31.8186165667690...| 448.0877881954136|
|[31.8512531286083...| 464.0239326109047|
|[31.8530748017465...| 461.5828677591271|
|[31.8854062999117...| 399.2719684063086|
|[31.9096268275227...| 551.5136478704335|
|[31.9120759292006...|389.02170777435185|
|[31.9262720263601...|380.47078755996995|
|[31.9365486184489...|440.89539034218933|
|[31.9549038566348...| 431.9088638171454|
+--------------------+------------