In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Get the Spark session for this notebook (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName("LinearRegression")
    .getOrCreate()
)

In [None]:
# Load the customers CSV: the first row is the header and column types are
# inferred from the file contents.
data = (
    spark.read
    .options(header = True, inferSchema = True)
    .csv("/kaggle/input/pyspark-ml/Ecommerce_Customers.csv")
)

# Show the inferred column names and types.
data.printSchema()

In [None]:
# Preview the first 10 rows of the loaded data.
data.show(n = 10)

### Transformaciones

In [None]:
# Encode the string column "Avatar" as a numeric index column "AvatarIndex".
indexer = StringIndexer(inputCol = "Avatar", outputCol = "AvatarIndex")

# Make the cell safe to re-run: `data` is reassigned below, so on a second run
# it already contains "AvatarIndex" and the indexer would fail because its
# output column exists. Dropping a column that is absent is a no-op, so the
# first run is unaffected.
data = data.drop("AvatarIndex")

data = indexer.fit(data).transform(data)
data.show(10)

In [None]:
# Input columns that are packed into the single "features" vector column.
feature_columns = [
    "Avg Session Length",
    "Time on App",
    "Time on Website",
    "Length of Membership",
    "AvatarIndex",
]

assembler = VectorAssembler(inputCols = feature_columns, outputCol = "features")
output = assembler.transform(data)

# Print the assembled vectors in full (no truncation).
output.select("features").show(truncate = False)

In [None]:
# Keep only the feature vector and the target column.
final_data = output.select(["features", "Yearly Amount Spent"])

# Random split with weights 0.8 / 0.2; the fixed seed keeps the split repeatable.
train, test = final_data.randomSplit([0.8, 0.2], seed = 42)

In [None]:
# Summary statistics for the training split, then for the test split.
for split in (train, test):
    split.describe().show()

In [None]:
# Linear regression of "Yearly Amount Spent" on the "features" vector; the
# model writes its output to a column named "predictions".
lr = LinearRegression(
    featuresCol = "features",
    labelCol = "Yearly Amount Spent",
    predictionCol = "predictions",
)

# Fit on the training split only.
model = lr.fit(train)

# Report the fitted coefficients and intercept.
print("Coeficientes: {}".format(model.coefficients))
print("\n")
print("Intercepcion:{}".format(model.intercept))

In [None]:
# Evaluate the fitted model on the held-out test split. Despite its name,
# `y_hat` is the evaluation summary (predictions, residuals, metrics), not a
# column of predicted values.
y_hat = model.evaluate(dataset = test)

In [None]:
# Show the first 20 rows of the test-set predictions.
y_hat.predictions.show(n = 20)

In [None]:
# Show the first 20 rows of the test-set residuals.
y_hat.residuals.show(n = 20)

In [None]:
# Test-set error metrics taken from the evaluation summary.
rmse = y_hat.rootMeanSquaredError
mse = y_hat.meanSquaredError
r2 = y_hat.r2

print("RMSE: {}".format(rmse))
print("MSE: {}".format(mse))
print("r2: {}".format(r2))

In [None]:
################################################################################################################################