In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("PadelLogReg")
    .getOrCreate()
)

# Load the match log: semicolon-separated, Latin-1 encoded, with a header row.
# Column types are inferred by Spark.
data = (
    spark.read
    .option("encoding", "Latin1")
    .csv("padel.csv", sep=";", header=True, inferSchema=True)
)

data.head(3)

[Row(id=1, date='16/10/2021', location='Montgat', teammate='Martina', rival1='JoanF', rival2='Marta', result='W', score='4-6, 6-4, 6-4', team='9', indiv='8'),
 Row(id=2, date='06/11/2021', location='Cerdanyola', teammate='Martina', rival1='Andreu', rival2='Mireia', result='L', score='4-6, 6-2, 4-6', team='8', indiv='6'),
 Row(id=3, date='13/11/2021', location='Polinyà', teammate='Martina', rival1='Uri', rival2='Carol', result='W', score='3-6, 6-3, 6-2', team='8,5', indiv='8,25')]

In [2]:
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import DateType
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.functions import col, month, to_date

data = (
    data
    # The rating columns use a decimal comma (e.g. '8,5'), so they are read as
    # strings: swap the comma for a dot and cast to double.
    .withColumn("team", regexp_replace("team", ",", ".").cast("double"))
    .withColumn("indiv", regexp_replace("indiv", ",", ".").cast("double"))
    # Discard rows whose result is 'N'.
    .filter(col("result") != "N")
    # Parse the "date" column (dd/MM/yyyy strings) into a proper date type.
    .withColumn("date", to_date(col("date"), "dd/MM/yyyy"))
    # Derive the month number from the parsed date.
    .withColumn("month", month("date"))
)

In [3]:
# List the column names, then the inferred schema.
print(data.columns)
print("\n")
# printSchema() writes the schema itself and returns None; wrapping it in
# print() would emit a stray "None" after the schema.
data.printSchema()

['id', 'date', 'location', 'teammate', 'rival1', 'rival2', 'result', 'score', 'team', 'indiv', 'month']


root
 |-- id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- location: string (nullable = true)
 |-- teammate: string (nullable = true)
 |-- rival1: string (nullable = true)
 |-- rival2: string (nullable = true)
 |-- result: string (nullable = true)
 |-- score: string (nullable = true)
 |-- team: double (nullable = true)
 |-- indiv: double (nullable = true)
 |-- month: integer (nullable = true)

None


In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer


# String columns (four categorical features plus the target) that need a
# numeric index; each one is written to a new "<column>_index" column.
categorical_cols = ["location", "teammate", "rival1", "rival2", "result"]

indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index")
    for name in categorical_cols
]
# Individual handles to each indexing stage.
indexer1, indexer2, indexer3, indexer4, indexer5 = indexers

# Fit all indexers in one pipeline and append the index columns to the data.
pipeline = Pipeline(stages=indexers)
index_model = pipeline.fit(data)
indexed = index_model.transform(data)

print(indexed.columns)

['id', 'date', 'location', 'teammate', 'rival1', 'rival2', 'result', 'score', 'team', 'indiv', 'month', 'location_index', 'teammate_index', 'rival1_index', 'rival2_index', 'result_index']


In [5]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler


# Numeric inputs of the model: month, the two ratings and the indexed
# categorical columns.
feature_cols = [
    'month',
    'team',
    'indiv',
    'location_index',
    'teammate_index',
    'rival1_index',
    'rival2_index',
]

# Pack the inputs into a single "features" vector column.
# handleInvalid="skip" drops rows that contain invalid (e.g. null) values.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip",
)

output = assembler.transform(indexed)

In [6]:
# Keep only the feature vector and the indexed label, then preview the result.
model_cols = ("features", "result_index")
final_data = output.select(*model_cols)
final_data.show()

+--------------------+------------+
|            features|result_index|
+--------------------+------------+
|[10.0,9.0,8.0,2.0...|         0.0|
|[11.0,8.0,6.0,4.0...|         1.0|
|[11.0,8.5,8.25,1....|         0.0|
|[11.0,7.0,7.0,0.0...|         1.0|
|[12.0,7.0,7.0,2.0...|         0.0|
|[1.0,6.0,6.0,0.0,...|         1.0|
|[1.0,7.0,9.0,1.0,...|         1.0|
|[1.0,6.5,7.0,8.0,...|         1.0|
|[2.0,5.0,3.0,0.0,...|         1.0|
|[2.0,9.0,8.0,2.0,...|         0.0|
|[4.0,9.0,6.0,4.0,...|         0.0|
|[4.0,5.0,6.0,5.0,...|         0.0|
|[6.0,5.0,5.0,2.0,...|         0.0|
|[7.0,8.5,9.0,2.0,...|         0.0|
|[8.0,9.0,8.5,0.0,...|         0.0|
|[9.0,9.0,8.0,6.0,...|         0.0|
|[9.0,2.0,7.0,4.0,...|         0.0|
|[9.0,5.0,7.0,3.0,...|         0.0|
|[10.0,8.0,7.0,0.0...|         1.0|
|[10.0,7.5,6.0,3.0...|         0.0|
+--------------------+------------+
only showing top 20 rows



In [7]:
# Modelling input: the assembled feature vector plus the indexed label.
final_data = output.select(["features", "result_index"])

In [8]:
# Fixed seed so the 70/30 split (and therefore the fitted model and its
# metrics) is the same on every Restart & Run All.
SPLIT_SEED = 42
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Summary statistics of each partition, to check sizes and label balance.
train_data.describe().show()
test_data.describe().show()

+-------+-------------------+
|summary|       result_index|
+-------+-------------------+
|  count|                 29|
|   mean|0.27586206896551724|
| stddev|  0.454858826147342|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+

+-------+------------------+
|summary|      result_index|
+-------+------------------+
|  count|                 6|
|   mean|0.3333333333333333|
| stddev|0.5163977794943223|
|    min|               0.0|
|    max|               1.0|
+-------+------------------+



In [9]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled "features" vector, predicting the
# indexed match result.
lr_padel = LogisticRegression(
    featuresCol="features",
    labelCol="result_index",
)
fitted_padel_model = lr_padel.fit(train_data)

In [10]:
# Training summary of the fitted model; compare the label and prediction
# distributions on the training set.
training_sum = fitted_padel_model.summary
train_predictions = training_sum.predictions
train_predictions.describe().show()

+-------+-------------------+------------------+
|summary|       result_index|        prediction|
+-------+-------------------+------------------+
|  count|                 29|                29|
|   mean|0.27586206896551724|0.1724137931034483|
| stddev|  0.454858826147342|0.3844258722192448|
|    min|                0.0|               0.0|
|    max|                1.0|               1.0|
+-------+-------------------+------------------+



In [11]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out rows and display the per-row predictions.
pred_and_labels = fitted_padel_model.evaluate(test_data)
test_scored = pred_and_labels.predictions
test_scored.show()

+--------------------+------------+--------------------+--------------------+----------+
|            features|result_index|       rawPrediction|         probability|prediction|
+--------------------+------------+--------------------+--------------------+----------+
|[1.0,7.0,9.0,1.0,...|         1.0|[1.31158789870238...|[0.78777874747933...|       0.0|
|[6.0,5.0,5.0,2.0,...|         0.0|[-1.9809213190807...|[0.12122065900590...|       1.0|
|[7.0,8.5,9.0,2.0,...|         0.0|[24.7123139295793...|[0.99999999998148...|       0.0|
|[9.0,9.0,8.0,6.0,...|         0.0|[1.42833210055752...|[0.80664130468106...|       0.0|
|[10.0,3.5,5.0,0.0...|         1.0|[26.4248435711297...|[0.99999999999665...|       0.0|
|[10.0,6.0,5.0,1.0...|         0.0|[5.85564556545538...|[0.99714449097479...|       0.0|
+--------------------+------------+--------------------+--------------------+----------+



In [12]:
# Area under the ROC curve must be computed from the model's continuous
# scores. Feeding it the thresholded 0/1 "prediction" column collapses the
# curve to a single operating point and discards the ranking information,
# so the evaluator reads the "rawPrediction" column instead.
padel_eval = BinaryClassificationEvaluator(
    labelCol="result_index",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

In [13]:
# Evaluate the configured metric on the test-set predictions and report it.
test_predictions = pred_and_labels.predictions
auc = padel_eval.evaluate(test_predictions)
print("AUC: ", auc)

AUC:  0.375
