# Apache Spark ML Training

Dataset: https://www.kaggle.com/competitions/avazu-ctr-prediction/overview

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark import __version__

# import matplotlib.pyplot as plt
# import pyspark.pandas as ps
import pandas as pd

# Remove pandas' limits on displayed columns and on column width so wide
# frames render in full.
for _display_option in ("display.max_columns", "display.max_colwidth"):
    pd.set_option(_display_option, None)

print(f"PySpark {__version__}")


PySpark 3.2.1


In [2]:
# Start Spark Standalone Cluster
# $SPARK_HOME/sbin/start-all.sh

In [3]:

# Create (or reuse) a session against the standalone master on host
# "carloshkayser", with spark.executor.memory set to 24g.
session_builder = (
    SparkSession.builder
    .appName("Spark ML")
    .master("spark://carloshkayser:7077")
    .config("spark.executor.memory", "24g")
)
spark = session_builder.getOrCreate()

# Last expression of the cell: display the session summary.
spark

22/06/15 00:06:20 WARN Utils: Your hostname, carloshkayser resolves to a loopback address: 127.0.1.1; using 10.32.45.215 instead (on interface ens160)
22/06/15 00:06:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/15 00:06:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/15 00:06:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
from pyspark.sql.types import StructType, StructField, FloatType, StringType, LongType, IntegerType, DoubleType

# Explicit schema for the click-through-rate CSV, written as
# (column name, Spark type) pairs in declaration order.
# Every field is declared nullable.
_column_types = [
    ("id", LongType()),
    ("click", FloatType()),
    ("hour", IntegerType()),
    ("C1", IntegerType()),
    ("banner_pos", IntegerType()),
    ("site_id", StringType()),
    ("site_domain", StringType()),
    ("site_category", StringType()),
    ("app_id", StringType()),
    ("app_domain", StringType()),
    ("app_category", StringType()),
    ("device_id", StringType()),
    ("device_ip", StringType()),
    ("device_model", StringType()),
    ("device_type", IntegerType()),
    ("device_conn_type", IntegerType()),
    ("C14", IntegerType()),
    ("C15", IntegerType()),
    ("C16", IntegerType()),
    ("C17", IntegerType()),
    ("C18", IntegerType()),
    ("C19", IntegerType()),
    ("C20", IntegerType()),
    ("C21", IntegerType()),
]

schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _column_types]
)

In [5]:
# Load the training CSV (first row is a header) with the explicit schema
# instead of letting Spark infer column types.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(schema)
    .load('../dataset/click-through-rate-prediction/train.gz')
)


In [6]:
# Drop rows that contain nulls, keep at most 1,000,000 rows, and rename the
# "click" column to "label" (the label column name the classifier uses by default).
df = (
    df.na.drop()
    .limit(1000000)
    .withColumnRenamed("click", "label")
)

In [7]:
# Count the rows left after the null drop and the 1,000,000-row limit.
# This is the first cell that makes Spark execute a job on the cluster.
df.count()

22/06/15 00:06:43 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/06/15 00:06:58 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/06/15 00:07:13 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/06/15 00:07:28 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/06/15 00:07:43 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
22/06/15 00:07:58 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure th

1000000

In [8]:
# Hold out roughly 30% of the rows for evaluation and train on the remaining ~70%.
# A fixed seed makes the split repeatable across kernel restarts; without it
# every run trains and evaluates on a different partition of the data.
SPLIT_SEED = 42
test_data, raw_training_data = df.randomSplit([0.3, 0.7], seed=SPLIT_SEED)


In [9]:
# Print the column names and types of the training split; the target column
# appears as "label" after the earlier rename.
raw_training_data.printSchema()


root
 |-- id: long (nullable = true)
 |-- label: float (nullable = true)
 |-- hour: integer (nullable = true)
 |-- C1: integer (nullable = true)
 |-- banner_pos: integer (nullable = true)
 |-- site_id: string (nullable = true)
 |-- site_domain: string (nullable = true)
 |-- site_category: string (nullable = true)
 |-- app_id: string (nullable = true)
 |-- app_domain: string (nullable = true)
 |-- app_category: string (nullable = true)
 |-- device_id: string (nullable = true)
 |-- device_ip: string (nullable = true)
 |-- device_model: string (nullable = true)
 |-- device_type: integer (nullable = true)
 |-- device_conn_type: integer (nullable = true)
 |-- C14: integer (nullable = true)
 |-- C15: integer (nullable = true)
 |-- C16: integer (nullable = true)
 |-- C17: integer (nullable = true)
 |-- C18: integer (nullable = true)
 |-- C19: integer (nullable = true)
 |-- C20: integer (nullable = true)
 |-- C21: integer (nullable = true)



In [10]:
from pyspark.ml.classification import LogisticRegression

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import MinMaxScaler

In [11]:
# Logistic regression estimator: at most 10 iterations, regularization
# parameter 0.01. Feature and label column names are left at their defaults.
lr = LogisticRegression(
    regParam=0.01,
    maxIter=10,
)

In [12]:
# Build one StringIndexer stage per string-typed column of the training split.
from pyspark.ml.feature import StringIndexer

cols = []             # all column names, string columns replaced by "<name>Index"
pipeline_stages = []  # one StringIndexer per string column
feature_columns = []  # the "<name>Index" output columns of those indexers

# The loop variables must not be called `type`: at notebook top level that
# rebinds the builtin `type` for every later cell in the same kernel.
for col_name, col_type in raw_training_data.dtypes:
    if col_type == "string":
        indexed_name = f"{col_name}Index"
        feature_columns.append(indexed_name)
        # NOTE(review): handleInvalid="skip" drops rows whose value was not seen
        # during fit, so transforming held-out data may return fewer rows than
        # it was given. Confirm this is intended for evaluation.
        pipeline_stages.append(
            StringIndexer(inputCol=col_name, outputCol=indexed_name, handleInvalid="skip")
        )
        cols.append(indexed_name)
    else:
        # Non-string columns are passed through under their original name.
        cols.append(col_name)


In [13]:
# One-hot encoder over the indexed categorical columns.
# Each pair is (indexed input column, one-hot output column).
_ohe_column_pairs = [
    ('site_idIndex', 'site_id_ohe'),
    ('site_domainIndex', 'site_domain_ohe'),
    ('site_categoryIndex', 'site_category_ohe'),
    ('app_idIndex', 'app_id_ohe'),
    ('app_domainIndex', 'app_domain_ohe'),
    ('app_categoryIndex', 'app_category_ohe'),
    ('device_idIndex', 'device_id_ohe'),
    ('device_ipIndex', 'device_ip_ohe'),
    ('device_modelIndex', 'device_model_ohe'),
]
feature_columns = [indexed for indexed, _ in _ohe_column_pairs]
output_ohe_columns = [encoded for _, encoded in _ohe_column_pairs]

ohe = OneHotEncoder(inputCols=feature_columns, outputCols=output_ohe_columns)


In [14]:
from pyspark.ml.feature import MinMaxScaler

# Integer-typed columns that are scaled before being fed to the model.
inputs = [
    "hour", "C1", "banner_pos", "device_type", "device_conn_type",
    "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21",
]

# Pack the numeric columns into a single vector column, then min-max scale it.
assembler1 = VectorAssembler(outputCol="features_scaled1", inputCols=inputs)
scaler = MinMaxScaler(outputCol="features_scaled", inputCol="features_scaled1")


In [15]:
# Second assembler: concatenate the scaled numeric vector with the one-hot
# encoded columns into the final "features" column.
final_feature_inputs = ['features_scaled'] + output_ohe_columns
assembler2 = VectorAssembler(inputCols=final_feature_inputs, outputCol="features")


In [16]:
from pyspark.ml import Pipeline

# Stage order: string indexers, numeric assembling and scaling, one-hot
# encoding, final feature assembly, then the classifier.
myStages = [*pipeline_stages, assembler1, scaler, ohe, assembler2, lr]

pipeline = Pipeline(stages=myStages)

# Fit every stage on the training split.
pModel = pipeline.fit(raw_training_data)

# Score the same training rows with the fitted pipeline.
trainingPred = pModel.transform(raw_training_data)

# Preview the actual label next to the predicted probabilities and class.
trainingPred.select('label', 'probability', 'prediction').show()

22/06/15 00:17:06 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
22/06/15 00:17:27 WARN DAGScheduler: Broadcasting large task binary with size 9.5 MiB
22/06/15 00:17:31 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
22/06/15 00:17:47 WARN DAGScheduler: Broadcasting large task binary with size 9.6 MiB
22/06/15 00:18:36 WARN DAGScheduler: Broadcasting large task binary with size 35.5 MiB
22/06/15 00:18:51 WARN DAGScheduler: Broadcasting large task binary with size 35.5 MiB
22/06/15 00:19:05 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/06/15 00:19:05 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/06/15 00:19:05 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/06/15 00:19:05 WARN BLAS: Failed

+-----+--------------------+----------+
|label|         probability|prediction|
+-----+--------------------+----------+
|  0.0|[0.73177397072966...|       0.0|
|  0.0|[0.99430990977573...|       0.0|
|  0.0|[0.99251657844327...|       0.0|
|  0.0|[0.98793467334425...|       0.0|
|  0.0|[0.99140607237181...|       0.0|
|  1.0|[0.05275639135396...|       1.0|
|  0.0|[0.98696397007111...|       0.0|
|  0.0|[0.99029424100199...|       0.0|
|  0.0|[0.72063436022881...|       0.0|
|  0.0|[0.98635162555681...|       0.0|
|  0.0|[0.74332604504987...|       0.0|
|  0.0|[0.99736128272305...|       0.0|
|  0.0|[0.99117274337270...|       0.0|
|  1.0|[0.02297802281914...|       1.0|
|  0.0|[0.96260789092916...|       0.0|
|  1.0|[0.31539757863386...|       1.0|
|  0.0|[0.98475567498695...|       0.0|
|  0.0|[0.98210009314176...|       0.0|
|  0.0|[0.98117998079269...|       0.0|
|  0.0|[0.99013585648942...|       0.0|
+-----+--------------------+----------+
only showing top 20 rows



                                                                                

In [17]:
# Persist the fitted pipeline. A plain save() raises if the target path already
# exists, which breaks re-running the notebook; overwrite() replaces the
# previously saved model instead.
pModel.write().overwrite().save("model/spark-logistic-regression-model")

22/06/15 00:20:33 WARN TaskSetManager: Stage 90 contains a task of very large size (1188 KiB). The maximum recommended task size is 1000 KiB.
22/06/15 00:20:34 WARN TaskSetManager: Stage 94 contains a task of very large size (4669 KiB). The maximum recommended task size is 1000 KiB.
22/06/15 00:20:35 WARN TaskSetManager: Stage 112 contains a task of very large size (3040 KiB). The maximum recommended task size is 1000 KiB.


In [18]:
# Apply the fitted pipeline to the held-out split produced by randomSplit.
pred = pModel.transform(test_data)

In [19]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy = fraction of rows whose "prediction" equals "label".
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# `pred` was produced from test_data, so this is the held-out (test) accuracy.
accuracy = evaluator.evaluate(pred)

# Labelled "Test" rather than "Train": the metric is computed on the test split.
print("Test Accuracy = %g " % (accuracy))


22/06/15 00:21:00 WARN DAGScheduler: Broadcasting large task binary with size 38.4 MiB
[Stage 117:>                                                        (0 + 1) / 1]

Train Accuracy = 0.832906 


                                                                                