In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# import matplotlib.pyplot as plt
# import pyspark.pandas as ps
import pandas as pd

# Lift pandas' display limits so previews produced with .toPandas() show
# every column and the full content of each cell.
for option_name in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(option_name, None)

# Create (or reuse) a local Spark session with a larger driver heap and a
# larger Kryo serializer buffer ceiling.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "24g")
    .config("spark.kryoserializer.buffer.max", "512m")
    .getOrCreate()
)

# Disabled alternative setting:
# .config("spark.executor.memory", "8g")

# Display the session summary as the cell output.
spark

In [None]:
from pyspark.sql.types import StructType, StructField, FloatType, StringType, LongType, IntegerType

# Explicit schema for the training CSV: every field is nullable, and the
# order below is the column order applied when the file is read.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ('id', LongType()),
        ("click", FloatType()),
        ("hour", IntegerType()),
        ("C1", IntegerType()),
        ("banner_pos", IntegerType()),
        ("site_id", StringType()),
        ("site_domain", StringType()),
        ("site_category", StringType()),
        ("app_id", StringType()),
        ("app_domain", StringType()),
        ("app_category", StringType()),
        ("device_id", StringType()),
        ("device_ip", StringType()),
        ("device_model", StringType()),
        ("device_type", IntegerType()),
        ("device_conn_type", IntegerType()),
        ("C14", IntegerType()),
        ("C15", IntegerType()),
        ("C16", IntegerType()),
        ("C17", IntegerType()),
        ("C18", IntegerType()),
        ("C19", IntegerType()),
        ("C20", IntegerType()),
        ("C21", IntegerType()),
    ]
])

In [None]:
# Load training data
# training = spark.read.format("csv") \
#     .option("header", "true") \
#     .option("inferSchema", "true") \
#     .load('../dataset/click-through-rate-prediction/train.gz')

# Read the gzipped training CSV with the explicit schema; the first line of
# the file is treated as a header row.
training = spark.read.csv(
    '../dataset/click-through-rate-prediction/train.gz',
    schema=schema,
    header=True,
)


In [None]:
# Work on a 1000-row sample and discard any row that contains a null.
raw_training_data = training.limit(1000).dropna()

In [None]:
# Rename the target column from "click" to "label".
raw_training_data = raw_training_data.withColumnRenamed("click", "label")

# Print the column names and Spark types after the rename.
raw_training_data.printSchema()


In [None]:
# Display the list of column names of the sampled training frame.
raw_training_data.columns


In [None]:
# Disabled experiment: drop the "id" column from the full training frame.
# training = training.select([col for col in training.columns if col != "id"])
# Preview the first 10 rows as a pandas DataFrame.
raw_training_data.limit(10).toPandas()

In [None]:
# Summary statistics produced by DataFrame.describe(), shown via pandas.
raw_training_data.describe().toPandas()

In [None]:
# Print the name and Spark type of every column (not only the string ones).
# The loop variable is `dtype` rather than `type`: a module-level `type`
# binding would shadow the builtin for the rest of the kernel session.
for name, dtype in raw_training_data.dtypes:
    print(name, dtype)

In [None]:
# from pyspark.ml.feature import Imputer

# imputer = Imputer(
#   inputCols=["Glucose","BloodPressure","SkinThickness","BMI","Insulin"], 
#   outputCols=["Glucose","BloodPressure","SkinThickness","BMI","Insulin"]
# )

# model = imputer.fit(raw_training_data)
# raw_training_data = model.transform(raw_training_data)

# raw_training_data.show(5)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

cols = []             # columns kept after indexing (strings replaced by "<name>Index")
pipeline_stages = []  # one StringIndexer per string column
feature_columns = []  # names of the generated "<name>Index" columns

# `dtype` (not `type`) is used as the loop variable so the builtin `type`
# is not shadowed for the rest of the kernel session.
for name, dtype in raw_training_data.dtypes:
    if dtype == "string":
        indexed_name = f"{name}Index"
        feature_columns.append(indexed_name)
        pipeline_stages.append(StringIndexer(inputCol=name, outputCol=indexed_name))
        cols.append(indexed_name)
    else:
        cols.append(name)

print(feature_columns)

# Fit all indexers on the sample and apply them, then keep only `cols`
# (this drops the original string columns).
raw_training_data = Pipeline(stages=pipeline_stages).fit(raw_training_data).transform(raw_training_data)

raw_training_data = raw_training_data.select(cols)

raw_training_data.limit(10).toPandas()

In [None]:
# Display the current contents of `cols`.
cols

In [None]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode every indexed column; each output column is named after its
# input with an "_ohe" suffix appended.
encoder = OneHotEncoder(
    inputCols=feature_columns,
    outputCols=[f"{indexed_name}_ohe" for indexed_name in feature_columns],
)

# Fit the encoder on the indexed sample, then add the encoded columns.
model = encoder.fit(raw_training_data)
encoded = model.transform(raw_training_data)

# Preview the first 10 encoded rows.
encoded.limit(10).toPandas()

In [None]:
# Start from every column of the encoded frame, then take out each
# intermediate "<name>Index" column listed in `feature_columns`.
cols = list(encoded.columns)

for indexed_name in feature_columns:
    del cols[cols.index(indexed_name)]

# Display the remaining column names.
cols

In [None]:
from pyspark.ml.feature import VectorAssembler

# Exclude "id" and "label" from the assembler inputs. Filtering (instead of
# list.remove) does not raise when this cell is executed a second time.
cols = [name for name in cols if name not in ("id", "label")]

assembler = VectorAssembler(inputCols=cols, outputCol="features")

# DataFrame.drop is a no-op when the column is absent, so dropping a
# "features" column left by a previous run makes the transform re-runnable.
encoded = assembler.transform(encoded.drop("features"))
encoded.select("features").toPandas()


In [None]:
from pyspark.ml.feature import StandardScaler

# Scale the assembled "features" vector into a new "Scaled_features" column.
standardscaler = StandardScaler(inputCol="features", outputCol="Scaled_features")
encoded = standardscaler.fit(encoded).transform(encoded)

# Show the raw and scaled vectors side by side for the first 5 rows.
encoded.select("features", "Scaled_features").show(5)

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the "label" column.
# NOTE(review): no featuresCol is given, so the estimator's default feature
# column is used rather than "Scaled_features" - confirm this is intended.
lr = LogisticRegression(
    labelCol="label",
    maxIter=10,
    regParam=0.01,
)

model = lr.fit(encoded)

# Score the same rows the model was fitted on.
predict_train = model.transform(encoded)
# predict_test=model.transform(test)
predict_train.select("label", "prediction", "probability").show(10)

In [None]:
# Preview the scored rows. Only the first 10 rows are collected to pandas,
# matching the other preview cells, instead of materialising every scored
# row (including its feature vectors) on the driver.
model.transform(encoded).limit(10).toPandas()

In [105]:
# Rebuild the working sample from the raw training frame: take 1000 rows,
# rename the target column to "label", then drop rows containing nulls.
raw_training_data = (
    training
    .limit(1000)
    .withColumnRenamed("click", "label")
    .dropna()
)

In [106]:
# Create the logistic regression estimator (10 iterations, regParam 0.01);
# label and feature column names are left at the estimator's defaults.
lr = LogisticRegression(maxIter=10, regParam= 0.01)

In [107]:
# Create one StringIndexer per string column.
from pyspark.ml.feature import StringIndexer

cols = []             # all columns, with string columns replaced by "<name>Index"
pipeline_stages = []  # the StringIndexer stages, in column order
feature_columns = []  # names of the generated "<name>Index" columns

# `dtype` (not `type`) is used as the loop variable so the builtin `type`
# is not shadowed for the rest of the kernel session.
for name, dtype in raw_training_data.dtypes:
    if dtype == "string":
        indexed_name = f"{name}Index"
        feature_columns.append(indexed_name)
        pipeline_stages.append(StringIndexer(inputCol=name, outputCol=indexed_name))
        cols.append(indexed_name)
    else:
        cols.append(name)


In [108]:
# Create a one hot encoder
from pyspark.ml.feature import OneHotEncoder

# Derive the encoder's column names from the string columns actually present
# in the data instead of hard-coding them, so they cannot drift from the
# "<name>Index" columns produced by the StringIndexer stages.
categorical_columns = [name for name, dtype in raw_training_data.dtypes if dtype == "string"]
feature_columns = [f"{name}Index" for name in categorical_columns]
output_ohe_columns = [f"{name}_ohe" for name in categorical_columns]

ohe = OneHotEncoder(inputCols=feature_columns, outputCols=output_ohe_columns)


In [109]:
from pyspark.ml.feature import MinMaxScaler

# Columns that are assembled into one vector and then min-max scaled.
inputs = [
    "hour", "C1", "banner_pos",
    "device_type", "device_conn_type",
    "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21",
]

# Stage 1 packs the columns into "features_scaled1" (still unscaled);
# stage 2 rescales that vector into "features_scaled".
assembler1 = VectorAssembler(outputCol="features_scaled1", inputCols=inputs)
scaler = MinMaxScaler(outputCol="features_scaled", inputCol="features_scaled1")


In [110]:
# Second assembler: concatenates the scaled numeric vector with every
# one-hot encoded column into the final "features" vector.
assembler2 = VectorAssembler(
    inputCols=['features_scaled', *output_ohe_columns],
    outputCol="features",
)


In [111]:
# Stage order: string indexers, numeric assembling and scaling, one-hot
# encoding, final assembling, and the logistic regression estimator last.
myStages = [*pipeline_stages, assembler1, scaler, ohe, assembler2, lr]

# Wrap the stages in a single pipeline.
pipeline = Pipeline(stages=myStages)

# Fit every stage on the training sample.
pModel = pipeline.fit(raw_training_data)

# Score the same sample with the fitted pipeline.
trainingPred = pModel.transform(raw_training_data)

# Show the actual label next to the predicted probability and class.
trainingPred.select('label', 'probability', 'prediction').show()

+-----+--------------------+----------+
|label|         probability|prediction|
+-----+--------------------+----------+
|  0.0|[0.99448176328146...|       0.0|
|  0.0|[0.99802823200410...|       0.0|
|  0.0|[0.99295653865260...|       0.0|
|  1.0|[0.03449483439795...|       1.0|
|  0.0|[0.99386086643276...|       0.0|
|  0.0|[0.99795437760237...|       0.0|
|  0.0|[0.99295658072748...|       0.0|
|  0.0|[0.99880661696309...|       0.0|
|  0.0|[0.99457899494175...|       0.0|
|  0.0|[0.99157812715337...|       0.0|
|  1.0|[0.06347798172384...|       1.0|
|  0.0|[0.99814316731984...|       0.0|
|  0.0|[0.99211318819529...|       0.0|
|  1.0|[0.02023344102929...|       1.0|
|  0.0|[0.99804042434207...|       0.0|
|  0.0|[0.99448183108153...|       0.0|
|  0.0|[0.99457899494175...|       0.0|
|  0.0|[0.98886697514116...|       0.0|
|  0.0|[0.99208547417497...|       0.0|
|  0.0|[0.99814316731984...|       0.0|
+-----+--------------------+----------+
only showing top 20 rows



In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import MinMaxScaler, OneHotEncoder, StringIndexer, VectorAssembler

# Create the logistic regression model
lr = LogisticRegression(maxIter=10, regParam=0.01)

# Categorical (string) columns used by this variant of the pipeline;
# "site_id" is left out, as in the column list this cell already used.
# VectorAssembler does not accept string columns, so each one is first
# indexed and then one-hot encoded before it reaches the assembler.
# This expects raw_training_data to still contain the raw string columns.
categorical_inputs = ["site_domain", "site_category", "app_id", "app_domain", "app_category", "device_id", "device_ip", "device_model"]
indexed_columns = [f"{name}Index" for name in categorical_inputs]
encoded_columns = [f"{name}_ohe" for name in categorical_inputs]

indexers = [
    StringIndexer(inputCol=name, outputCol=indexed_name)
    for name, indexed_name in zip(categorical_inputs, indexed_columns)
]
ohe = OneHotEncoder(inputCols=indexed_columns, outputCols=encoded_columns)

# Input list for scaling
inputs = ["hour", "C1", "banner_pos", "device_type", "device_conn_type", "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21"]

# We scale our inputs
assembler1 = VectorAssembler(inputCols=inputs, outputCol="features_scaled1")
scaler = MinMaxScaler(inputCol="features_scaled1", outputCol="features_scaled")

# We create a second assembler for the encoded columns.
assembler2 = VectorAssembler(
    inputCols=encoded_columns + ['features_scaled'], outputCol="features"
)

# Create stages list
myStages = indexers + [assembler1, scaler, ohe, assembler2, lr]

# Set up the pipeline
pipeline = Pipeline(stages=myStages)

# We fit the model using the training data.
pModel = pipeline.fit(raw_training_data)

# We transform the data.
trainingPred = pModel.transform(raw_training_data)

# We select the actual label, probability and predictions
trainingPred.select('label', 'probability', 'prediction').show()

In [None]:
from pyspark.ml.feature import VectorAssembler

# Every column except the target becomes an assembler input. Columns this
# notebook itself generates ("features", "Scaled_features") are excluded as
# well, so executing the cell again neither fails on an existing output
# column nor feeds a previous feature vector back in as an input.
# NOTE(review): VectorAssembler rejects string columns, so this expects
# raw_training_data to already have its string columns indexed - confirm
# which version of raw_training_data this cell is meant to run on.
generated_columns = ("features", "Scaled_features")
cols = [
    name for name in raw_training_data.columns
    if name != "label" and name not in generated_columns
]

assembler = VectorAssembler(inputCols=cols, outputCol="features")

# Now let us use the transform method to transform our dataset
raw_training_data = assembler.transform(raw_training_data.drop("features"))
raw_training_data.select("features").show(truncate=False)

In [None]:
# Keep only the assembled feature vector and the target column for fitting.
training_data = raw_training_data.select("features", "label") #.withColumnRenamed("click", "label")
# Print the first rows of the two-column frame.
training_data.show()

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression estimator with regParam 0.3 and elasticNetParam 0.8,
# limited to 10 iterations.
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [None]:
# Fit the estimator on the (features, label) frame.
model = lr.fit(training_data)

In [None]:
from pyspark.sql.types import StructType, StructField, FloatType, StringType, LongType, IntegerType

# Schema of the test CSV: the same fields as the training schema except that
# there is no "click" column. It is bound to its own name so the training
# `schema` is not overwritten; re-running the training load after this cell
# would otherwise apply the wrong schema.
test_schema = StructType([
    StructField('id', LongType(), True),
    StructField("hour", IntegerType(), True),
    StructField("C1", IntegerType(), True),
    StructField("banner_pos", IntegerType(), True),
    StructField("site_id", StringType(), True),
    StructField("site_domain", StringType(), True),
    StructField("site_category", StringType(), True),
    StructField("app_id", StringType(), True),
    StructField("app_domain", StringType(), True),
    StructField("app_category", StringType(), True),
    StructField("device_id", StringType(), True),
    StructField("device_ip", StringType(), True),
    StructField("device_model", StringType(), True),
    StructField("device_type", IntegerType(), True),
    StructField("device_conn_type", IntegerType(), True),
    StructField("C14", IntegerType(), True),
    StructField("C15", IntegerType(), True),
    StructField("C16", IntegerType(), True),
    StructField("C17", IntegerType(), True),
    StructField("C18", IntegerType(), True),
    StructField("C19", IntegerType(), True),
    StructField("C20", IntegerType(), True),
    StructField("C21", IntegerType(), True)
])

# Load test data
test = spark.read.format("csv") \
    .option("header", "true") \
    .schema(test_schema) \
    .load('../dataset/click-through-rate-prediction/test.gz')


In [None]:
from pyspark.ml.feature import StringIndexer

cols = []             # all test columns, with string columns replaced by "<name>Index"
pipeline_stages = []  # one StringIndexer per string column of the test frame
feature_columns = []  # names of the generated "<name>Index" columns

# `dtype` (not `type`) is used as the loop variable so the builtin `type`
# is not shadowed for the rest of the kernel session.
for name, dtype in test.dtypes:
    if dtype == "string":
        indexed_name = f"{name}Index"
        feature_columns.append(indexed_name)
        pipeline_stages.append(StringIndexer(inputCol=name, outputCol=indexed_name))
        cols.append(indexed_name)
    else:
        cols.append(name)

cols

In [None]:
# Display the current contents of `cols`.
cols

In [None]:
from pyspark.ml import Pipeline

# Fit the stages in `pipeline_stages` on the test frame and apply them to it.
# NOTE(review): the stages are fitted on `test` itself, not reused from a fit
# on training data, so learned mappings need not match those seen by a model
# trained elsewhere - confirm this is intended.
test = Pipeline(stages=pipeline_stages).fit(test).transform(test)

# Keep only the columns listed in `cols`.
test = test.select(cols)

test.show()

In [None]:
# Every column currently in `test` becomes an assembler input.
cols = test.columns
# cols.remove("click")

# Let us import the vector assembler
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=cols, outputCol="features")

# Now let us use the transform method to transform our dataset
test = assembler.transform(test)
# Only the assembled vector is kept; all other columns are discarded.
test = test.select("features")

# Print the assembled vectors without truncating them.
test.show(truncate=False)

In [None]:
# Score the assembled test frame with whatever `model` currently refers to.
pred = model.transform(test)

In [None]:
from pyspark.ml.feature import StandardScaler

# Scale the "features" vector of the training sample into "Scaled_features".
standardscaler = StandardScaler(inputCol="features", outputCol="Scaled_features")
raw_training_data = standardscaler.fit(raw_training_data).transform(raw_training_data)

# Show the raw and scaled vectors for the first 5 rows.
raw_training_data.select("features", "Scaled_features").show(5)

In [None]:
from pyspark.ml.feature import HashingTF, Tokenizer, VectorAssembler

# Inputs for the feature vector: numeric columns as they are, string columns
# through their "<name>Index" counterpart. "id" is excluded, and so is
# "click": it is the target of `training`, and including it in the features
# would leak the label into the model.
assembler_inputs = [
    name if dtype != "string" else f"{name}Index"
    for name, dtype in training.dtypes
    if name not in ("id", "click")
]

# Creating Vector Assembler
vecAssembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
# Appends to the shared stage list; executing this cell twice adds the stage twice.
pipeline_stages.append(vecAssembler)


In [None]:
from pyspark.ml.classification import LogisticRegression

# Creating Logistic Regression Model.
# The pipeline built from these stages is fitted on `training`, whose target
# column is still called "click" (it is never renamed to "label" on that
# frame), so the label column is set explicitly.
lr = LogisticRegression(labelCol="click", maxIter=10, regParam=0.3, elasticNetParam=0.8)
# Appends to the shared stage list; executing this cell twice adds the stage twice.
pipeline_stages.append(lr)


In [None]:
from pyspark.ml import Pipeline

# Creating pipeline from whatever stages `pipeline_stages` currently holds.
pipeline = Pipeline(stages=pipeline_stages)


In [None]:
# Executing pipeline: fits every stage in `pipeline` on the full `training`
# frame (not the 1000-row sample).
lrModel = pipeline.fit(training)


In [None]:
# Explicit stopping point. The original bare `stop` only halted execution by
# raising NameError on an undefined name; raising deliberately gives a clear
# message and still stops if a variable named `stop` is ever defined.
raise RuntimeError("Notebook execution stopped intentionally at this cell.")

In [None]:

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

# Labeled training documents as (id, text, label) tuples. The tokenizer below
# reads a "text" column, which the click-through `training` frame does not
# have, so this example fits on its own small frame instead.
training_docs = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)

pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training_docs)

# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "spark hadoop spark"),
    (7, "apache hadoop")
], ["id", "text"])

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    # A separate name keeps the `prediction` DataFrame from being rebound
    # to a float by the unpacking.
    rid, text, prob, predicted_label = row  # type: ignore
    print(
        "(%d, %s) --> prob=%s, prediction=%f" % (
            rid, text, str(prob), predicted_label   # type: ignore
        )
    )


In [None]:
from pyspark.ml.classification import LogisticRegression

# Binomial logistic regression with regParam 0.3 and elasticNetParam 0.8.
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
# NOTE(review): no label/feature columns are configured here, while `training`
# as loaded in this notebook has raw columns such as "click" and no assembled
# feature vector - presumably this snippet expects a different frame; confirm.
lrModel = lr.fit(training)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# We can also use the multinomial family for binary classification
mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

# Fit the model
mlrModel = mlr.fit(training)

# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("Multinomial intercepts: " + str(mlrModel.interceptVector))
