In [1]:
# Imports 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import time

from pyspark.sql import SQLContext
from pyspark.sql import types
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler, StandardScaler, VectorIndexer, Normalizer
from pyspark.ml.linalg import VectorUDT
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, BinaryLogisticRegressionSummary
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Render matplotlib figures inline in the cell output and use the ggplot theme.
%matplotlib inline
plt.style.use('ggplot')

# (Re)load the autoreload extension; mode 2 re-imports edited modules before code runs.
%reload_ext autoreload
%autoreload 2

# store path to notebook
# `!pwd` captures the shell command's output lines; the first line is kept as a string.
# NOTE(review): this is the kernel's working directory - presumably the notebook's folder.
PWD = !pwd
PWD = PWD[0]

# start Spark Session
# getOrCreate() returns the already-running session if one exists, so re-running
# this cell does not start a second session.
from pyspark.sql import SparkSession
app_name = "hw5_notebook"
master = "local[*]"  # run Spark locally rather than on a cluster
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
# SparkContext handle taken from the session.
sc = spark.sparkContext

## Setting up the toy dataset

In [44]:
# Schema definition (built before any data is loaded).
# Column layout, in order:
#   label, I1..I13 -> nullable integers
#   C1..C26        -> nullable strings
schema = StructType(
    [StructField(int_name, IntegerType(), True)
     for int_name in ["label", *(f"I{i}" for i in range(1, 14))]]
    + [StructField(f"C{i}", StringType(), True) for i in range(1, 27)]
)


In [45]:
# Load toy data into dataframe
# Reads every file matching the glob under toyData/ (path is relative to the
# working directory) into a single Spark DataFrame.
# NOTE(review): the `schema` object defined earlier is not passed to this reader;
# the column names/types come from the parquet files themselves.
toy_df = spark.read.parquet("toyData/*.parquet")

In [46]:
# Scratch code (not meant for the official notebook): reduce every row to
# (label, numeric_features), where the features are positions 1..13 of the row.
toy_rdd = toy_df.rdd.map(tuple)
toy_RDD = toy_rdd.map(lambda row: (row[0], row[1:14])).cache()

In [47]:
# Pack the integer columns I1..I13 into one vector column called "features".
assembler = VectorAssembler(
    inputCols=[f"I{i}" for i in range(1, 14)],
    outputCol="features",
)

# Keep only the assembled vector and the label.
toy_df_transformed = assembler.transform(toy_df).select('features', 'label')

# Preview up to 30 rows, transposed so that each displayed column is one record.
pd.DataFrame(toy_df_transformed.take(30), columns=toy_df_transformed.columns).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
features,"[1.0, 17.0, 3.0, 6.0, 5.0, 6.0, 1.0, 5.0, 6.0,...","[0.0, 23.0, 47.0, 6.0, 13044.0, 2546.0, 0.0, 7...","[-999.0, 0.0, 1.0, 2.0, 20.0, -999.0, 0.0, 2.0...","[0.0, 3.0, 13.0, 1.0, 2940.0, 83.0, 1.0, 16.0,...","[1.0, 0.0, 5.0, 3.0, 171.0, 34.0, 3.0, 34.0, 6...","[-999.0, -1.0, -999.0, -999.0, 2975.0, 1.0, 14...","[-999.0, 2.0, 1.0, 3.0, 3031.0, 76.0, 1.0, 3.0...","[1.0, 4.0, 19.0, 17.0, 36.0, 7.0, 6.0, 22.0, 9...","[-999.0, -1.0, -999.0, -999.0, 8740.0, 12.0, 1...","[-999.0, 0.0, 7.0, 18.0, 3424.0, 22.0, 1.0, 21...",...,"[-999.0, 1.0, 3.0, 3.0, 109391.0, -999.0, 0.0,...","[0.0, 0.0, 2.0, 3.0, 1344.0, 5.0, 30.0, 7.0, 3...","[0.0, 68.0, 16.0, 3.0, 3735.0, 64.0, 1.0, 3.0,...","(-999.0, 0.0, 16.0, 8.0, 0.0, -999.0, 0.0, 7.0...","[-999.0, 1.0, 4.0, 4.0, 1837.0, 9.0, 2.0, 4.0,...","[0.0, 0.0, 4.0, -999.0, 1573.0, 42.0, 14.0, 8....","[5.0, 297.0, 5.0, 3.0, 8.0, 1.0, 34.0, 3.0, 14...","[-999.0, 1.0, 4.0, 2.0, 66163.0, -999.0, 0.0, ...","[-999.0, -1.0, 17.0, 6.0, 31979.0, 94.0, 1.0, ...","[-999.0, 31.0, 1.0, -999.0, 28527.0, -999.0, 0..."
label,0,1,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0


## Setting up a minimal hand-built toy dataset

In [2]:
# Three hand-written records: a float label followed by three integer features.
toy_df = spark.createDataFrame(
    data=[
        (1.0, 1, 1, 1),
        (1.0, 5, 2, 2),
        (0.0, 3, 0, 3),
    ],
    schema=["label", "page_num", "hour", "id"],
)

In [9]:
# Turn each Row into a plain tuple, then split it into (label, (page_num, hour, id)).
toy_rdd = toy_df.rdd.map(tuple)
toy_RDD = toy_rdd.map(lambda rec: (rec[0], rec[1:4])).cache()
# Show the full (tiny) RDD as the cell output.
toy_RDD.collect()

[(1.0, (1, 1, 1)), (1.0, (5, 2, 2)), (0.0, (3, 0, 3))]

toy_RDD will be the RDD used to train a logistic regression using the gradient descent method.

In [8]:
# Pack the three predictor columns into a single vector column called "features".
assembler = VectorAssembler(inputCols=["page_num", "hour", "id"], outputCol="features")

# Keep only the assembled vector and the label.
toy_df_transformed = assembler.transform(toy_df).select('features', 'label')

# Preview up to 30 rows, transposed so that each displayed column is one record.
pd.DataFrame(toy_df_transformed.take(30), columns=toy_df_transformed.columns).T

Unnamed: 0,0,1,2
features,"[1.0, 1.0, 1.0]","[5.0, 2.0, 2.0]","[3.0, 0.0, 3.0]"
label,1,1,0


toy_df_transformed will be the dataframe used to train a logistic regression using Spark ML's LogisticRegression() function

## Log Loss

In [10]:
def LogLoss(dataRDD, W):
    """
    Compute the mean log loss (binary cross-entropy) of a logistic model.

    Args:
        dataRDD - each record is a tuple of (y, features_array) with y in {0, 1}
        W       - (array) model coefficients, intercept at index 0
    Returns:
        loss    - mean over all records of -[y*log(p) + (1-y)*log(1-p)],
                  where p = sigmoid(W . [1, features])
    """
    # Prepend a constant 1.0 to every feature vector so that W[0] acts as the
    # intercept (the bias therefore has to sit at the front of W).
    augmentedData = dataRDD.map(lambda x: (x[0], np.append([1.0], x[1])))

    def LogLossPerRow(line):
        # Log loss of a single record.
        actual_y, features = line
        # np.transpose is a no-op for a 1-D W; kept so W is treated as before.
        z = np.dot(np.transpose(W), features)
        # With p = 1/(1 + exp(-z)):
        #   -[y*log(p) + (1-y)*log(1-p)]  ==  log(1 + exp(z)) - y*z
        # np.logaddexp(0, z) evaluates log(exp(0) + exp(z)) without overflow,
        # whereas taking log(p) / log(1-p) directly gives inf/nan as soon as
        # the sigmoid saturates to exactly 0.0 or 1.0.
        return np.logaddexp(0.0, z) - actual_y*z

    loss = augmentedData.map(LogLossPerRow).mean()

    return loss

Now let's test our LogLoss function by setting b to the mean of our label values and all of the predictor coefficients to 0.

In [49]:
# Mean of the label values (the first element of every (label, features) record)
meanQuality = toy_RDD.keys().mean()
print(f"Mean: {meanQuality}")

Mean: 0.24


In [12]:
# Set the baseline model so that b is the mean we calculated and all other
# coefficients are 0. The number of zero coefficients is taken from the first
# record of toy_RDD instead of being typed out by hand, so this cell stays
# correct when toy_RDD is rebuilt with a different number of features.
BASELINE = np.append([meanQuality], np.zeros(len(toy_RDD.first()[1])))

In [13]:
# Score the baseline model.
# The model needs one coefficient per feature plus one for the intercept.
expected_dim = len(toy_RDD.take(1)[0][1]) + 1
assert len(BASELINE) == expected_dim, "Double check model dimensions"
baseline_loss = LogLoss(toy_RDD, BASELINE)
print(f"Baseline model loss: {baseline_loss}")

Baseline model loss: 0.6365923090742943


In [21]:
# Verify if our result is correct or not

# Count how many 1's and 0's we have in our label
total = toy_RDD.count()
ones = toy_RDD.map(lambda line: 1 if line[0] == 1 else 0).sum()
zeros = total - ones
print("There are {} ones and {} zeros in the label column of our dataset".format(ones, zeros))

# Calculate the LogLoss in our example case.
# With all feature coefficients at 0, every record gets the same predicted
# probability sigmoid(b), where b = meanQuality.
baseline_prob = 1.0/(1.0 + np.exp(-1.0*meanQuality))
# Average over the actual number of records rather than a hard-coded 3, so the
# check remains valid for any dataset size.
loss = -(ones*np.log(baseline_prob) + zeros*np.log(1.0 - baseline_prob))/total
print("The loss of the baseline model through manual calculation is {}".format(loss))

There are 2 ones and 1 zeros in the label column of our dataset
The loss of the baseline model through manual calculation is 0.6365923090742943


We see that the result of our LogLoss() function is correct: it matches the manual calculation.

## Gradient Descent

In [23]:
# This code calculates the 3 different gradients with and without regularization
# Then it updates the model (w) and outputs the new model
def GDUpdate_wReg(dataRDD, W, learningRate = 0.1, regType = None, regParam = 0.1):
    """
    Perform one gradient descent step/update on the mean log loss of a
    logistic regression, optionally with ridge or lasso regularization.
    Args:
        dataRDD - tuple of (y, features_array), y in {0, 1}
        W       - (array) model coefficients with intercept at index 0
        learningRate - (float) defaults to 0.1
        regType - (str) 'ridge' or 'lasso', defaults to None
        regParam - (float) regularization term coefficient
    Returns:
        model   - (array) updated coefficients, intercept still at index 0
    Raises:
        ValueError - if regType is not None, 'ridge' or 'lasso'
    """
    # Fail loudly on a misspelled regType instead of silently training
    # without any regularization.
    if regType not in (None, 'ridge', 'lasso'):
        raise ValueError(f"regType must be None, 'ridge' or 'lasso', got {regType!r}")

    # Prepend a constant 1.0 to every feature vector so that W[0] is the intercept.
    augmentedData = dataRDD.map(lambda x: (x[0], np.append([1.0], x[1])))

    def GradientPerRow(line):
        # Gradient of one record's log loss w.r.t. W: (sigmoid(W . x) - y) * x
        actual_y, features = line
        predicted_y = np.dot(np.transpose(W), features)
        prob = 1.0/(1.0 + np.exp(-1.0*predicted_y))
        return (prob - actual_y)*features

    # The loss being minimized is the MEAN log loss, so its gradient is the
    # mean (not the sum) of the per-record gradients. Averaging keeps the
    # effective step size independent of the number of records.
    grad = augmentedData.map(GradientPerRow).mean()

    # The intercept (index 0) is never regularized: its penalty term is 0.
    if regType == 'lasso':
        grad = grad + np.append(0, regParam*np.sign(W[1:]))
    elif regType == 'ridge':
        grad = grad + np.append(0, regParam*W[1:])

    return W - (learningRate*grad)

In [31]:
# This code performs the Gradient Descent iterations 
def GradientDescent_wReg(trainRDD, wInit, nSteps = 20, learningRate = 0.1,
                         regType = None, regParam = 0.1, verbose = False):
    """
    Perform nSteps iterations of (optionally regularized) gradient descent
    and track the training loss and the model after every update.
    Args:
        trainRDD - RDD of (y, features_array) records used for training
        wInit    - (array) initial coefficients, intercept at index 0
        nSteps   - (int) number of updates to perform, defaults to 20
        learningRate - (float) defaults to 0.1
        regType  - (str) 'ridge' or 'lasso', defaults to None
        regParam - (float) regularization term coefficient
        verbose  - (bool) print the loss and model after each step
    Returns:
        train_history - (list) training log loss after each update
        model_history - (list) model coefficients after each update
    """
    # initialize lists to track model performance
    train_history, model_history = [], []

    # perform n updates & compute the training loss after each update
    model = wInit
    for idx in range(nSteps):
        # update the model
        model = GDUpdate_wReg(trainRDD, model, learningRate, regType, regParam)

        # LogLoss is a full pass over the RDD, so evaluate it once per step
        # and reuse the value for both the history and the console output.
        training_loss = LogLoss(trainRDD, model)
        train_history.append(training_loss)
        model_history.append(model)

        # console output if desired
        if verbose:
            print("----------")
            print(f"STEP: {idx+1}")
            print(f"training loss: {training_loss}")
            print(f"Model: {[round(w,3) for w in model]}")
    return train_history, model_history

In [None]:
# run 50 iterations
meanQuality = toy_RDD.map(lambda x: x[0]).mean()
# Initial model: intercept = mean label, one zero coefficient per feature.
# The coefficient count is read from the first record of toy_RDD so that wInit
# always matches the data; a hand-typed list of zeros breaks the dot product
# inside LogLoss/GDUpdate_wReg whenever it disagrees with the feature count.
wInit = np.append([meanQuality], np.zeros(len(toy_RDD.first()[1])))

start = time.time()
# regType=None means no regularization is applied, so regParam has no effect here.
results = GradientDescent_wReg(toy_RDD, wInit, nSteps = 50, learningRate = 1.0, 
                                     regType=None, regParam = 5.0, verbose = False )
# results = (train_history, model_history); index 0 of each model is the intercept.
print(f"\n... trained {len(results[1])} iterations in {time.time() - start} seconds")
print("The final log loss is: {}".format(results[0][-1]))
print("The coefficients are: {}".format(results[1][-1][1:]))
print("The intercept is: {}".format(results[1][-1][0]))

In [41]:
# Reference model from Spark ML: at most 50 iterations, no regularization,
# no feature standardization, intercept fitted.
lr = LogisticRegression(
    maxIter=50,
    regParam=0,
    standardization=False,
    fitIntercept=True,
)
lrModel = lr.fit(toy_df_transformed)

In [40]:
# Report the parameters fitted by Spark ML.
print(f"Coefficients: {lrModel.coefficients}")
print(f"Intercept: {lrModel.intercept}")

Coefficients: [-1.536273002737445,19.1462705537368,-8.398210811732106]
Intercept: 9.906884345546295
