In [85]:
!pip install PySpark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [86]:
import pandas as pd
import numpy as np
from functools import wraps
import time

import statsmodels.api as sm
from sklearn.metrics import roc_auc_score

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [87]:
# Build the session used by the PySpark benchmark; getOrCreate() makes
# re-running this cell return the existing session.
session_builder = SparkSession.builder.appName("test_regression")
spark = session_builder.getOrCreate()

In [88]:
# Create dataset
# Draw everything from one seeded generator so the synthetic data (and the AUCs
# reported below) are reproducible across kernel restarts. The train/test split
# already used a fixed random_state; the draws themselves did not.
SEED = 42
rng = np.random.default_rng(SEED)
observations = 5_000_000

# Binary label, positive with probability 0.2
target = rng.binomial(n=1, p=0.2, size=(observations, 1))
# y: label plus N(0, 0.1) noise
y = target + rng.normal(0, 0.1, size=(observations, 1))
# z: label plus N(20, 10) noise
z = target + rng.normal(20, 10.0, size=(observations, 1))
# w: N(3, 1) noise drawn independently of the label
w = rng.normal(3, 1.0, size=(observations, 1))
df = pd.DataFrame(np.hstack([target, y, z, w]),
                  columns=['target', 'y', 'z', 'w'])

# Create train test datasets (80/20, disjoint by index)
train = df.sample(frac=0.8, random_state=1)
test = df.drop(train.index)
assert len(train) + len(test) == len(df), "train/test split lost or duplicated rows"

# Convert to pyspark dataset
spark_train = spark.createDataFrame(train)
spark_test = spark.createDataFrame(test)

In [89]:
# Create functions for timing, pyspark regression and statsmodels regression
def timeit(func):
    """Decorator that prints the run time of every call to ``func``.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same name/docstring as ``func`` (via ``functools.wraps``)
        that forwards all arguments, prints ``"<name> took <seconds> seconds to run."``
        after the call returns, and passes the call's return value through.
        If ``func`` raises, the exception propagates and nothing is printed.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval cannot be skewed by system clock adjustments the way
        # time.time() can.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start_time
        print(f"{func.__name__} took {elapsed_time:.2f} seconds to run.")
        return result
    return wrapper

@timeit
def logit_pyspark(var, train_data, test_data):
  """Fit a one-feature Spark logistic regression and print its test-set score.

  Args:
    var: Name of the column used as the only independent variable.
    train_data: Spark DataFrame with a 'target' column and the `var` column,
      used for fitting.
    test_data: Spark DataFrame with the same columns, used for evaluation.

  Returns:
    None. The score from BinaryClassificationEvaluator is printed, not returned.
  """
  label_col = 'target'
  feature_col = 'features'

  # Spark ML estimators read features from a single vector column,
  # so wrap the lone predictor into one.
  to_vector = VectorAssembler(inputCols=[var], outputCol=feature_col)
  train_vec = to_vector.transform(train_data)
  test_vec = to_vector.transform(test_data)

  # Fit on the training split
  fitted = LogisticRegression(featuresCol=feature_col, labelCol=label_col).fit(train_vec)

  # Score the held-out split
  scored = fitted.transform(test_vec)
  auc = BinaryClassificationEvaluator(labelCol=label_col).evaluate(scored)
  print(f"Model with {var} as the independent variable has AUC of {auc:.2f}")

@timeit
def logit_statmodels(train_df, test_df, var):
  """Fit a one-feature statsmodels logit and print its test-set AUC.

  Args:
    train_df: pandas DataFrame with a 'target' column and the `var` column,
      used for fitting.
    test_df: pandas DataFrame with the same columns, used for evaluation.
      It is not modified.
    var: Name of the column used as the only independent variable.

  Returns:
    None. The AUC is printed, not returned.
  """
  # Create model
  # NOTE(review): sm.Logit does not add an intercept on its own, so this is a
  # no-intercept fit on `var` alone. Confirm this is the comparison intended
  # against the PySpark model before adding sm.add_constant here.
  logit_model = sm.Logit(train_df[['target']], train_df[[var]])
  result = logit_model.fit(disp=0)

  # Create AUC
  # Keep predictions in a local instead of writing a 'y_pred' column: the
  # original wrote to the global `test` (not `test_df`), which only worked when
  # the caller happened to pass that same object and mutated it as a side effect.
  y_pred = result.predict(test_df[[var]])
  auc = roc_auc_score(test_df['target'], y_pred)
  print(f"Model with {var} as the independent variable has AUC of {auc:.2f}")

In [90]:
# Test statsmodels
# Named `predictors` rather than `vars` so the builtin vars() is not shadowed.
predictors = ['y', 'z', 'w']

# Run regression: one single-feature model per predictor
for predictor in predictors:
  logit_statmodels(train, test, predictor)

Model with y as the independent variable has AUC of 1.00
logit_statmodels took 4.77 seconds to run.
Model with z as the independent variable has AUC of 0.47
logit_statmodels took 7.19 seconds to run.
Model with w as the independent variable has AUC of 0.50
logit_statmodels took 6.18 seconds to run.


In [91]:
# Test PySpark
# Named `predictors` rather than `vars` so the builtin vars() is not shadowed.
predictors = ['y', 'z', 'w']

# Run regression: one single-feature model per predictor
for predictor in predictors:
  logit_pyspark(predictor, spark_train, spark_test)

Model with y as the independent variable has AUC of 1.00
logit_pyspark took 107.24 seconds to run.
Model with z as the independent variable has AUC of 0.53
logit_pyspark took 71.44 seconds to run.
Model with w as the independent variable has AUC of 0.50
logit_pyspark took 73.12 seconds to run.
