In [26]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import LogisticRegression

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Obtain a SparkSession for this notebook, registered under the
# application name 'LogisticRegression'.
spark = (
    SparkSession.builder
    .appName('LogisticRegression')
    .getOrCreate()
)

In [4]:
# Bare expression: shows the session object as this cell's output.
spark

In [5]:
# Relative path of the input CSV that the next cell reads.
filename = 'data/bank-full.csv'

In [6]:
# Load the CSV: fields are separated by ';', the first row holds the column
# names, and column types are inferred from the values.
data = spark.read.csv(
    filename,
    sep=";",
    header=True,
    inferSchema=True,
)

In [7]:
# Inspect the schema of the loaded data (column names, inferred types, nullability):
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [37]:
def assemble_vector(df, features_list, target_varaible_name):
    """Assemble the feature columns into one vector and index the target column.

    A two-stage ``Pipeline`` is fitted on ``df``: a ``VectorAssembler`` that
    combines ``features_list`` into a column named ``'features'``, followed by
    a ``StringIndexer`` that writes the indexed target to
    ``<target>_index``. The transformed frame is then reduced to the indexed
    target, the assembled vector and the original feature columns.

    Parameters
    ----------
    df
        Input frame; must contain every column named in ``features_list`` as
        well as the target column.
    features_list : list of str
        Names of the columns to assemble into the ``'features'`` vector.
        The list is not modified.
    target_varaible_name : str
        Name of the target column. (The spelling of this parameter is kept
        as-is so existing keyword callers keep working.)

    Returns
    -------
    The transformed frame with columns
    ``[<target>_index, 'features', *features_list]``, in that order.
    """
    label_col = target_varaible_name + '_index'

    assembler = VectorAssembler(inputCols=features_list, outputCol='features')
    string_indexer = StringIndexer(inputCol=target_varaible_name, outputCol=label_col)
    pipeline = Pipeline(stages=[assembler, string_indexer])

    # Derive the output column from the *parameter*. The previous version read
    # a global named `target_variable_name` here, so the function only worked
    # when such a global happened to exist and matched the argument.
    selected_cols = [label_col, 'features'] + list(features_list)

    fitted_pipeline = pipeline.fit(df)
    return fitted_pipeline.transform(df).select(selected_cols)

In [38]:
# Target (dependent) column and the numeric predictor (independent) columns:
target_variable_name = 'y'
features_list = ['age', 'balance', 'day', 'duration',
                 'campaign', 'pdays', 'previous']

# Keep only the predictors followed by the target.
logistic_df = data.select(features_list + [target_variable_name])

In [39]:
# Applying the assemble_vector function to build the 'features' vector column
# and the indexed target column:

df = assemble_vector(logistic_df, features_list, target_variable_name)

In [40]:
# Check the assembled frame's schema: indexed label, feature vector, then the raw predictors.
df.printSchema()

root
 |-- y_index: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- balance: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)



In [43]:
# Build two logistic-regression estimators over the same feature/label
# columns, differing only in the `family` setting (binomial first, then
# multinomial):

binary_clf, multi_clf = (
    LogisticRegression(featuresCol='features', labelCol='y_index', family=family)
    for family in ('binomial', 'multinomial')
)

In [48]:
# Fit each estimator on the assembled frame (binary first, then multinomial):

binary_clf_model, multi_clf_model = (
    estimator.fit(df) for estimator in (binary_clf, multi_clf)
)

In [49]:
# Looking at the fitted binomial model: its coefficient vector and its
# single scalar intercept.

print(f"Binary Model Coefficients:\n {binary_clf_model.coefficients}")
print(f"Binary Model Intercepts:\n {binary_clf_model.intercept}")

Binary Model Coefficients:
 [0.007959289990802676,3.7181275564923895e-05,-0.0016500733151699207,0.0036371977014414765,-0.12804328355779698,0.002113571348833325,0.08593801084290789]
Binary Model Intercepts:
 -3.4699010652867206


In [50]:
# Looking at the fitted multinomial model: it exposes a coefficient matrix and
# an intercept vector rather than a single vector and scalar.
print(f"Multinomial Model Coefficients:\n {multi_clf_model.coefficientMatrix}")
print(f"Multinomial Model Intercepts:\n {multi_clf_model.interceptVector}")

Multinomial Model Coefficients:
 DenseMatrix([[-3.97962982e-03, -1.85907216e-05,  8.24926374e-04,
              -1.81859931e-03,  6.40216709e-02, -1.05678834e-03,
              -4.29688093e-02],
             [ 3.97962982e-03,  1.85907216e-05, -8.24926374e-04,
               1.81859931e-03, -6.40216709e-02,  1.05678834e-03,
               4.29688093e-02]])
Multinomial Model Intercepts:
 [1.7349520795818267,-1.7349520795818267]
