<a href="https://colab.research.google.com/github/ankesh86/PySparkNotebooks/blob/main/SupervisedLearningConcepts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark==3.4.0

Collecting pyspark==3.4.0
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317122 sha256=635b07a532c9db38ae5c39a4045a7ca3a7d9e79b5d50928efbf61cd7d5120aab
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [2]:
#reading data
filename = "sample_data/bank-full.csv"

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
data = spark.read.csv(filename, header=True, inferSchema=True, sep=";")
data.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

# lendth of data

In [4]:
data.count()

45211

# Describe data

In [5]:
data.describe().toPandas()

Unnamed: 0,summary,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,count,45211.0,45211,45211,45211,45211,45211.0,45211,45211,45211,45211.0,45211,45211.0,45211.0,45211.0,45211.0,45211,45211
1,mean,40.93621021432837,,,,,1362.2720576850766,,,,15.80641879188693,,258.1630797814691,2.763840658246887,40.19782796222158,0.5803233726305546,,
2,stddev,10.618762040975408,,,,,3044.7658291685243,,,,8.322476153044596,,257.5278122651709,3.098020883279184,100.12874599059812,2.3034410449312204,,
3,min,18.0,admin.,divorced,primary,no,-8019.0,no,no,cellular,1.0,apr,0.0,1.0,-1.0,0.0,failure,no
4,max,95.0,unknown,single,unknown,yes,102127.0,yes,yes,unknown,31.0,sep,4918.0,63.0,871.0,275.0,unknown,yes


# Check Data types of each column

In [6]:
data.groupBy("marital").count().show()

+--------+-----+
| marital|count|
+--------+-----+
|divorced| 5207|
| married|27214|
|  single|12790|
+--------+-----+



In [7]:
data.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string')]

In [8]:
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



# **Cardianility check**

In [9]:
from pyspark.sql.functions import approx_count_distinct, countDistinct

"""
Note: approxCountDistinct and countDistinct can be used interchangeably. Only difference is the computation time.

"approxCountDistinct" is useful for large datasets
"countDistinct" for small and medium datasets.

"""

def cardinality_calculation(df, cut_off=1):
    cardinality = df.select(*[approx_count_distinct(c).alias(c) for c in df.columns])

    ## convert to pandas for efficient calculations
    final_cardinality_df = cardinality.toPandas().transpose()
    final_cardinality_df.reset_index(inplace=True)
    final_cardinality_df.rename(columns={0:'Cardinality'}, inplace=True)

    #select variables with cardinality of 1
    vars_selected = final_cardinality_df['index'][final_cardinality_df['Cardinality'] <= cut_off]

    return final_cardinality_df, vars_selected

cardinality_df, cardinality_vars_selected = cardinality_calculation(data)

In [10]:
cardinality_df

Unnamed: 0,index,Cardinality
0,age,76
1,job,11
2,marital,3
3,education,4
4,default,2
5,balance,7375
6,housing,2
7,loan,2
8,contact,3
9,day,32


# **Missing value check**

In [11]:
#missing values check
from pyspark.sql.functions import count, when, isnan, col

# miss_percentage is set to 80% as discussed in the book
def missing_calculation(df, miss_percentage=0.80):

    #checks for both NaN and null values
    missing = df.select(*[count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns])
    length_df = df.count()
    ## convert to pandas for efficient calculations
    final_missing_df = missing.toPandas().transpose()
    final_missing_df.reset_index(inplace=True)
    final_missing_df.rename(columns={0:'missing_count'}, inplace=True)
    final_missing_df['missing_percentage'] = final_missing_df['missing_count']/length_df

    #select variables with cardinality of 1
    vars_selected = final_missing_df['index'][final_missing_df['missing_percentage'] >= miss_percentage]

    return final_missing_df, vars_selected

In [12]:
missing_calculation(data)

(        index  missing_count  missing_percentage
 0         age              0                 0.0
 1         job              0                 0.0
 2     marital              0                 0.0
 3   education              0                 0.0
 4     default              0                 0.0
 5     balance              0                 0.0
 6     housing              0                 0.0
 7        loan              0                 0.0
 8     contact              0                 0.0
 9         day              0                 0.0
 10      month              0                 0.0
 11   duration              0                 0.0
 12   campaign              0                 0.0
 13      pdays              0                 0.0
 14   previous              0                 0.0
 15   poutcome              0                 0.0
 16          y              0                 0.0,
 Series([], Name: index, dtype: object))

# **Identify variable types**

In [13]:
def variable_type(df):

    vars_list = df.dtypes
    char_vars = []
    num_vars = []
    for i in vars_list:
        if i[1] in ('string'):
            char_vars.append(i[0])
        else:
            num_vars.append(i[0])

    return char_vars, num_vars

In [14]:
char_vars, num_vars = variable_type(data)

In [15]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

def category_to_index(df, char_vars):

    char_df = df.select(char_vars)
    indexers = [StringIndexer(inputCol=c, outputCol=c+"_index", handleInvalid="keep") for c in char_df.columns]
    pipeline = Pipeline(stages=indexers)
    char_labels = pipeline.fit(char_df)
    df = char_labels.transform(df)
    return df, char_labels

data, char_labels = category_to_index(data, char_vars)

In [16]:
data.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string'),
 ('job_index', 'double'),
 ('marital_index', 'double'),
 ('education_index', 'double'),
 ('default_index', 'double'),
 ('housing_index', 'double'),
 ('loan_index', 'double'),
 ('contact_index', 'double'),
 ('month_index', 'double'),
 ('poutcome_index', 'double'),
 ('y_index', 'double')]

In [17]:
data = data.select([c for c in data.columns if c not in char_vars])

In [18]:
def rename_columns(df, char_vars):
    mapping = dict(zip([i + '_index' for i in char_vars], char_vars))
    df = df.select([col(c).alias(mapping.get(c, c)) for c in df.columns])
    return df

In [19]:
data = rename_columns(data, char_vars)

In [20]:
data.dtypes

[('age', 'int'),
 ('balance', 'int'),
 ('day', 'int'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('job', 'double'),
 ('marital', 'double'),
 ('education', 'double'),
 ('default', 'double'),
 ('housing', 'double'),
 ('loan', 'double'),
 ('contact', 'double'),
 ('month', 'double'),
 ('poutcome', 'double'),
 ('y', 'double')]

In [21]:
data.groupBy('y').count().show()

+---+-----+
|  y|count|
+---+-----+
|0.0|39922|
|1.0| 5289|
+---+-----+



In [22]:
linear_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'])

In [23]:
linear_df

DataFrame[age: int, balance: int, day: int, duration: int, campaign: int, pdays: int, previous: int]

In [24]:
target_variable_name = 'balance'

# **Assemble input vectors**

In [25]:
#assemble feature vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

def assemble_vectors(df, features_list, target_variable_name):
  stages = []
  #assemble vectors
  assembler = VectorAssembler(inputCols=features_list, outputCol='features')
  stages = [assembler]

  #select all the columns + target + newly created "features" column
  selectedCols = [target_variable_name, 'features'] + features_list

  #using pipeline
  pipeline = Pipeline(stages=stages)

  #assembler model
  assembleModel = pipeline.fit(df)

  #assemble model on data
  df = assembleModel.transform(df).select(selectedCols)

  return df


In [26]:
#excluding target variable
features_list = linear_df.columns
#features_list = char_vars #this option is used only for ChiSqselector
features_list.remove(target_variable_name)

In [27]:
features_list

['age', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [28]:
# applying function on dataframe
df = assemble_vectors(linear_df, features_list, target_variable_name)


In [29]:
df.show()

+-------+--------------------+---+---+--------+--------+-----+--------+
|balance|            features|age|day|duration|campaign|pdays|previous|
+-------+--------------------+---+---+--------+--------+-----+--------+
|   2143|[58.0,5.0,261.0,1...| 58|  5|     261|       1|   -1|       0|
|     29|[44.0,5.0,151.0,1...| 44|  5|     151|       1|   -1|       0|
|      2|[33.0,5.0,76.0,1....| 33|  5|      76|       1|   -1|       0|
|   1506|[47.0,5.0,92.0,1....| 47|  5|      92|       1|   -1|       0|
|      1|[33.0,5.0,198.0,1...| 33|  5|     198|       1|   -1|       0|
|    231|[35.0,5.0,139.0,1...| 35|  5|     139|       1|   -1|       0|
|    447|[28.0,5.0,217.0,1...| 28|  5|     217|       1|   -1|       0|
|      2|[42.0,5.0,380.0,1...| 42|  5|     380|       1|   -1|       0|
|    121|[58.0,5.0,50.0,1....| 58|  5|      50|       1|   -1|       0|
|    593|[43.0,5.0,55.0,1....| 43|  5|      55|       1|   -1|       0|
|    270|[41.0,5.0,222.0,1...| 41|  5|     222|       1|   -1|  

# **Linear Regression**

In [30]:
# fit the regression model
from pyspark.ml.regression import LinearRegression
reg = LinearRegression(featuresCol='features', labelCol='balance')
reg_model = reg.fit(df) #fit model

In [31]:
# viewing coefficient and intercepts for each variable
import pandas as pd
for k,v in df.schema["features"].metadata["ml_attr"]["attrs"].items():
  features_df= pd.DataFrame(v)

#print coefficent and intercept
print(reg_model.coefficients, reg_model.intercept)
features_df['coefficients'] = reg_model.coefficients

[28.083972908930026,3.30554636194966,0.2488284197090184,-14.142676297161422,-0.08248810233031972,23.46299280076253] 124.9213009281818


In [32]:
features_df

Unnamed: 0,idx,name,coefficients
0,0,age,28.083973
1,1,day,3.305546
2,2,duration,0.248828
3,3,campaign,-14.142676
4,4,pdays,-0.082488
5,5,previous,23.462993


In [33]:
# prediction result
pred_result = reg_model.transform(df)

In [34]:
pred_result

DataFrame[balance: int, features: vector, age: int, day: int, duration: int, campaign: int, pdays: int, previous: int, prediction: double]

In [35]:
reg_model.summary.r2

0.010568116511551984

In [36]:
features_list

['age', 'day', 'duration', 'campaign', 'pdays', 'previous']

# **Variance Inflation factor**
To avoid multi-collinearity
Generally it should be less than 10 9acceptable), when near to 1, it not coorelated

In [37]:
def vif_calculator(df, features_list):
  vif_list = []
  for i in features_list:
    temp_features_list = features_list.copy()
    temp_features_list.remove(i)
    temp_target = i
    assembler = VectorAssembler(inputCols=temp_features_list, outputCol='features')
    temp_df = assembler.transform(df)

    reg = LinearRegression(featuresCol = 'features', labelCol = i)
    reg_model = reg.fit(temp_df) #fit model
    temp_vif = 1/(1-reg_model.summary.r2)
    vif_list.append(temp_vif)

  return vif_list

features_df['vif'] = vif_calculator(linear_df, features_list)
print(features_df)

   idx      name  coefficients       vif
0    0       age     28.083973  1.000917
1    1       day      3.305546  1.034350
2    2  duration      0.248828  1.007627
3    3  campaign    -14.142676  1.039907
4    4     pdays     -0.082488  1.276182
5    5  previous     23.462993  1.261321


# **Logistic Regression**

In [38]:
target_variable_name = "y"
logistic_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'y'])
#exclude target variable and select all other feature vectors
features_list = logistic_df.columns
#features_list = char_vars #this option is used only for ChiSqselector
features_list.remove(target_variable_name)
# apply the function on our dataframe
df = assemble_vectors(logistic_df, features_list, target_variable_name)

In [39]:
# Assuming df is your Spark DataFrame
unique_classes = df.select('y').distinct().collect()
print("Unique classes in 'y':", unique_classes)


Unique classes in 'y': [Row(y=0.0), Row(y=1.0)]


In [40]:
from pyspark.ml.classification import LogisticRegression

# Check the number of unique classes
num_classes = df.select('y').distinct().count()

# Initialize logistic regression according to the number of classes
if num_classes == 2:
    clf = LogisticRegression(featuresCol='features', labelCol='y', family='binomial')
elif num_classes > 2:
    clf = LogisticRegression(featuresCol='features', labelCol='y', family='multinomial')
else:
    raise ValueError("The target variable y must have at least two classes.")

# Fit the model
model = clf.fit(df)
print("Model fitted successfully with", ("binomial" if num_classes == 2 else "multinomial"), "family.")


IllegalArgumentException: requirement failed: Binomial family only supports 1 or 2 outcome classes but found 3.

In [41]:
from pyspark.ml.classification import LogisticRegression

# Initialize logistic regression
binary_clf = LogisticRegression(featuresCol='features', labelCol='y', family='binomial')
multinomial_clf = LogisticRegression(featuresCol='features', labelCol='y', family='multinomial')

# Fit the appropriate model based on the number of classes
if num_classes == 2:
    model = binary_clf.fit(df)
    print("Fitted binary logistic regression model")
elif num_classes > 2:
    model = multinomial_clf.fit(df)
    print("Fitted multinomial logistic regression model")
else:
    raise ValueError("The target variable y must have at least two classes for logistic regression.")


IllegalArgumentException: requirement failed: Binomial family only supports 1 or 2 outcome classes but found 3.

In [42]:
from pyspark.ml.classification import LogisticRegression
binary_clf = LogisticRegression(featuresCol='features', labelCol='y', family='binomial')
multinomial_clf = LogisticRegression(featuresCol='features', labelCol='y', family='multinomial')
binary_clf_model = binary_clf.fit(df) # fit binary model
multinomial_clf_model = multinomial_clf.fit(df) # fit multinomial model

IllegalArgumentException: requirement failed: Binomial family only supports 1 or 2 outcome classes but found 3.

In [None]:
# Count distinct values in the target variable
num_classes = logistic_df.select('y').distinct().count()
print(f"Number of classes: {num_classes}")


In [None]:
df.show()

# **Decision Tree**

## **Decision tree Classifier**

In [43]:
target_variable_name = "y"
logistic_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'y'])
#exclude target variable and select all other feature vectors
features_list = logistic_df.columns
#features_list = char_vars #this option is used only for ChiSqselector
features_list.remove(target_variable_name)
# apply the function on our dataframe
binary_df = assemble_vectors(logistic_df, features_list, target_variable_name)

In [44]:
binary_df.show()

+---+--------------------+---+-------+---+--------+--------+-----+--------+
|  y|            features|age|balance|day|duration|campaign|pdays|previous|
+---+--------------------+---+-------+---+--------+--------+-----+--------+
|0.0|[58.0,2143.0,5.0,...| 58|   2143|  5|     261|       1|   -1|       0|
|0.0|[44.0,29.0,5.0,15...| 44|     29|  5|     151|       1|   -1|       0|
|0.0|[33.0,2.0,5.0,76....| 33|      2|  5|      76|       1|   -1|       0|
|0.0|[47.0,1506.0,5.0,...| 47|   1506|  5|      92|       1|   -1|       0|
|0.0|[33.0,1.0,5.0,198...| 33|      1|  5|     198|       1|   -1|       0|
|0.0|[35.0,231.0,5.0,1...| 35|    231|  5|     139|       1|   -1|       0|
|0.0|[28.0,447.0,5.0,2...| 28|    447|  5|     217|       1|   -1|       0|
|0.0|[42.0,2.0,5.0,380...| 42|      2|  5|     380|       1|   -1|       0|
|0.0|[58.0,121.0,5.0,5...| 58|    121|  5|      50|       1|   -1|       0|
|0.0|[43.0,593.0,5.0,5...| 43|    593|  5|      55|       1|   -1|       0|
|0.0|[41.0,2

In [45]:
target_variable_name = "balance"
linear_df = data.select(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'])
#exclude target variable and select all other feature vectors
features_list = linear_df.columns
#features_list = char_vars #this option is used only for ChiSqselector
features_list.remove(target_variable_name)
# apply the function on our dataframe
continuous_df = assemble_vectors(linear_df, features_list, target_variable_name)

In [47]:
continuous_df.show()

+-------+--------------------+---+---+--------+--------+-----+--------+
|balance|            features|age|day|duration|campaign|pdays|previous|
+-------+--------------------+---+---+--------+--------+-----+--------+
|   2143|[58.0,5.0,261.0,1...| 58|  5|     261|       1|   -1|       0|
|     29|[44.0,5.0,151.0,1...| 44|  5|     151|       1|   -1|       0|
|      2|[33.0,5.0,76.0,1....| 33|  5|      76|       1|   -1|       0|
|   1506|[47.0,5.0,92.0,1....| 47|  5|      92|       1|   -1|       0|
|      1|[33.0,5.0,198.0,1...| 33|  5|     198|       1|   -1|       0|
|    231|[35.0,5.0,139.0,1...| 35|  5|     139|       1|   -1|       0|
|    447|[28.0,5.0,217.0,1...| 28|  5|     217|       1|   -1|       0|
|      2|[42.0,5.0,380.0,1...| 42|  5|     380|       1|   -1|       0|
|    121|[58.0,5.0,50.0,1....| 58|  5|      50|       1|   -1|       0|
|    593|[43.0,5.0,55.0,1....| 43|  5|      55|       1|   -1|       0|
|    270|[41.0,5.0,222.0,1...| 41|  5|     222|       1|   -1|  

In [51]:
from pyspark.ml.classification import DecisionTreeClassifier

clf = DecisionTreeClassifier(featuresCol='features', labelCol='y', impurity='gini')
clf_model = clf.fit(binary_df)

clf2 = DecisionTreeClassifier(featuresCol='features', labelCol='y', impurity='entropy')
clf_model2 = clf2.fit(binary_df)


In [52]:
print(clf_model.featureImportances)

(7,[0,3,5],[0.06053525173984628,0.7247697525356532,0.21469499572450051])


In [53]:
print(clf_model2.featureImportances)

(7,[0,2,3,4,5],[0.018481303507164377,0.0011291767150968187,0.7281622975667823,0.00046685977426426655,0.25176036243669225])


## **Decision tree Regression**

In [54]:
from pyspark.ml.regression import DecisionTreeRegressor

reg = DecisionTreeRegressor(featuresCol = 'features', labelCol='balance', impurity='variance')
reg_model = reg.fit(continuous_df)
print(reg_model.featureImportances)#feature importances


(6,[0,1,2,3,4],[0.5109054100742587,0.3184339087685365,0.057805991093117154,0.03302031674831211,0.07983437331577559])


In [55]:
reg_model.transform(continuous_df) #future predictions

DataFrame[balance: int, features: vector, age: int, day: int, duration: int, campaign: int, pdays: int, previous: int, prediction: double]

In [56]:
clf_model.toDebugString

'DecisionTreeClassificationModel: uid=DecisionTreeClassifier_bdbd82b35615, depth=5, numNodes=31, numClasses=3, numFeatures=7\n  If (feature 3 <= 474.5)\n   If (feature 5 <= 14.5)\n    If (feature 0 <= 60.5)\n     Predict: 0.0\n    Else (feature 0 > 60.5)\n     If (feature 3 <= 127.5)\n      Predict: 0.0\n     Else (feature 3 > 127.5)\n      If (feature 3 <= 205.5)\n       Predict: 0.0\n      Else (feature 3 > 205.5)\n       Predict: 1.0\n   Else (feature 5 > 14.5)\n    If (feature 3 <= 165.5)\n     Predict: 0.0\n    Else (feature 3 > 165.5)\n     If (feature 5 <= 188.5)\n      If (feature 5 <= 94.5)\n       Predict: 1.0\n      Else (feature 5 > 94.5)\n       Predict: 0.0\n     Else (feature 5 > 188.5)\n      Predict: 0.0\n  Else (feature 3 > 474.5)\n   If (feature 3 <= 679.5)\n    If (feature 5 <= 8.5)\n     If (feature 0 <= 60.5)\n      Predict: 0.0\n     Else (feature 0 > 60.5)\n      Predict: 1.0\n    Else (feature 5 > 8.5)\n     If (feature 5 <= 94.5)\n      Predict: 1.0\n     Else

In [57]:
reg_model.toDebugString

'DecisionTreeRegressionModel: uid=DecisionTreeRegressor_2c33b9d0964c, depth=5, numNodes=63, numFeatures=6\n  If (feature 0 <= 54.5)\n   If (feature 0 <= 37.5)\n    If (feature 1 <= 17.5)\n     If (feature 1 <= 3.5)\n      If (feature 4 <= 370.5)\n       Predict: 1289.7027450980393\n      Else (feature 4 > 370.5)\n       Predict: 3244.75\n     Else (feature 1 > 3.5)\n      If (feature 4 <= 26.5)\n       Predict: 937.5397395002658\n      Else (feature 4 > 26.5)\n       Predict: 1198.0977011494253\n    Else (feature 1 > 17.5)\n     If (feature 1 <= 21.5)\n      If (feature 0 <= 29.5)\n       Predict: 917.4211287988422\n      Else (feature 0 > 29.5)\n       Predict: 1723.0375874125873\n     Else (feature 1 > 21.5)\n      If (feature 4 <= 3.5)\n       Predict: 952.7784339457568\n      Else (feature 4 > 3.5)\n       Predict: 1535.1295418641391\n   Else (feature 0 > 37.5)\n    If (feature 1 <= 21.5)\n     If (feature 1 <= 17.5)\n      If (feature 0 <= 50.5)\n       Predict: 1269.7091539838855

In [58]:
def parse(lines):
    block = []
    while lines :

        if lines[0].startswith('If'):
            bl = ' '.join(lines.pop(0).split()[1:]).replace('(', '').replace(')', '')
            block.append({'id':bl, 'children':parse(lines)})


            if lines[0].startswith('Else'):
                be = ' '.join(lines.pop(0).split()[1:]).replace('(', '').replace(')', '')
                block.append({'id':be, 'children':parse(lines)})
        elif not lines[0].startswith(('If','Else')):
            block2 = lines.pop(0)
            block.append({'id':block2})
        else:
            break
    return block

def tree_json(tree):
    data = []
    for line in tree.splitlines() :
        if line.strip():
            line = line.strip()
            data.append(line)
        else : break
        if not line : break
    res = []
    res.append({'id':'Root', 'children':parse(data[1:])})
    return res[0]

In [59]:
result = tree_json(clf_model.toDebugString)

In [60]:
result

{'id': 'Root',
 'children': [{'id': 'feature 3 <= 474.5',
   'children': [{'id': 'feature 5 <= 14.5',
     'children': [{'id': 'feature 0 <= 60.5',
       'children': [{'id': 'Predict: 0.0'}]},
      {'id': 'feature 0 > 60.5',
       'children': [{'id': 'feature 3 <= 127.5',
         'children': [{'id': 'Predict: 0.0'}]},
        {'id': 'feature 3 > 127.5',
         'children': [{'id': 'feature 3 <= 205.5',
           'children': [{'id': 'Predict: 0.0'}]},
          {'id': 'feature 3 > 205.5',
           'children': [{'id': 'Predict: 1.0'}]}]}]}]},
    {'id': 'feature 5 > 14.5',
     'children': [{'id': 'feature 3 <= 165.5',
       'children': [{'id': 'Predict: 0.0'}]},
      {'id': 'feature 3 > 165.5',
       'children': [{'id': 'feature 5 <= 188.5',
         'children': [{'id': 'feature 5 <= 94.5',
           'children': [{'id': 'Predict: 1.0'}]},
          {'id': 'feature 5 > 94.5', 'children': [{'id': 'Predict: 0.0'}]}]},
        {'id': 'feature 5 > 188.5',
         'children': [{'

In [61]:
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StructField, StructType

cSchema = StructType([StructField("age", IntegerType())\
                      ,StructField("gender", IntegerType())\
                      ,StructField("y", IntegerType())])

test_list = [[30, 0, 1],
             [25, 1, 0],
             [45, 0, 0],
             [57, 1, 1],
             [27, 0, 1],
             [54, 1, 1],
             [35, 1, 1]]


test_df = spark.createDataFrame(test_list, schema=cSchema)
test_df.show()

+---+------+---+
|age|gender|  y|
+---+------+---+
| 30|     0|  1|
| 25|     1|  0|
| 45|     0|  0|
| 57|     1|  1|
| 27|     0|  1|
| 54|     1|  1|
| 35|     1|  1|
+---+------+---+



In [62]:
test_df = assemble_vectors(test_df, ['age','gender'], 'y')
test_clf = DecisionTreeClassifier(featuresCol='features', labelCol='y')
test_clf_model = test_clf.fit(test_df)

In [63]:
clf_model.featureImportances

SparseVector(7, {0: 0.0605, 3: 0.7248, 5: 0.2147})

In [64]:
test_clf_model.toDebugString

'DecisionTreeClassificationModel: uid=DecisionTreeClassifier_13caf4df29ee, depth=3, numNodes=7, numClasses=2, numFeatures=2\n  If (feature 0 <= 26.0)\n   Predict: 0.0\n  Else (feature 0 > 26.0)\n   If (feature 0 <= 40.0)\n    Predict: 1.0\n   Else (feature 0 > 40.0)\n    If (feature 0 <= 49.5)\n     Predict: 0.0\n    Else (feature 0 > 49.5)\n     Predict: 1.0\n'

# **Random Forest**

## **Classification**

In [65]:
from pyspark.ml.classification import RandomForestClassifier

clf = RandomForestClassifier(featuresCol='features', labelCol='y')
clf_model = clf.fit(binary_df)
print(clf_model.featureImportances)
print(clf_model.toDebugString)

(7,[0,1,2,3,4,5,6],[0.060776549248000945,0.016462442488656255,0.023802445641580818,0.667601208397336,0.00470703703877026,0.13205796240206363,0.09459235478359203])
RandomForestClassificationModel: uid=RandomForestClassifier_e42511bb5b98, numTrees=20, numClasses=3, numFeatures=7
  Tree 0 (weight 1.0):
    If (feature 6 <= 0.5)
     If (feature 3 <= 537.5)
      Predict: 0.0
     Else (feature 3 > 537.5)
      If (feature 1 <= 392.5)
       If (feature 3 <= 852.0)
        Predict: 0.0
       Else (feature 3 > 852.0)
        Predict: 1.0
      Else (feature 1 > 392.5)
       If (feature 4 <= 2.5)
        If (feature 2 <= 1.5)
         Predict: 1.0
        Else (feature 2 > 1.5)
         Predict: 0.0
       Else (feature 4 > 2.5)
        If (feature 0 <= 46.5)
         Predict: 1.0
        Else (feature 0 > 46.5)
         Predict: 0.0
    Else (feature 6 > 0.5)
     If (feature 3 <= 251.5)
      Predict: 0.0
     Else (feature 3 > 251.5)
      If (feature 2 <= 21.5)
       If (feature 3 <= 

## **Regression**

In [66]:
from pyspark.ml.regression import RandomForestRegressor

reg = RandomForestRegressor(featuresCol='features', labelCol='balance')
reg_model = reg.fit(continuous_df)
print(reg_model.featureImportances)
print(reg_model.toDebugString)

(6,[0,1,2,3,4,5],[0.4519800379245373,0.203479669242867,0.10188064559976347,0.07360458333721434,0.12792263819058974,0.04113242570502813])
RandomForestRegressionModel: uid=RandomForestRegressor_5b8fbef66dd6, numTrees=20, numFeatures=6
  Tree 0 (weight 1.0):
    If (feature 4 <= 273.5)
     If (feature 4 <= 30.0)
      If (feature 1 <= 21.5)
       If (feature 1 <= 18.5)
        If (feature 5 <= 0.5)
         Predict: 1273.472
        Else (feature 5 > 0.5)
         Predict: 579.5954198473282
       Else (feature 1 > 18.5)
        If (feature 3 <= 1.5)
         Predict: 2316.7947395564725
        Else (feature 3 > 1.5)
         Predict: 1736.5564311849141
      Else (feature 1 > 21.5)
       If (feature 2 <= 293.5)
        If (feature 1 <= 28.5)
         Predict: 985.1712054042644
        Else (feature 1 > 28.5)
         Predict: 1179.6801279488204
       Else (feature 2 > 293.5)
        If (feature 2 <= 871.5)
         Predict: 1278.073726541555
        Else (feature 2 > 871.5)
         

## **Gradient Boosting**

## **Classification**

In [67]:
from pyspark.ml.classification import GBTClassifier

clf = GBTClassifier(featuresCol='features', labelCol='y')
clf_model = clf.fit(binary_df)

print(clf_model.featureImportances)
print(clf_model.toDebugString)

(7,[0,1,2,3,4,5,6],[0.12254142931904063,0.09357576341144344,0.1391743695807476,0.46087842668732193,0.04422440619682294,0.121017928119802,0.018587676684821405])
GBTClassificationModel: uid = GBTClassifier_e5979df48d2d, numTrees=20, numClasses=2, numFeatures=7
  Tree 0 (weight 1.0):
    If (feature 3 <= 486.5)
     If (feature 5 <= 9.5)
      If (feature 0 <= 60.5)
       If (feature 3 <= 215.5)
        If (feature 0 <= 28.5)
         Predict: -0.8618307426597582
        Else (feature 0 > 28.5)
         Predict: -0.9639685102947113
       Else (feature 3 > 215.5)
        If (feature 0 <= 26.5)
         Predict: -0.5899159663865546
        Else (feature 0 > 26.5)
         Predict: -0.843870030104848
      Else (feature 0 > 60.5)
       If (feature 3 <= 125.5)
        If (feature 2 <= 9.5)
         Predict: -1.0
        Else (feature 2 > 9.5)
         Predict: -0.8070175438596491
       Else (feature 3 > 125.5)
        If (feature 3 <= 286.5)
         Predict: -0.2425249169435216
        E

## **Regressor**

In [68]:
from pyspark.ml.regression import GBTRegressor

reg = GBTRegressor(featuresCol='features', labelCol='balance')
reg_model = reg.fit(continuous_df)
print(reg_model.featureImportances)
print(reg_model.toDebugString)

(6,[0,1,2,3,4,5],[0.17283152520595657,0.19782142927146193,0.35600351989284607,0.1209886952073749,0.09255326231443217,0.05980156810792826])
GBTRegressionModel: uid=GBTRegressor_93a06c8a5d92, numTrees=20, numFeatures=6
  Tree 0 (weight 1.0):
    If (feature 0 <= 54.5)
     If (feature 0 <= 37.5)
      If (feature 1 <= 17.5)
       If (feature 1 <= 3.5)
        If (feature 4 <= 346.5)
         Predict: 1292.781717888101
        Else (feature 4 > 346.5)
         Predict: 2643.3076923076924
       Else (feature 1 > 3.5)
        If (feature 4 <= 26.5)
         Predict: 937.5397395002658
        Else (feature 4 > 26.5)
         Predict: 1198.0977011494253
      Else (feature 1 > 17.5)
       If (feature 1 <= 21.5)
        If (feature 0 <= 29.5)
         Predict: 917.4211287988422
        Else (feature 0 > 29.5)
         Predict: 1723.0375874125873
       Else (feature 1 > 21.5)
        If (feature 4 <= 3.5)
         Predict: 952.7784339457568
        Else (feature 4 > 3.5)
         Predict: 1

# **Support Vector Machines (SVM)**

In [72]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors


# Convert the label column to Double type
binary_df = binary_df.withColumn('y', binary_df['y'].cast('Double'))


We need to change the y columns to double

In [75]:
import numpy as np
from pyspark.ml.classification import LinearSVC
np.set_printoptions(precision=3, suppress=True)
clf = LinearSVC(featuresCol='features', labelCol='y')
clf_model = clf.fit(binary_df)
print(clf_model.intercept, clf_model.coefficients)

-1.000000057476241 [1.497445206471147e-10,-0.0,6.766960380352564e-10,1.4885182296856278e-10,-4.99308247508715e-09,-0.0,8.620405476582408e-09]


# **Neural Network**

In [76]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
#output layer is set to 2 because of binary target
clf = MultilayerPerceptronClassifier(featuresCol='features', labelCol='y', layers=[4,4,2])
clf_model = clf.fit(binary_df)

In [79]:
clf_model

MultilayerPerceptronClassificationModel: uid=MultilayerPerceptronClassifier_f44ccc4d03c4, numLayers=3, numClasses=2, numFeatures=4

# **One-vs-Rest classifier**
used for multi-class classifier

In [80]:
target_variable_name = "education"
multiclass_df = data.select(['age','balance','day','duration','campaign','pdays','previous','job','education'])
features_list = multiclass_df.columns

features_list.remove(target_variable_name)

#apply function on dataframe
multiclass_df = assemble_vectors(multiclass_df, features_list,target_variable_name)

In [81]:
#fitting the one-vs-rest classifier
from pyspark.ml.classification import RandomForestClassifier, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#generate test/train split
(train, test) = multiclass_df.randomSplit([0.7,0.3])

In [82]:
#initiate base classifier
clf = RandomForestClassifier(featuresCol='features', labelCol='education')

#initate One vs Rest classifier
ovr = OneVsRest(classifier=clf, featuresCol='features', labelCol='education')

#train over multiclass model
ovrModel = ovr.fit(train)

In [83]:
# score the model on test data
predictions = ovrModel.transform(test)

#obtain evaluator
evaluator = MulticlassClassificationEvaluator(metricName="accuracy", labelCol='education')

#compute the classification error on test data
accuracy = evaluator.evaluate(predictions)

print("Test Error = %g" %(1.0-accuracy))

Test Error = 0.328405


# **Naive-Bayes Classification**

In [84]:
target_variable_name = 'y'
nonneg_df = data.select(['age','day','duration','campaign','previous','y'])#excluding target and other variables

features_list = nonneg_df.columns
features_list.remove(target_variable_name)

#apply function on Dataframe
nonneg_df = assemble_vectors(nonneg_df, features_list, target_variable_name)


In [85]:
#fit naive bayes model
from pyspark.ml.classification import NaiveBayes
clf = NaiveBayes(featuresCol='features', labelCol='y')
clf_model = clf.fit(nonneg_df)