In [5]:
from pyspark.sql.session import SparkSession

# Obtain the SparkSession used by every cell below (getOrCreate reuses an already
# running session instead of starting a second one).
spark = SparkSession.builder.getOrCreate()

In [6]:
# Evaluate the session object so the notebook renders its representation.
spark

In [7]:
import pandas as pd
import numpy as np
from numpy import array
from math import sqrt
import sys
import getpass
# Login name of the current OS user. In the cells shown here it is only referenced by the
# commented-out `.format(uname=uname)` path templates in the data-loading cell.
uname=getpass.getuser()
import matplotlib.pyplot as plt

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.sql.functions import *

from pyspark.sql import functions as F


# Ml libraries
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

from itertools import chain
from pyspark.sql import Row
from pyspark.mllib.evaluation import MulticlassMetrics # evaluation of confusion matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools


In [8]:
# Location of the parquet dataset with the invoice features.
# The commented-out variant below is the absolute per-user Windows path that was used previously.
#parquet_path = r'C:\\Users\\{uname}\\Documents\\git_repos\\Analyses\\spark\\customer_timeseries\\uscan_features_full_100custs'.format(uname=uname)
# Relative path — presumably resolved against the working directory the session was started from; TODO confirm.
parquet_path = r'uscan_features_full_100custs'#.format(uname=uname)

# Load the dataset into the DataFrame `df` that the rest of the notebook transforms.
df = spark.read.parquet(parquet_path)


In [9]:
## We keep only invoices whose client_document_date is on or after 2014-01-01, because we want to include invoices for which
# we have the full life time only.
# NOTE(review): this comment previously named 2010-01-01 as the cut-off while the filter uses 2014-01-01 —
# confirm which date is intended.
# NOTE(review): the comparison is against a string literal; presumably client_document_date is stored as a
# 'yyyy-MM-dd' string so that the comparison orders dates correctly — verify.
df = df.filter(col('client_document_date') >= '2014-01-01')

### Creating the target variable

In [10]:
# Create the days-to-close variable, which the target variable is derived from.
# Both columns are parsed with the pattern "yyyy-MM-dd"; the difference in seconds is
# converted to days (86400 seconds per day) and rounded to 0 decimals.
timeFmt = "yyyy-MM-dd"
timeDiff = (F.unix_timestamp('close_date', format=timeFmt)
            - F.unix_timestamp('dates', format=timeFmt))
# F.round is spelled out explicitly: the bare name `round` only refers to the Spark column
# function because of the wildcard import of pyspark.sql.functions, which shadows the builtin.
df = df.withColumn("total_days_to_closed", F.round(timeDiff/86400, 0))

# Upper bounds (in days) of the weekly classes: 7, 14, ..., 91
lbv = list(range(7, 92, 7))

# Create the target variable:
#   class 0         : closed within lbv[0] days
#   class len(lbv)  : closed after more than lbv[-1] days
#   class k (k >= 1): lbv[k-1] < days <= lbv[k]; the label is lbv[k]/7-1, i.e. k as a float
# The branches are generated from lbv instead of being written out one by one, so changing
# the cut-off list no longer requires editing a dozen copy-pasted conditions.
days_to_closed = F.col('total_days_to_closed')
target_expr = (F.when(days_to_closed <= lbv[0], 0)
               .when(days_to_closed > lbv[-1], len(lbv)))
for k in range(1, len(lbv)):
    target_expr = target_expr.when(
        (days_to_closed <= lbv[k]) & (days_to_closed > lbv[k - 1]), lbv[k]/7-1)

# Rows that match none of the branches get the text 'nan'. Because of this string fallback the
# column is a string column; later cells cast it to float where a numeric label is needed.
df = df.withColumn('target_variable', target_expr.otherwise('nan'))


## TO DO: Incorporate extra class which includes invoices that never close (> 720 days)

In [11]:
# Count the rows per target class (label shown as float), ordered by class
df_count = (
    df.select(df.target_variable.cast('float'), 'client_unique_document_no')
    .groupby('target_variable')
    .agg(F.count('client_unique_document_no').alias('count'))
    .sort("target_variable", ascending=True)
)
df_count.show()

# Row holding the size of the smallest class
min_var = df_count.agg({"count": "min"}).collect()[0]

print(min_var["min(count)"])

## Groupby target variable and count how many client_unique_document_no sit in each category
#df.select(df.target_variable.cast('float'),'client_unique_document_no').groupby('target_variable').agg(F.countDistinct('client_unique_document_no')).sort("target_variable", ascending=True).show()

## The classes are imbalanced (see the counts below), so we need to resample to get decent results.
## We resample based on the number of rows in each class, and we resample the training data set only.

+---------------+-----+
|target_variable|count|
+---------------+-----+
|            0.0|13578|
|            1.0|11797|
|            2.0|11491|
|            3.0|10583|
|            4.0| 9201|
|            5.0| 7244|
|            6.0| 6208|
|            7.0| 5535|
|            8.0| 5028|
|            9.0| 4420|
|           10.0| 3938|
|           11.0| 3426|
|           12.0| 3096|
|           13.0|60688|
+---------------+-----+

3096


In [12]:
# Fetch a single row to inspect the available columns and their raw values.
# NOTE(review): the rendered output contains real record values — check before sharing the notebook.
df.take(1)

[Row(all_open_invoices_amt='1929.2', all_open_invoices_count='1', avg_amt_inv_closed_pw_past_10_weeks='0.0', avg_amt_inv_closed_pw_past_11_weeks='0.0', avg_amt_inv_closed_pw_past_12_weeks='0.0', avg_amt_inv_closed_pw_past_13_weeks='0.0', avg_amt_inv_closed_pw_past_1_weeks='0.0', avg_amt_inv_closed_pw_past_2_weeks='0.0', avg_amt_inv_closed_pw_past_3_weeks='0.0', avg_amt_inv_closed_pw_past_4_weeks='0.0', avg_amt_inv_closed_pw_past_5_weeks='0.0', avg_amt_inv_closed_pw_past_6_weeks='0.0', avg_amt_inv_closed_pw_past_7_weeks='0.0', avg_amt_inv_closed_pw_past_8_weeks='0.0', avg_amt_inv_closed_pw_past_9_weeks='0.0', avg_amt_inv_open_pw_past_10_weeks='192.92000000000002', avg_amt_inv_open_pw_past_11_weeks='175.38181818181818', avg_amt_inv_open_pw_past_12_weeks='160.76666666666668', avg_amt_inv_open_pw_past_13_weeks='148.4', avg_amt_inv_open_pw_past_1_weeks='0.0', avg_amt_inv_open_pw_past_2_weeks='0.0', avg_amt_inv_open_pw_past_3_weeks='0.0', avg_amt_inv_open_pw_past_4_weeks='0.0', avg_amt_inv_o

### Removing text nan from the dataset

In [13]:
# For every column in dataframe df, we replace the text 'nan' with None (a real null).
# This is done in one select() rather than one withColumn() per column: every withColumn()
# adds another projection to the query plan, and with this many columns the plan becomes
# needlessly large. Column names and column order are unchanged.
df = df.select([
    F.when(F.col(name) == 'nan', None).otherwise(F.col(name)).alias(name)
    for name in df.columns
])


In [14]:
# Function to drop columns with None values
def drop_null_columns(df):
    """
    This function drops all columns which contain null values.
    :param df: A PySpark DataFrame
    """
    null_counts = df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).collect()[0].asDict()
    to_drop = [k for k, v in null_counts.items() if v > 0]
    df = df.drop(*to_drop)
    return df

# Drop every column of df that contains at least one null value.
# NOTE(review): the earlier wording ("Drops column b2") appears to come from the linked
# StackOverflow example rather than from this dataset.
df = drop_null_columns(df)

#Source: https://stackoverflow.com/questions/51322445/how-to-drop-all-columns-with-null-values-in-a-pyspark-dataframe

### Create train test set

In [15]:
# Date-based split:
#   train = rows whose close date lies before the cut-off date
#   test  = rows whose `dates` value lies in [cut_off_date, cut_off_date2)
# The train data is used for the train / validation part further down.
from pyspark.sql.types import TimestampType

# set cut-off dates
cut_off_date = F.to_date(F.lit('2017-11-15')).cast(TimestampType())
cut_off_date2 = F.to_date(F.lit('2018-11-14')).cast(TimestampType())

# Set close_date and dates to dateformat
df = df.withColumn('close_date_dateformat',
                   F.to_date(F.unix_timestamp(F.col('close_date'), 'yyyy-MM-dd').cast("timestamp")))
df = df.withColumn('dates_dateformat',
                   F.to_date(F.unix_timestamp(F.col('dates'), 'yyyy-MM-dd').cast("timestamp")))

# Filter data
train = df.filter(F.col('close_date_dateformat') < cut_off_date)
# The former intermediate assignment `test = df.filter(close_date_dateformat >= cut_off_date)` was
# overwritten by the next statement before it was ever used, so it has been removed; the test set
# is, and always was, defined by the `dates` window only.
test = df.filter((F.col('dates_dateformat') >= cut_off_date) & (F.col('dates_dateformat') < cut_off_date2))

In [16]:
# Number of rows in the test split (this runs a Spark job).
test.count()

83453

### Resampling the training data

In [17]:
# Rows per target class in the training data, ordered by class
df_count = (
    train.select(train.target_variable.cast('float'), 'client_unique_document_no')
    .groupby('target_variable')
    .agg(F.count('client_unique_document_no').alias('count'))
    .sort("target_variable", ascending=True)
)

# Row holding the size of the smallest training class; used as the per-class sample size
cut_off_var = df_count.agg({"count": "min"}).collect()[0]

print(cut_off_var["min(count)"])


328


##### Rebalancing class 13

##### Rebalancing the full dataset

In [18]:
%%time
# specify window to give random row numbers to all rows within one class
from pyspark.sql.window import Window
windowSpec = Window.partitionBy('target_variable').orderBy(rand())
train2 = train.select('client_unique_document_no','dates','target_variable', row_number().over(windowSpec).alias('record_no'))


CPU times: user 5.84 ms, sys: 3.11 ms, total: 8.95 ms
Wall time: 98.2 ms


In [19]:
%time
# Drop rows from window that have a record_no larger than cut_off_var['min(count)']
train3 = train2.filter(col('record_no') <= (cut_off_var["min(count)"]))

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 10 µs


In [20]:
# Unpersist train2 before the name is reused in the next cell.
# NOTE(review): no cache()/persist() call on train2 is visible in this notebook, so this is
# presumably a no-op; rebinding the Python name train2 does not require it either.
train2.unpersist()

DataFrame[client_unique_document_no: string, dates: string, target_variable: string, record_no: int]

In [21]:
%%time
# left join train to train 3 using dates and client_unique_document_no as key
# train3 only carries the key columns, target_variable and record_no of the sampled rows; joining
# back to train and selecting train['*'] restores all original columns for those rows.
# NOTE(review): train3 is derived from train, so both sides of each equality come from the same
# lineage; this presumably relies on Spark resolving the self-join condition per side — verify
# with the row count in the next cell.
# NOTE(review): if (client_unique_document_no, dates) is not unique in train, the join multiplies
# rows and the classes are no longer balanced — confirm the key is unique.
cond = [train3.client_unique_document_no == train.client_unique_document_no, train3.dates == train.dates]
train2 = train3.join(train, cond, how='left').select(train['*']) 


CPU times: user 3.35 ms, sys: 2.01 ms, total: 5.37 ms
Wall time: 5.4 s


##### no rebalancing

##### Checking

In [22]:
%%time
# Row count of the rebalanced training data; this is the first action on train2, so the
# window, filter and join defined above are executed here.
train2.count()

CPU times: user 3.78 ms, sys: 3.24 ms, total: 7.01 ms
Wall time: 28.7 s


4592

In [23]:
%%time
train2.select(train2.target_variable.cast('float'),'client_unique_document_no').groupby('target_variable').agg(F.count('client_unique_document_no').alias('count')).sort("target_variable", ascending=True).show()


+---------------+-----+
|target_variable|count|
+---------------+-----+
|            0.0|  328|
|            1.0|  328|
|            2.0|  328|
|            3.0|  328|
|            4.0|  328|
|            5.0|  328|
|            6.0|  328|
|            7.0|  328|
|            8.0|  328|
|            9.0|  328|
|           10.0|  328|
|           11.0|  328|
|           12.0|  328|
|           13.0|  328|
+---------------+-----+

CPU times: user 9.16 ms, sys: 4.18 ms, total: 13.3 ms
Wall time: 23.6 s


### Splitting training dataset 'train' in train and validate

In [24]:
# The train / validation split is made per invoice, so first reduce the rebalanced data
# to one row per client_unique_document_no
unique_inv = (
    train2
    .select('client_unique_document_no')
    .dropDuplicates(['client_unique_document_no'])
)
unique_inv.count()

328

In [25]:
# Create train and validation sets at invoice level (split is 80/20)
train_fraction = 0.8

# Number of invoices that go to the train side. It is derived from the data instead of being
# hard-coded, so it stays correct when the dataset changes; limit() requires an int, hence int().
limit_ = int(unique_inv.count() * train_fraction)

# From the unique list of invoices select the train share. orderBy(rand(...)) shuffles the
# invoices before the limit is applied; the seed keeps the selection stable between runs.
# NOTE(review): a seeded rand() is only repeatable while the input partitioning stays the same.
unique_inv_train = unique_inv.orderBy(rand(seed=42)).limit(limit_)

# Label the invoices as train
id_ = 'train'
unique_inv_train = unique_inv_train.withColumn('train_val', lit(id_))

# Join with the original dataframe: every row of train2 is kept (right join) and rows of invoices
# that were not selected above get a null train_val
cond = [unique_inv_train.client_unique_document_no == train2.client_unique_document_no]
try_ = unique_inv_train.join(train2, cond, how='right').select(train2['*'],unique_inv_train['train_val']) 

# Create train and validation dataframe using the train_val label
unique_inv_validate = try_.filter("train_val is null")
unique_inv_train = try_.filter("train_val is not null")

####TO DO: find way to not have two client_unique_document_no columns


### Transform data

In [26]:
# One StringIndexer per categorical column; each writes its numeric index to "<column>_index"
from pyspark.ml.feature import StringIndexer
l_indexer1, l_indexer2, l_indexer3 = (
    StringIndexer(inputCol=categorical, outputCol=categorical + '_index')
    for categorical in ('disputed_final', 'dow_due', 'dow_opened')
)


In [27]:
# convert training data
unique_inv_train = l_indexer1.fit(unique_inv_train).transform(unique_inv_train)
unique_inv_train = l_indexer2.fit(unique_inv_train).transform(unique_inv_train)
unique_inv_train = l_indexer3.fit(unique_inv_train).transform(unique_inv_train)


In [28]:
# convert validation data
unique_inv_validate = l_indexer1.fit(unique_inv_validate).transform(unique_inv_validate)
unique_inv_validate = l_indexer2.fit(unique_inv_validate).transform(unique_inv_validate)
unique_inv_validate = l_indexer3.fit(unique_inv_validate).transform(unique_inv_validate)


In [29]:
# Create unique_inv_test as test. This is to ensure the below works and creates the df unique_inv_test. 
unique_inv_test = test

In [30]:
# Convert testing data
unique_inv_test = l_indexer1.fit(unique_inv_test).transform(unique_inv_test)
unique_inv_test = l_indexer2.fit(unique_inv_test).transform(unique_inv_test)
unique_inv_test = l_indexer3.fit(unique_inv_test).transform(unique_inv_test)

# The StringIndexer converts a string column to a double column in which every category is mapped to a numeric index.
# The original labels are stored in the column metadata and can be read back from there.
# For tree-based algorithms a StringIndexer alone is sufficient. For other types of algorithms a OneHotEncoder
# has to be applied afterwards.
#https://stackoverflow.com/questions/32277576/how-to-handle-categorical-features-with-spark-ml

In [31]:
# Candidate features for the vector assembler: start from every column of the validation frame
feat_list = list(unique_inv_validate.columns)

# Columns that are not features and therefore have to be taken out again
to_drop = [
    'train_val',
    'client_debtor_name', 'client_debtor_number', 'client_document_date',
    'close_date', 'dates', 'order_to_cash_final', 'term_cost', 'term_default',
    'total_days_to_closed', 'close_date_dateformat', 'dates_dateformat',
    'dpd_final',
    # Remove because of formatting issues: (need to convert to categorical variables)
    'disputed_final', 'dow_due', 'dow_opened',
    'due_date',
    # Not dropped here because they are still needed later on:
    # 'target_variable', 'client_unique_document_no'
]

# list.remove raises ValueError when a name is missing, which flags a stale to_drop entry
for non_feature in to_drop:
    feat_list.remove(non_feature)

#print(len(feat_list))

In [32]:
# Print the remaining column names, one per line, to check the feature list by eye.
for i in feat_list:
    print(i)

avg_amt_inv_closed_pw_past_10_weeks
avg_amt_inv_closed_pw_past_11_weeks
avg_amt_inv_closed_pw_past_12_weeks
avg_amt_inv_closed_pw_past_13_weeks
avg_amt_inv_closed_pw_past_1_weeks
avg_amt_inv_closed_pw_past_2_weeks
avg_amt_inv_closed_pw_past_3_weeks
avg_amt_inv_closed_pw_past_4_weeks
avg_amt_inv_closed_pw_past_5_weeks
avg_amt_inv_closed_pw_past_6_weeks
avg_amt_inv_closed_pw_past_7_weeks
avg_amt_inv_closed_pw_past_8_weeks
avg_amt_inv_closed_pw_past_9_weeks
avg_amt_inv_open_pw_past_10_weeks
avg_amt_inv_open_pw_past_11_weeks
avg_amt_inv_open_pw_past_12_weeks
avg_amt_inv_open_pw_past_13_weeks
avg_amt_inv_open_pw_past_1_weeks
avg_amt_inv_open_pw_past_2_weeks
avg_amt_inv_open_pw_past_3_weeks
avg_amt_inv_open_pw_past_4_weeks
avg_amt_inv_open_pw_past_5_weeks
avg_amt_inv_open_pw_past_6_weeks
avg_amt_inv_open_pw_past_7_weeks
avg_amt_inv_open_pw_past_8_weeks
avg_amt_inv_open_pw_past_9_weeks
avg_amt_inv_overdue_pw_past_10_weeks
avg_amt_inv_overdue_pw_past_11_weeks
avg_amt_inv_overdue_pw_past_12_wee

In [33]:
# Cast every column in feat_list to float so the VectorAssembler can consume them later on.
# At this point feat_list still contains target_variable and client_unique_document_no.
from pyspark.sql.types import IntegerType

float_columns = [col(name).cast('float') for name in feat_list]

unique_inv_train = unique_inv_train.select(*float_columns)
unique_inv_validate = unique_inv_validate.select(*float_columns)
unique_inv_test = unique_inv_test.select(*float_columns)

unique_inv_train.printSchema()


root
 |-- avg_amt_inv_closed_pw_past_10_weeks: float (nullable = true)
 |-- avg_amt_inv_closed_pw_past_11_weeks: float (nullable = true)
 |-- avg_amt_inv_closed_pw_past_12_weeks: float (nullable = true)
 |-- avg_amt_inv_closed_pw_past_13_weeks: float (nullable = true)
 |-- avg_amt_inv_closed_pw_past_1_weeks: float (nullable = true)
 |-- avg_amt_inv_closed_pw_past_2_weeks: float (nullable = true)
 |-- avg_amt_inv_closed_pw_past_3_weeks: float (nullable = true)
 |-- avg_amt_inv_closed_pw_past_4_weeks: float (nullable = true)
 |-- avg_amt_inv_closed_pw_past_5_weeks: float (nullable = true)
 |-- avg_amt_inv_closed_pw_past_6_weeks: float (nullable = true)
 |-- avg_amt_inv_closed_pw_past_7_weeks: float (nullable = true)
 |-- avg_amt_inv_closed_pw_past_8_weeks: float (nullable = true)
 |-- avg_amt_inv_closed_pw_past_9_weeks: float (nullable = true)
 |-- avg_amt_inv_open_pw_past_10_weeks: float (nullable = true)
 |-- avg_amt_inv_open_pw_past_11_weeks: float (nullable = true)
 |-- avg_amt_inv_o

In [34]:
# Remove target variable and client_unique_document_no from the feature list.
# The list is rebuilt instead of calling list.remove(), so this cell can be run again without
# failing: list.remove() raises ValueError once the names have already been removed.
non_feature_cols = ('target_variable', 'client_unique_document_no')
feat_list = [name for name in feat_list if name not in non_feature_cols]

In [35]:
# Assemble the columns listed in feat_list into a single vector column named "features"
vector_assembler = VectorAssembler(outputCol="features", inputCols=feat_list)

# Apply the same assembler to the train, validation and test frames
df_train_temp, df_validate_temp, df_test_temp = (
    vector_assembler.transform(frame)
    for frame in (unique_inv_train, unique_inv_validate, unique_inv_test)
)

In [36]:
# The model input consists of just the assembled feature vector and the label
model_columns = ['features', 'target_variable']

df_train = df_train_temp.select(*model_columns)
df_validate = df_validate_temp.select(*model_columns)
df_test = df_test_temp.select(*model_columns)

In [37]:
# Number of partitions backing df_train.
df_train.rdd.getNumPartitions()
# when rebalancing 13/14 partitions

200

In [38]:
# Number of training rows that go into the model (this runs a Spark job).
df_train.count()

3711

### The random Forest Model

For the differences between the `pyspark.ml` (DataFrame-based) and `pyspark.mllib` (RDD-based) packages, see this StackOverflow answer:
https://stackoverflow.com/questions/43240539/pyspark-mllib-versus-pyspark-ml-packages/43241691

#### ml libraries
https://spark.apache.org/docs/1.6.0/ml-classification-regression.html#random-forest-classifier

##### Train - fit the model

In [39]:
# Random forest on the assembled feature vector, with target_variable as the label column
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="target_variable",
    numTrees=50,
    maxDepth=8,
    minInfoGain=0.0,
)

## Algorithm options:
##self, featuresCol="features", labelCol="label", predictionCol="prediction", probabilityCol="probability"
##, rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0
##, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20
##, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0

In [40]:
%%time
# Train model.
# fit() executes the Spark plan behind df_train and trains the forest configured in `rf`.
# NOTE(review): the saved output of this cell shows py4j failing to connect to the Java server,
# i.e. the JVM was no longer reachable during this run — possibly the heap-space problem
# mentioned below also occurs at the current maxDepth; confirm and clear the output before sharing.
model = rf.fit(df_train)

# model cannot be fit locally with MaxDepth = 10 as this leads to a Java Heap Space error. 

--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pysp

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/anusha/

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:60223)
Traceback (most recent call last):
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1246, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/anusha/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py", lin

Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:60223)

##### Validate - on the validation set

In [None]:
%%time
# Score the validation split (df_validate) with the fitted model. The result
# is used below via its "prediction", "target_variable" and "features" columns.
val_predictions = model.transform(df_validate)


In [None]:
%%time
# Preview five validation rows: the predicted class, the target_variable
# label, and the feature vector the model was given.
val_predictions.select("prediction", "target_variable", "features").show(5)


In [None]:
%%time
# Validation accuracy: compare the "prediction" column against the
# "target_variable" label on the validation predictions.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="target_variable",
)
accuracy = evaluator.evaluate(val_predictions)
print("Accuracy = %g" % accuracy)

# Data since 2010, 1 year of test data, no rebalancing: 38%
# Data since 2010, 1 year of test data, rebalancing class 13: 23%
# Data since 2010, 1 year of test data, rebalancing all classes: 17%
# Data since 2010, 1 year of test data, rebalancing all classes and adding cat variables: 16%
# Data since 2010, 1 year of test data, no rebalancing with cat variables: 36%
# 1 year of test data, rebalancing all classes with cat variables: 16%

In [None]:
%%time
# Validation F1: same evaluator setup as the accuracy cell, with the
# metric switched to "f1".
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    predictionCol="prediction",
    labelCol="target_variable",
)
f1 = evaluator.evaluate(val_predictions)
print("f1 score = %g" % f1)

# Data since 2010, 1 year of test data, no rebalancing 26%
# Data since 2010, 1 year of test data, rebalancing class 13: 16%
# Data since 2010, 1 year of test data, rebalancing all classes: 13%
# Data since 2010, 1 year of test data, rebalancing all classes and adding cat variables: 14% 
# Data since 2010, 1 year of test data, no rebalancing with cat variables: 25%
# 1 year of test data, rebalancing all classes with cat variables: 13%

##### Test - on unseen data

In [None]:
%%time
# Score the test split (df_test) with the same fitted model. The result feeds
# the test metrics, the feature-importance mapping and the confusion matrix below.
test_predictions = model.transform(df_test)


In [None]:
%%time
# Preview five test rows: the predicted class, the target_variable label,
# and the feature vector the model was given.
test_predictions.select("prediction", "target_variable", "features").show(5)


In [None]:
%%time
# Test accuracy: the number of correctly classified rows divided by the
# total number of rows in the test predictions.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="target_variable",
)
accuracy = evaluator.evaluate(test_predictions)
print("accuracy = %g" % accuracy)



In [None]:
%%time
# Test F1: same evaluator setup as the accuracy cell, with the metric
# switched to "f1".
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    predictionCol="prediction",
    labelCol="target_variable",
)
f1 = evaluator.evaluate(test_predictions)
print("f1 score = %g" % f1)


In [None]:
# Map each slot of the "features" vector back to its column name, pair it with
# the model's importance score, and collect the result in a Spark DataFrame.
#
# The "ml_attr" metadata of the features column holds several attribute lists
# (the dict values); they are chained together and sorted on "idx" so the
# names line up with the positions in the vector.
attrs = sorted(
    (attr["idx"], attr["name"])
    for attr in chain(
        *test_predictions.schema["features"].metadata["ml_attr"]["attrs"].values()
    )
)

# featureImportances is read once here rather than once per feature inside
# the comprehension: every access goes through the Java gateway.
importances = model.featureImportances

var1 = [
    {'feature': name, 'importance': float(importances[idx])}
    for idx, name in attrs
]

# One Row per feature; columns are 'feature' and 'importance'.
data_df = spark.createDataFrame([Row(**x) for x in var1])
#data_df.take(5)

#https://stackoverflow.com/questions/52238803/how-to-convert-list-of-dictionaries-into-spark-dataframe
#https://stackoverflow.com/questions/50937591/pyspark-random-forest-feature-importance-mapping-after-column-transformations

In [None]:
# The 20 most important features, highest importance first, returned to the
# notebook as a list of Rows.
ranked_features = data_df.select('feature', 'importance').sort(
    "importance", ascending=False
)
ranked_features.take(20)


##### Plot confusion matrix

In [None]:
#confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as a heat map on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix. On the plot, rows are labelled 'True label' and
        columns 'Predicted label'.
    classes : sequence
        Tick labels used on both axes, one entry per class.
    normalize : bool, default False
        If True, every row is divided by its row sum so the cells show
        per-row fractions. Rows whose sum is zero are left as all zeros.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None
        The matrix is printed and drawn through pyplot; creating the figure
        and calling ``plt.show()`` is left to the caller.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A row with no samples has a zero total. Dividing by it would fill
        # the row with NaN, which also turns the max-based threshold below
        # into NaN, so such rows are kept at zero instead.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Counts can arrive as floats (e.g. a matrix converted with toArray()),
    # so they are printed without decimals; a bare 'f' would print six.
    fmt = '.2f' if normalize else '.0f'
    # Cells darker than half the maximum get white text so it stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

In [None]:
# MulticlassMetrics takes an RDD of (prediction, label) pairs and loads them
# into a DataFrame whose two fields are DoubleType. target_variable is built
# from integer literals earlier in the notebook, so it is cast to double here;
# without the cast, integer labels are rejected by the schema check.
# The cast is a no-op if the column is already double.
metrics = MulticlassMetrics(
    test_predictions
    .select(
        "prediction",
        F.col("target_variable").cast("double").alias("target_variable"),
    )
    .rdd
)


In [None]:
# Pull the confusion matrix out of the metrics object and convert it with
# toArray() so it can be passed to plot_confusion_matrix.
cnf_matrix = metrics.confusionMatrix().toArray()

In [None]:
# Tick labels for the confusion matrix: the integers 0 through 13.
# NOTE(review): 14 presumably matches the number of target classes - confirm
# against the target-variable definition.
class_names = list(range(0, 14))

In [None]:
# Draw the raw-count confusion matrix for the test predictions on a new figure.
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Row-normalized variant, currently disabled. Uncomment the three lines below
# to draw it on a second figure.
#plt.figure()
#plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
#                      title='Normalized confusion matrix')

# Render whatever figures were created above.
plt.show()

