In [3]:
 
#import packages

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import yaml
import pandas as pd

#local session 
spark = (SparkSession.builder.master("local[2]").appName("logistic-regression").getOrCreate())

# Set the data path
with open("../../config.yaml") as f:
    config = yaml.safe_load(f)

rescue_path_parquet = config["rescue_clean_path"]
# Read in the data
rescue = spark.read.parquet(rescue_path_parquet)

rescue.limit(5).toPandas()



Unnamed: 0,incident_number,datetimeofcall,cal_year,finyear,typeofincident,engine_count,job_hours,hourly_cost,total_cost,finaldescription,...,originofcall,propertytype,propertycategory,specialservicetypecategory,specialservicetype,ward,borough,stngroundname,postcodedistrict,incident_duration
0,139091,01/01/2009 03:01,2009,2008/09,Special Service,1.0,2.0,255,510.0,"Dog With Jaw Trapped In Magazine Rack,b15",...,Person (land Line),House - Single Occupancy,Dwelling,Other Animal Assistance,Animal Assistance Involving Livestock - Other ...,Crystal Palace & Upper Norwood,Croydon,Norbury,Se19,2.0
1,275091,01/01/2009 08:51,2009,2008/09,Special Service,1.0,1.0,255,255.0,"Assist Rspca With Fox Trapped,b15",...,Person (land Line),Railings,Outdoor Structure,Other Animal Assistance,Animal Assistance Involving Livestock - Other ...,Woodside,Croydon,Woodside,Se25,1.0
2,2075091,04/01/2009 10:07,2009,2008/09,Special Service,1.0,1.0,255,255.0,"Dog Caught In Drain,b15",...,Person (mobile),Pipe Or Drain,Outdoor Structure,Animal Rescue From Below Ground,Animal Rescue From Below Ground - Domestic Pet,Carshalton Central,Sutton,Wallington,Sm5,1.0
3,2872091,05/01/2009 12:27,2009,2008/09,Special Service,1.0,1.0,255,255.0,"Horse Trapped In Lake,j17",...,Person (mobile),"Intensive Farming Sheds (chickens, Pigs Etc)",Non Residential,Animal Rescue From Water,Animal Rescue From Water - Farm Animal,Harefield,Hillingdon,Ruislip,Ub9,1.0
4,3553091,06/01/2009 15:23,2009,2008/09,Special Service,1.0,1.0,255,255.0,"Rabbit Trapped Under Sofa,b15",...,Person (mobile),House - Single Occupancy,Dwelling,Other Animal Assistance,Animal Assistance Involving Livestock - Other ...,Gooshays,Havering,Harold Hill,Rm3,1.0


In [4]:
rescue_path_parquet

'file:////home/cdsw/ons-spark/ons-spark/data/rescue_clean.parquet'

In [2]:

# Create is_cat column to contain target variable and select relevant predictors
rescue_cat = rescue.withColumn('is_cat', 
                               F.when(F.col('animal_group')=="Cat", 1)
                               .otherwise(0)).select("typeofincident", 
                              "engine_count", 
                              "job_hours", 
                              "hourly_cost", 
                              "total_cost", 
                              "originofcall", 
                              "propertycategory",
                              "specialservicetypecategory",
                              "incident_duration",
                              "is_cat")

# Check created column
rescue_cat.limit(20).toPandas()

# Check data types
rescue_cat.printSchema()


root
 |-- typeofincident: string (nullable = true)
 |-- engine_count: string (nullable = true)
 |-- job_hours: string (nullable = true)
 |-- hourly_cost: string (nullable = true)
 |-- total_cost: string (nullable = true)
 |-- originofcall: string (nullable = true)
 |-- propertycategory: string (nullable = true)
 |-- specialservicetypecategory: string (nullable = true)
 |-- incident_duration: string (nullable = true)
 |-- is_cat: integer (nullable = false)



In [3]:

# Convert engine_count, job_hours, hourly_cost, total_cost and incident_duration columns to numeric
rescue_cat = (
  rescue_cat.withColumn("engine_count", F.col("engine_count").cast("double"))
            .withColumn("job_hours", F.col("job_hours").cast("double"))
            .withColumn("hourly_cost", F.col("hourly_cost").cast("double"))
            .withColumn("total_cost", F.col("total_cost").cast("double"))
            .withColumn("incident_duration", F.col("incident_duration").cast("double")))


# Check data types are now correct
rescue_cat.printSchema()



root
 |-- typeofincident: string (nullable = true)
 |-- engine_count: double (nullable = true)
 |-- job_hours: double (nullable = true)
 |-- hourly_cost: double (nullable = true)
 |-- total_cost: double (nullable = true)
 |-- originofcall: string (nullable = true)
 |-- propertycategory: string (nullable = true)
 |-- specialservicetypecategory: string (nullable = true)
 |-- incident_duration: double (nullable = true)
 |-- is_cat: integer (nullable = false)



In [4]:

# Get the count of missing values for each column
missing_summary = (
    rescue_cat
    .select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in rescue_cat.columns])
)

# Show the summary
missing_summary.show(vertical = True)

# We can see that these are all on the same rows by filtering for NAs in one of the columns:
rescue_cat.filter(rescue_cat.total_cost.isNull()).limit(38).toPandas()

# For simplicity, we will just filter out these rows:
rescue_cat = rescue_cat.na.drop()

# Double check we have no nulls left:
missing_summary = (
    rescue_cat
    .select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in rescue_cat.columns])
    .show(vertical = True)
)


-RECORD 0-------------------------
 typeofincident             | 0   
 engine_count               | 38  
 job_hours                  | 38  
 hourly_cost                | 0   
 total_cost                 | 38  
 originofcall               | 0   
 propertycategory           | 0   
 specialservicetypecategory | 0   
 incident_duration          | 38  
 is_cat                     | 0   

-RECORD 0-------------------------
 typeofincident             | 0   
 engine_count               | 0   
 job_hours                  | 0   
 hourly_cost                | 0   
 total_cost                 | 0   
 originofcall               | 0   
 propertycategory           | 0   
 specialservicetypecategory | 0   
 incident_duration          | 0   
 is_cat                     | 0   



In [5]:

# Importing the required libraries - replace OneHotEncoderEstimator with OneHotEncoder if using Spark >= 3.0
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoderEstimator

## First we call the StringIndexer separately for each categorical variable

# Indexing the specialservicetypecategory column
serviceIdx = StringIndexer(inputCol='specialservicetypecategory',
                               outputCol='serviceIndex')

# Indexing the originofcallcolumn
callIdx = StringIndexer(inputCol='originofcall',
                               outputCol='callIndex')

# Indexing the propertycategory column
propertyIdx = StringIndexer(inputCol='propertycategory',
                               outputCol='propertyIndex')
                               
# Apply indexing to each column one by one

rescue_cat_indexed = serviceIdx.fit(rescue_cat).transform(rescue_cat)
rescue_cat_indexed = callIdx.fit(rescue_cat_indexed).transform(rescue_cat_indexed)
rescue_cat_indexed = propertyIdx.fit(rescue_cat_indexed).transform(rescue_cat_indexed)

# Check that this has worked correctly
rescue_cat_indexed.select('is_cat', 'specialservicetypecategory', 'originofcall',
                          'propertycategory', 'serviceIndex', 'callIndex', 
                          'propertyIndex').show(10, truncate = False)


+------+-------------------------------+------------------+-----------------+------------+---------+-------------+
|is_cat|specialservicetypecategory     |originofcall      |propertycategory |serviceIndex|callIndex|propertyIndex|
+------+-------------------------------+------------------+-----------------+------------+---------+-------------+
|0     |Other Animal Assistance        |Person (land Line)|Dwelling         |0.0         |1.0      |0.0          |
|0     |Other Animal Assistance        |Person (land Line)|Outdoor Structure|0.0         |1.0      |3.0          |
|0     |Animal Rescue From Below Ground|Person (mobile)   |Outdoor Structure|2.0         |0.0      |3.0          |
|0     |Animal Rescue From Water       |Person (mobile)   |Non Residential  |3.0         |0.0      |2.0          |
|0     |Other Animal Assistance        |Person (mobile)   |Dwelling         |0.0         |0.0      |0.0          |
|0     |Other Animal Assistance        |Person (land Line)|Dwelling         |0.0

In [6]:

# Apply OneHotEncoderEstimator to each categorical column simultaneously
# Replace OneHotEncoderEstimator with OneHotEncoder if using Spark >= 3.0
encoder = OneHotEncoderEstimator(inputCols = ['serviceIndex', 'callIndex', 'propertyIndex'], 
                                 outputCols = ['serviceVec', 'callVec', 'propertyVec'])

rescue_cat_ohe = encoder.fit(rescue_cat_indexed).transform(rescue_cat_indexed)

# Check that this has worked correctly 
rescue_cat_ohe.select('is_cat', 'specialservicetypecategory', 'originofcall',
                          'propertycategory', 'serviceVec', 'callVec', 
                          'propertyVec').show(10, truncate = False)



+------+-------------------------------+------------------+-----------------+-------------+-------------+-------------+
|is_cat|specialservicetypecategory     |originofcall      |propertycategory |serviceVec   |callVec      |propertyVec  |
+------+-------------------------------+------------------+-----------------+-------------+-------------+-------------+
|0     |Other Animal Assistance        |Person (land Line)|Dwelling         |(3,[0],[1.0])|(7,[1],[1.0])|(6,[0],[1.0])|
|0     |Other Animal Assistance        |Person (land Line)|Outdoor Structure|(3,[0],[1.0])|(7,[1],[1.0])|(6,[3],[1.0])|
|0     |Animal Rescue From Below Ground|Person (mobile)   |Outdoor Structure|(3,[2],[1.0])|(7,[0],[1.0])|(6,[3],[1.0])|
|0     |Animal Rescue From Water       |Person (mobile)   |Non Residential  |(3,[],[])    |(7,[0],[1.0])|(6,[2],[1.0])|
|0     |Other Animal Assistance        |Person (mobile)   |Dwelling         |(3,[0],[1.0])|(7,[0],[1.0])|(6,[0],[1.0])|
|0     |Other Animal Assistance        |

In [7]:


# Call 'VectorAssembler' to vectorise all predictor columns in dataset
assembler = VectorAssembler(inputCols=['engine_count', 'job_hours', 'hourly_cost',
                                       'callVec', 'propertyVec', 'serviceVec'],
                            outputCol = "features")
 
# Apply vectorisation                                      
rescue_cat_vectorised = assembler.transform(rescue_cat_ohe)

# Rename "is_cat" target variable column to "label" ready to pass to the regression model
rescue_cat_final = rescue_cat_vectorised.withColumnRenamed("is_cat", "label").select("label", "features")



In [8]:

# Import GeneralizedLinearRegression
from pyspark.ml.regression import GeneralizedLinearRegression

# Define model - specify family and link as shown for logistic regression
glr = GeneralizedLinearRegression(family="binomial", link="logit")


In [9]:

# Run model
model = glr.fit(rescue_cat_final)

# Get model results
model_output = model.transform(rescue_cat_final)
model_output.show(10)


+-----+--------------------+-------------------+
|label|            features|         prediction|
+-----+--------------------+-------------------+
|    0|(19,[0,1,2,4,10,1...| 0.5837340059218253|
|    0|(19,[0,1,2,4,13,1...| 0.2558840192691791|
|    0|(19,[0,1,2,3,13,1...|0.31990195367533936|
|    0|(19,[0,1,2,3,12],...|0.15861715479284585|
|    0|(19,[0,1,2,3,10,1...|  0.557249095551789|
|    0|(19,[0,1,2,4,10,1...| 0.5898814716386804|
|    0|(19,[0,1,2,4,11,1...| 0.4056895632593727|
|    0|(19,[0,1,2,3,11],...| 0.1809701223341755|
|    0|(19,[0,1,2,4,10,1...| 0.6870145858037298|
|    0|(19,[0,1,2,3,11],...| 0.1809701223341755|
+-----+--------------------+-------------------+
only showing top 10 rows



In [10]:

# Get model summary
summary = model.summary

# Show summary
summary


Coefficients:
             Feature Estimate   Std Error T Value P Value
         (Intercept) -20.7632  32614.1522 -0.0006  0.9995
        engine_count  -0.6127      0.2744 -2.2328  0.0256
           job_hours  -0.0254      0.0609 -0.4160  0.6774
         hourly_cost  -0.0007      0.0010 -0.7110  0.4771
callVec_Person (m...  21.5650  32614.1521  0.0007  0.9995
callVec_Person (l...  21.6985  32614.1521  0.0007  0.9995
      callVec_Police  20.6004  32614.1521  0.0006  0.9995
   callVec_Other Frs  20.8927  32614.1521  0.0006  0.9995
callVec_Person (r...  22.2337  32614.1522  0.0007  0.9995
   callVec_Ambulance  21.4983  32614.1522  0.0007  0.9995
   callVec_Not Known  -3.4161 357614.2975  0.0000  1.0000
propertyVec_Dwelling  -0.7497      1.4258 -0.5258  0.5990
 propertyVec_Outdoor  -1.4950      1.4245 -1.0495  0.2939
propertyVec_Non R...  -1.6538      1.4277 -1.1584  0.2467
propertyVec_Outdo...  -2.1807      1.4295 -1.5255  0.1271
propertyVec_Road ...  -0.5467      1.4325 -0.3817  0.7027


In [11]:

# Get model output
model_output = model.transform(rescue_cat_final)

# Get feature names from the model output metadata
# Numeric and binary (categorical) metadata are accessed separately
numeric_metadata = model_output.select("features").schema[0].metadata.get('ml_attr').get('attrs').get('numeric')
binary_metadata = model_output.select("features").schema[0].metadata.get('ml_attr').get('attrs').get('binary')

# Merge the numeric and binary metadata lists to get all the feature names
merge_list = numeric_metadata + binary_metadata

# Convert the feature name list to a Pandas dataframe
full_summary = pd.DataFrame(merge_list)

# Get the regression coefficients from the model
full_summary['coefficients'] = model.coefficients

# The intercept coefficient needs to be added in separately since it is not part of the features metadata
# Define a new row for the intercept coefficient and get value from model
intercept = pd.DataFrame({'name':'intercept', 'coefficients':model.intercept}, index = [0])

# Add new row to the top of the full_summary dataframe
full_summary = pd.concat([intercept,full_summary.loc[:]]).reset_index(drop=True)

# Add standard errors, t-values and p-values from summary into the full_summary dataframe:
full_summary['std_error'] = summary.coefficientStandardErrors
full_summary['tvalues'] = summary.tValues
full_summary['pvalues'] = summary.pValues

# Manually calculate upper and lower confidence bounds and add into dataframe
full_summary['upper_ci'] = full_summary['coefficients'] + (1.96*full_summary['std_error'])
full_summary['lower_ci'] = full_summary['coefficients'] - (1.96*full_summary['std_error'])

# View final model summary
full_summary


Unnamed: 0,coefficients,idx,name,std_error,tvalues,pvalues,upper_ci,lower_ci
0,-20.763158,,intercept,0.274393,-2.23278,0.02556342,-20.225347,-21.300969
1,-0.61266,0.0,engine_count,0.060947,-0.416007,0.6774046,-0.493203,-0.732116
2,-0.025354,1.0,job_hours,0.000985,-0.711028,0.4770669,-0.023424,-0.027285
3,-0.0007,2.0,hourly_cost,32614.152138,0.000661,0.9994724,63923.737491,-63923.738891
4,21.564985,3.0,callVec_Person (mobile),32614.152138,0.000665,0.9994692,63945.303176,-63902.173206
5,21.698456,4.0,callVec_Person (land Line),32614.152139,0.000632,0.999496,63945.436648,-63902.039737
6,20.600364,5.0,callVec_Police,32614.15214,0.000641,0.9994889,63944.338559,-63903.137831
7,20.89271,6.0,callVec_Other Frs,32614.152176,0.000682,0.9994561,63944.630974,-63902.845555
8,22.233733,7.0,callVec_Person (running Call),32614.152169,0.000659,0.9994741,63945.971985,-63901.504518
9,21.498271,8.0,callVec_Ambulance,357614.29751,-1e-05,0.9999924,700945.52139,-700902.524848


In [12]:
 
# Add the `typeofincident` categorical column into analysis 
# Setup column indexing
incidentIdx = StringIndexer(inputCol='typeofincident',
                               outputCol='incidentIndex')

# Call the string indexer 
rescue_cat_singular_indexed = incidentIdx.fit(rescue_cat_indexed).transform(rescue_cat_indexed)

# Setup one-hot encoding
encoder_singular = OneHotEncoderEstimator(inputCols = ['incidentIndex'], 
                                 outputCols = ['incidentVec'])
                      
# The following returns an error "The input column incidentIndex should have at least two distinct values":
rescue_cat_singular_ohe = encoder_singular.fit(rescue_cat_singular_indexed).transform(rescue_cat_singular_indexed)



IllegalArgumentException: 'requirement failed: The input column incidentIndex should have at least two distinct values.'

In [13]:

# Convert typeofincident column into numeric value
rescue_cat_singular = rescue_cat_ohe.withColumn('typeofincident', F.when(F.col('typeofincident')=="Special Service", 1)
                               .otherwise(0))

# Setup the vectorassembler to include this variable in the features column
assembler = VectorAssembler(inputCols=['typeofincident', 'engine_count', 'job_hours', 'hourly_cost', 
                                       'callVec', 'propertyVec', 'serviceVec'], 
                           outputCol = "features")

rescue_cat_vectorised_sing = assembler.transform(rescue_cat_singular)


rescue_cat_final_sing = rescue_cat_vectorised_sing.withColumnRenamed("is_cat", "label").select("label", "features")

# Run the model
model_sing = glr.fit(rescue_cat_final_sing)

# Return model summary (will give an error)
summary_sing = model_sing.summary

summary_sing



Py4JJavaError: An error occurred while calling o735.toString.
: java.lang.UnsupportedOperationException: No summary available for this GeneralizedLinearRegressionModel
	at org.apache.spark.ml.regression.GeneralizedLinearRegressionTrainingSummary.toString(GeneralizedLinearRegression.scala:1571)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [14]:

rescue_cat.groupBy("specialservicetypecategory").count().orderBy("count").show(truncate = False)


+-------------------------------+-----+
|specialservicetypecategory     |count|
+-------------------------------+-----+
|Animal Rescue From Water       |343  |
|Animal Rescue From Below Ground|593  |
|Animal Rescue From Height      |2123 |
|Other Animal Assistance        |2801 |
+-------------------------------+-----+



In [15]:

# Add "000_" prefix to selected reference categories

rescue_cat_reindex = (rescue_cat
                      .withColumn('specialservicetypecategory', 
                                       F.when(F.col('specialservicetypecategory')=="Other Animal Assistance", "000_Other Animal Assistance")
                                       .otherwise(F.col('specialservicetypecategory')))
                      .withColumn('originofcall', 
                                       F.when(F.col('originofcall') == "Person (mobile)", "000_Person (mobile)")
                                       .otherwise(F.col('originofcall')))
                      .withColumn('propertycategory', 
                                       F.when(F.col('propertycategory') == "Dwelling", "000_Dwelling")
                                       .otherwise(F.col('propertycategory'))))

# Check prefix additions 
rescue_cat_reindex.select('specialservicetypecategory', 'originofcall', 'propertycategory').show(20)

# Use stringOrderType argument of StringIndexer

# Re-indexing the specialservicetypecategory column
serviceIdx = StringIndexer(inputCol='specialservicetypecategory',
                               outputCol='serviceIndex', 
                               stringOrderType = "alphabetDesc")

# Indexing the originofcallcolumn
callIdx = StringIndexer(inputCol='originofcall',
                               outputCol='callIndex',
                               stringOrderType = "alphabetDesc")

# Indexing the propertycategory column
propertyIdx = StringIndexer(inputCol='propertycategory',
                               outputCol='propertyIndex', 
                               stringOrderType = "alphabetDesc")

# Call indexing for each column one by one

rescue_cat_indexed = serviceIdx.fit(rescue_cat_reindex).transform(rescue_cat_reindex)
rescue_cat_indexed = callIdx.fit(rescue_cat_indexed).transform(rescue_cat_indexed)
rescue_cat_indexed = propertyIdx.fit(rescue_cat_indexed).transform(rescue_cat_indexed)


+--------------------------+-------------------+-----------------+
|specialservicetypecategory|       originofcall| propertycategory|
+--------------------------+-------------------+-----------------+
|      000_Other Animal ...| Person (land Line)|     000_Dwelling|
|      000_Other Animal ...| Person (land Line)|Outdoor Structure|
|      Animal Rescue Fro...|000_Person (mobile)|Outdoor Structure|
|      Animal Rescue Fro...|000_Person (mobile)|  Non Residential|
|      000_Other Animal ...|000_Person (mobile)|     000_Dwelling|
|      000_Other Animal ...| Person (land Line)|     000_Dwelling|
|      000_Other Animal ...| Person (land Line)|          Outdoor|
|      Animal Rescue Fro...|000_Person (mobile)|          Outdoor|
|      Animal Rescue Fro...| Person (land Line)|     000_Dwelling|
|      Animal Rescue Fro...|000_Person (mobile)|          Outdoor|
|      Animal Rescue Fro...| Person (land Line)|          Outdoor|
|      Animal Rescue Fro...|000_Person (mobile)|          Outd

In [16]:

# Encode re-indexed columns
encoder = OneHotEncoderEstimator(inputCols = ['serviceIndex', 'callIndex', 'propertyIndex'], 
                                 outputCols = ['serviceVec', 'callVec', 'propertyVec'])

rescue_cat_ohe = encoder.fit(rescue_cat_indexed).transform(rescue_cat_indexed)


# Vectorize all our predictors into a new column called "features" 

assembler = VectorAssembler(inputCols=['engine_count', 'job_hours', 'hourly_cost', 
                                       'callVec', 'propertyVec', 'serviceVec'], 
                           outputCol = "features")

rescue_cat_vectorised = assembler.transform(rescue_cat_ohe)

# Rename target variable "is_cat" to "label" ready to run regression model
rescue_cat_final = rescue_cat_vectorised.withColumnRenamed("is_cat", "label").select("label", "features")

# Run the model again
model = glr.fit(rescue_cat_final)

# Show summary
model.summary



Coefficients:
             Feature Estimate   Std Error  T Value P Value
         (Intercept)   1.0466      0.3855   2.7150  0.0066
        engine_count  -0.6127      0.2744  -2.2328  0.0256
           job_hours  -0.0254      0.0609  -0.4160  0.6774
         hourly_cost  -0.0007      0.0010  -0.7110  0.4771
      callVec_Police  -0.9646      0.2259  -4.2704  0.0000
callVec_Person (r...   0.6687      1.5616   0.4282  0.6685
callVec_Person (l...   0.1335      0.0573   2.3311  0.0197
   callVec_Other Frs  -0.6723      0.3661  -1.8365  0.0663
   callVec_Not Known -24.9811 356123.9993  -0.0001  0.9999
  callVec_Coastguard -26.0473 356123.9993  -0.0001  0.9999
   callVec_Ambulance  -0.0667      1.4188  -0.0470  0.9625
propertyVec_Road ...   0.2030      0.1470   1.3807  0.1674
propertyVec_Outdo...  -1.4309      0.1159 -12.3498  0.0000
 propertyVec_Outdoor  -0.7453      0.0680 -10.9546  0.0000
propertyVec_Other...  -0.2720      0.4562  -0.5962  0.5511
propertyVec_Non R...  -0.9041      0.0922 

In [17]:

from pyspark.ml.stat import Correlation

# Select feature column vector
features_vector = rescue_cat_final.select("features")

# Generate correlation matrix
matrix = Correlation.corr(features_vector, "features").collect()[0][0]

# Convert matrix into a useful format
corr_matrix = matrix.toArray().tolist() 

# Get list of features to assign to matrix columns and indices
features = pd.DataFrame(merge_list)['name'].values.tolist()

# Final correlation matrix
corr_matrix_df = pd.DataFrame(data=corr_matrix, columns = features, index = features) 

corr_matrix_df



Unnamed: 0,engine_count,job_hours,hourly_cost,callVec_Person (mobile),callVec_Person (land Line),callVec_Police,callVec_Other Frs,callVec_Person (running Call),callVec_Ambulance,callVec_Not Known,propertyVec_Dwelling,propertyVec_Outdoor,propertyVec_Non Residential,propertyVec_Outdoor Structure,propertyVec_Road Vehicle,propertyVec_Other Residential,serviceVec_Other Animal Assistance,serviceVec_Animal Rescue From Height,serviceVec_Animal Rescue From Below Ground
engine_count,1.0,0.665043,-0.014307,0.049797,-0.002593,-0.042509,0.058198,-0.001834,-0.001834,-0.002593,0.010865,-0.028092,0.133052,-0.008213,-0.030875,-0.002593,0.235154,-0.069015,-0.029857
job_hours,0.665043,1.0,-0.002139,0.066634,0.022691,-0.048369,0.077129,0.035761,-0.003674,-0.005197,0.01276,-0.032097,0.159524,0.005622,-0.013365,0.022691,0.18222,-0.076956,-0.005555
hourly_cost,-0.014307,-0.002139,1.0,-0.091829,-0.018839,-0.161395,-0.056116,0.000235,-0.01332,-0.009254,0.038299,0.00197,-0.067821,-0.010384,0.019469,0.002249,-0.099933,-0.02582,-0.099316
callVec_Person (mobile),0.049797,0.066634,-0.091829,1.0,-0.002772,-0.139063,-0.013345,-0.00196,-0.00196,-0.002772,0.025817,0.005902,0.064985,0.011162,-0.016936,-0.002772,0.096359,-0.050171,-0.011778
callVec_Person (land Line),-0.002593,0.022691,-0.018839,-0.002772,1.0,-0.017127,-0.001644,-0.000241,-0.000241,-0.000341,-0.003632,-0.005258,0.009122,-0.001081,-0.006304,-0.000341,0.034749,-0.013927,-0.0062
callVec_Police,-0.042509,-0.048369,-0.161395,-0.139063,-0.017127,1.0,-0.082447,-0.012109,-0.012109,-0.017127,-0.064641,-0.053138,-0.028732,0.03966,0.034325,0.001404,-0.085306,0.130976,-0.018198
callVec_Other Frs,0.058198,0.077129,-0.056116,-0.013345,-0.001644,-0.082447,1.0,-0.001162,-0.001162,-0.001644,-0.017484,-0.025312,0.069788,-0.005205,0.013944,-0.001644,0.084906,-0.030837,-0.010611
callVec_Person (running Call),-0.001834,0.035761,0.000235,-0.00196,-0.000241,-0.012109,-0.001162,1.0,-0.000171,-0.000241,-0.002568,-0.003718,0.021019,-0.000765,-0.004457,-0.000241,0.052395,-0.009847,-0.004384
callVec_Ambulance,-0.001834,-0.003674,-0.01332,-0.00196,-0.000241,-0.012109,-0.001162,-0.000171,1.0,-0.000241,-0.002568,-0.003718,0.021019,-0.000765,-0.004457,-0.000241,-0.003257,-0.009847,-0.004384
callVec_Not Known,-0.002593,-0.005197,-0.009254,-0.002772,-0.000341,-0.017127,-0.001644,-0.000241,-0.000241,1.0,-0.003632,-0.005258,0.009122,-0.001081,-0.006304,-0.000341,-0.004607,-0.013927,0.024434


In [18]:

rescue_cat.select("job_hours", "hourly_cost", "total_cost").orderBy("job_hours", ascending=False).limit(30).toPandas()


Unnamed: 0,job_hours,hourly_cost,total_cost
0,12.0,326.0,3912.0
1,12.0,290.0,3480.0
2,10.0,298.0,2980.0
3,9.0,260.0,2340.0
4,9.0,260.0,2340.0
5,9.0,260.0,2340.0
6,9.0,295.0,2655.0
7,8.0,260.0,2080.0
8,8.0,333.0,2664.0
9,7.0,328.0,2296.0


In [19]:

from pyspark.ml import Pipeline

# Rename "is_cat" to "label" before setting up pipeline stages
rescue_cat_reindex = rescue_cat_reindex.withColumnRenamed("is_cat", "label")

# 1. Indexing the specialservicetypecategory column
serviceIdx = StringIndexer(inputCol='specialservicetypecategory',
                               outputCol='serviceIndex', 
                               stringOrderType = "alphabetDesc")

# 2. Indexing the originofcall column
callIdx = StringIndexer(inputCol='originofcall',
                               outputCol='callIndex',
                               stringOrderType = "alphabetDesc")

# 3. Indexing the propertycategory column
propertyIdx = StringIndexer(inputCol='propertycategory',
                               outputCol='propertyIndex', 
                               stringOrderType = "alphabetDesc")

# 4. One-hot encoding
encoder = OneHotEncoderEstimator(inputCols = ['serviceIndex', 'callIndex', 'propertyIndex'], 
                                 outputCols = ['serviceVec', 'callVec', 'propertyVec'])

# 5. Vector assembler
assembler = VectorAssembler(inputCols=['engine_count', 'hourly_cost', 
                                       'callVec', 'propertyVec', 'serviceVec'], 
                           outputCol = "features")

# 6. Regression model
glr = GeneralizedLinearRegression(family="binomial", link="logit")

# Creating the pipeline
pipe = Pipeline(stages=[serviceIdx, callIdx, propertyIdx,
                        encoder, assembler, glr])
                        
# View the pipeline stages
pipe.getStages()


[StringIndexer_345e883be5cb,
 StringIndexer_b5e8c260abe3,
 StringIndexer_d82908c4b741,
 OneHotEncoderEstimator_97de31f05d35,
 VectorAssembler_4832dfca75f0,
 GeneralizedLinearRegression_5c5176783b95]

In [20]:

fit_model = pipe.fit(rescue_cat_reindex)

# Save model results
results = fit_model.transform(rescue_cat_reindex)
  
# Showing the results
results.show()


+---------------+------------+---------+-----------+----------+-------------------+-----------------+--------------------------+-----------------+-----+------------+---------+-------------+-------------+-------------+-------------+--------------------+-------------------+
| typeofincident|engine_count|job_hours|hourly_cost|total_cost|       originofcall| propertycategory|specialservicetypecategory|incident_duration|label|serviceIndex|callIndex|propertyIndex|   serviceVec|      callVec|  propertyVec|            features|         prediction|
+---------------+------------+---------+-----------+----------+-------------------+-----------------+--------------------------+-----------------+-----+------------+---------+-------------+-------------+-------------+-------------+--------------------+-------------------+
|Special Service|         1.0|      2.0|      255.0|     510.0| Person (land Line)|     000_Dwelling|      000_Other Animal ...|              2.0|    0|         3.0|      2.0|      

In [21]:

# Get coefficients summary table
summary = fit_model.stages[-1].summary

summary


Coefficients:
             Feature Estimate   Std Error  T Value P Value
         (Intercept)   1.0885      0.3722   2.9246  0.0034
        engine_count  -0.6804      0.2214  -3.0729  0.0021
         hourly_cost  -0.0007      0.0010  -0.7175  0.4730
      callVec_Police  -0.9668      0.2257  -4.2829  0.0000
callVec_Person (r...   0.6470      1.5530   0.4166  0.6770
callVec_Person (l...   0.1338      0.0572   2.3375  0.0194
   callVec_Other Frs  -0.6789      0.3657  -1.8565  0.0634
   callVec_Not Known -25.0264 356123.9993  -0.0001  0.9999
  callVec_Coastguard -26.0423 356123.9993  -0.0001  0.9999
   callVec_Ambulance  -0.0626      1.4188  -0.0441  0.9648
propertyVec_Road ...   0.2017      0.1470   1.3720  0.1701
propertyVec_Outdo...  -1.4310      0.1159 -12.3504  0.0000
 propertyVec_Outdoor  -0.7482      0.0677 -11.0556  0.0000
propertyVec_Other...  -0.2765      0.4561  -0.6062  0.5444
propertyVec_Non R...  -0.9055      0.0921  -9.8298  0.0000
    propertyVec_Boat   0.7267      1.4248 

In [22]:

# Save pipeline

pipe.write().overwrite().save("rescue_pipeline")

# Save the pipeline model

fit_model.write().overwrite().save("rescue_model")



In [23]:

# Load saved pipeline
reloaded_pipeline = Pipeline.load("rescue_pipeline")

# Re-fit to a subset of rescue data as an example of how pipelines can be re-used
new_model = reloaded_pipeline.fit(rescue_cat_reindex.sample(withReplacement=None,
                      fraction=0.1, seed = 99))
                      
# View new model summary
new_model.stages[-1].summary


Coefficients:
             Feature Estimate   Std Error T Value P Value
         (Intercept)   2.9173      1.2388  2.3549  0.0185
        engine_count  -0.7009      0.7351 -0.9534  0.3404
         hourly_cost  -0.0063      0.0033 -1.9205  0.0548
      callVec_Police  -0.4627      0.5684 -0.8140  0.4156
callVec_Person (l...   0.3852      0.1876  2.0531  0.0401
   callVec_Other Frs  -0.6854      0.7712 -0.8887  0.3742
propertyVec_Road ...   0.2272      0.5093  0.4460  0.6556
propertyVec_Outdo...  -1.3168      0.3752 -3.5100  0.0004
 propertyVec_Outdoor  -0.8949      0.2167 -4.1300  0.0000
propertyVec_Other...  25.6083 356121.6985  0.0001  0.9999
propertyVec_Non R...  -1.2404      0.3110 -3.9886  0.0001
serviceVec_Animal...  -1.1309      0.5072 -2.2296  0.0258
serviceVec_Animal...   0.2916      0.1982  1.4714  0.1412
serviceVec_Animal...  -0.0509      0.2914 -0.1748  0.8613

(Dispersion parameter for binomial family taken to be 1.0000)
    Null deviance: 816.4533 on 577 degrees of freedom

In [24]:

# Close the spark session
spark.stop()
