# Random Forest

In [389]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from helpers.helper_functions import translate_to_file_string

import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

#Change Column Names
def delete_space(df):
    """Return ``df`` with every space character removed from its column names.

    The column names are read once from ``df.schema.names``; each one is then
    renamed through ``withColumnRenamed`` to the same name without spaces.

    Args:
        df: DataFrame-like object exposing ``schema.names`` and
            ``withColumnRenamed(old, new)``.

    Returns:
        The DataFrame produced by the chain of renames.
    """
    renamed = df
    for original in df.schema.names:
        renamed = renamed.withColumnRenamed(original, original.replace(" ", ""))
    return renamed

In [390]:
# Location of the input CSV, relative to the notebook.
DATA_CSV_PATH = "../data/data.csv"
# NOTE(review): translate_to_file_string comes from helpers.helper_functions;
# presumably it turns the relative path into a string Spark can read from — confirm.
inputFile = translate_to_file_string(DATA_CSV_PATH)

Spark session creation 

In [391]:
# Obtain a Spark session registered under this notebook's application name.
spark = SparkSession.builder.appName("Modell_randomForest").getOrCreate()

DataFrame creation using an inferred schema

In [392]:
# Read the CSV with a header row and ";" as delimiter, letting Spark infer the column types.
df = (
    spark.read
    .options(header="true", inferSchema="true", delimiter=";")
    .csv(inputFile)
)

Prepare training and test data.

In [393]:
# Strip spaces from the column names before any column is referenced by name.
df = delete_space(df)
df.printSchema()

# Convert to pandas so that the string columns can be integer-encoded.
df_pandas = df.toPandas()
pandasCol = list(df_pandas)
for column in pandasCol:
    if df_pandas[column].dtypes == 'object':
        # pd.factorize assigns one integer code per distinct value
        # (missing values are coded as -1 by default).
        df_pandas[column] = pd.factorize(df_pandas[column])[0]

df_pandas.info()

# A bare HTML(...) expression is only rendered when it is the last statement of a
# cell; in the middle of a cell it must be handed to display() to show up.
display(HTML(df_pandas.head(5).to_html()))

# Back to a Spark DataFrame, now with numeric columns only.
df2 = spark.createDataFrame(df_pandas)

# Every column except the identifier and "Contract" (used as labelCol by the
# classifier) becomes a feature.
featureCols = df2.columns.copy()
featureCols.remove("CustomerID")
featureCols.remove("Contract")
print(featureCols)

assembler = VectorAssembler(outputCol="features", inputCols=featureCols)

# Keep rows containing null/NaN feature values instead of raising an error.
assembler.setHandleInvalid("keep")

root
 |-- CustomerID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: double (nullable = true)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6499 entries, 0 to 6498
Data columns (total 20 columns):
 #   Column

VectorAssembler_609d4e938064

In [394]:
# Append the assembled "features" vector column to the encoded data.
labledPointDataSet = assembler.transform(df2)

# 90 % / 10 % random split with a fixed seed; first part trains, second part tests.
splits = labledPointDataSet.randomSplit(weights=[0.9, 0.1], seed=12345)
training, test = splits

Random Forest Classifier

In [395]:
# Baseline with default hyperparameters (reported "Test Error" ~0.175):
# rf = RandomForestClassifier(labelCol="Contract", featuresCol="features")

# Tuned configuration (reported "Test Error" ~0.0855).
rf = RandomForestClassifier(
    labelCol="Contract",
    featuresCol="features",
    minInstancesPerNode=1350,
    featureSubsetStrategy='sqrt',
    subsamplingRate=1,
    seed=12345,
    numTrees=850,
)

Train the model 

In [396]:
# Fit the random forest on the training split.
rfModel = rf.fit(dataset=training)

Test the model

In [397]:
# Score the held-out split; transform() adds the rawPrediction, probability and
# prediction columns seen in the output table.
predictions = rfModel.transform(dataset=test)
predictions.show(n=20, truncate=True)

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+--------------------+--------------------+--------------------+----------+
|CustomerID|Gender|SeniorCitizen|Partner|Dependents|Tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|            features|       rawPrediction|         probability|prediction|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+--------------------+--------------------+--------------------+----------+
|        2

In [398]:
# With metricName="areaUnderROC" the evaluator returns the area under the ROC
# curve computed from rawPrediction; this is a ranking metric, not an accuracy.
# NOTE(review): BinaryClassificationEvaluator expects a two-class label; confirm
# that the factorized "Contract" column only contains the codes 0 and 1.
evaluator = BinaryClassificationEvaluator(labelCol="Contract",rawPredictionCol="rawPrediction", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)
print("Area under ROC", area_under_roc)
# 1 - AUC is kept so the number stays comparable with earlier runs; it is not
# the share of misclassified rows.
print("Test Error (1 - AUC)", (1.0 - area_under_roc))

Test Error 0.08550441453177005


In [399]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()