# Random Forest

In [1]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import IndexToString, Normalizer, StringIndexer, VectorAssembler, VectorIndexer, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import RegressionEvaluator
from helpers.helper_functions import translate_to_file_string
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics, RegressionMetrics
from pyspark.mllib.util import MLUtils
import pandas as pd
from IPython.display import display, HTML

#Change Column Names
def delete_space(df):
    names = df.schema.names
    for name in names:
        newName = name.replace(" ","")
        df = df.withColumnRenamed(name, newName)
    return df

In [2]:
inputFile = translate_to_file_string("../data/data.csv")

Spark session creation 

In [3]:
spark = (SparkSession
       .builder
       .appName("Modell_LogisticalRegression")
       .getOrCreate())

DataFrame creation using an ifered Schema 

In [4]:
# create a DataFrame using an ifered Schema 
df = spark.read.option("header", "true") \
       .option("inferSchema", "true") \
       .option("delimiter", ";") \
       .csv(inputFile) 

df = delete_space(df)
df_orig = df
df = df.where("MonthlyCharges Between 22 AND 95")
df = df.where("TotalCharges IS NOT NULL")

## Prepare training and test data.

### Use this for Pandas Dataframe

In [5]:

#Create Pandas DataFrame
df_pandas = df.toPandas()
df_pandas_cat = df.toPandas()
#Pandas Indexing Method to Integer or Category Datatype
pandasCol = list(df_pandas)
for col in pandasCol:
    if df_pandas[col].dtypes=='object':
        if not col == "Contract":
            #Categorize
            df_pandas_cat[col]= pd.Categorical(pd.factorize(df_pandas_cat[col])[0])
            #ToInteger
            df_pandas[col]= pd.factorize(df_pandas[col])[0]

#Define whicht Columns should be normalized
newCols = []
for col in pandasCol:
    if not col == "Contract":
        newCols.append(col)

#Normalize the selected Columns
df_pandas[newCols]=(df_pandas[newCols]-df_pandas[newCols].min())/(df_pandas[newCols].max()-df_pandas[newCols].min())



#Write Pandas Dataframe Back to Spark Dataframe
df_temp = spark.createDataFrame(df_pandas)
df = df_temp
df.printSchema()
# Create Indexer for Contract (Still needed)
contractIndexer = StringIndexer().setInputCol("Contract").setOutputCol("Contract_Int").fit(df)

#Create FeatureCols 
featureCols = df.columns.copy()
featureCols.remove("Contract")
featureCols.remove("CustomerID")


root
 |-- CustomerID: double (nullable = true)
 |-- Gender: double (nullable = true)
 |-- SeniorCitizen: double (nullable = true)
 |-- Partner: double (nullable = true)
 |-- Dependents: double (nullable = true)
 |-- Tenure: double (nullable = true)
 |-- PhoneService: double (nullable = true)
 |-- MultipleLines: double (nullable = true)
 |-- InternetService: double (nullable = true)
 |-- OnlineSecurity: double (nullable = true)
 |-- OnlineBackup: double (nullable = true)
 |-- DeviceProtection: double (nullable = true)
 |-- TechSupport: double (nullable = true)
 |-- StreamingTV: double (nullable = true)
 |-- StreamingMovies: double (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: double (nullable = true)
 |-- PaymentMethod: double (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: double (nullable = true)



### Create Indexer

In [6]:
#Comment the Following if Pandas-Dataset is used
# IDIndexer = StringIndexer().setInputCol("CustomerID").setOutputCol("CustomerID_Int").fit(df)
# genderIndexer = StringIndexer().setInputCol("Gender").setOutputCol("Gender_Int").fit(df)
# seniorIndexer = StringIndexer().setInputCol("SeniorCitizen").setOutputCol("SeniorCitizen_Int").fit(df)
# partnerIndexer = StringIndexer().setInputCol("Partner").setOutputCol("Partner_Int").fit(df)
# DependentsIndexer = StringIndexer().setInputCol("Dependents").setOutputCol("Dependents_Int").fit(df)
# tenureIndexer = StringIndexer().setInputCol("Tenure").setOutputCol("Tenure_Int").fit(df)
# phoneIndexer = StringIndexer().setInputCol("PhoneService").setOutputCol("PhoneService_Int").fit(df)
# multipleIndexer = StringIndexer().setInputCol("MultipleLines").setOutputCol("MultipleLines_Int").fit(df)
# internetIndexer = StringIndexer().setInputCol("InternetService").setOutputCol("InternetService_Int").fit(df)
# onlineSecurityIndexer = StringIndexer().setInputCol("OnlineSecurity").setOutputCol("OnlineSecurity_Int").fit(df)
# onlineBackupIndexer = StringIndexer().setInputCol("OnlineBackup").setOutputCol("OnlineBackup_Int").fit(df)
# deviceIndexer = StringIndexer().setInputCol("DeviceProtection").setOutputCol("DeviceProtection_Int").fit(df)
# techIndexer = StringIndexer().setInputCol("TechSupport").setOutputCol("TechSupport_Int").fit(df)
# streamingTVIndexer = StringIndexer().setInputCol("StreamingTV").setOutputCol("StreamingTV_Int").fit(df)
# streamingMoviesIndexer = StringIndexer().setInputCol("StreamingMovies").setOutputCol("StreamingMovies_Int").fit(df)
# contractIndexer = StringIndexer().setInputCol("Contract").setOutputCol("Contract_Int").fit(df)
# paperlessIndexer = StringIndexer().setInputCol("PaperlessBilling").setOutputCol("PaperlessBilling_Int").fit(df)
# paymentIndexer = StringIndexer().setInputCol("PaymentMethod").setOutputCol("PaymentMethod_Int").fit(df)
# monthlyIndexer = StringIndexer().setInputCol("MonthlyCharges").setOutputCol("MonthlyCharges_Int").fit(df)
# totalIndexer = StringIndexer().setInputCol("TotalCharges").setOutputCol("TotalCharges_Int").fit(df)

In [7]:
# #Comment this if Pandas Dataset is used
# featureCols = df.columns.copy()
# for col in featureCols:
#     if not col == "Tenure" and not col == "MonthlyCharges" and not col == "TotalCharges":
#         featureCols.remove(col)
#         colname = col +"_Int"
#         featureCols = featureCols + [colname]
#     else:
#         featureCols.remove(col)
#         featureCols = featureCols + [col]

# featureCols.remove("Contract_Int")
# featureCols.remove("CustomerID_Int")
# featureCols.remove("Gender")
# featureCols = featureCols + ["Gender_Int"]

In [8]:
assembler =  VectorAssembler(outputCol="features", inputCols=list(featureCols))
#, handleInvalid="keep"

featureIndexer = VectorIndexer(inputCol="features",outputCol="indexedFeatures", maxCategories=6) 
#, handleInvalid="keep"

predConverter = IndexToString(inputCol="prediction",outputCol="predictedLabel",labels=contractIndexer.labels)

scaler = StandardScaler(inputCol="indexedFeatures", outputCol="scaledFeatures",withStd=True, withMean=False)

## Logistic Regression

In [9]:
lr = LogisticRegression(featuresCol="scaledFeatures", labelCol="Contract_Int")
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [100, 120 , 150])\
                                 .addGrid(lr.regParam, [0.1, 0.3, 0.5]) \
                                 .addGrid(lr.standardization, [True, False]) \
                                 .addGrid(lr.elasticNetParam, [0, 1]) \
                                 .build()

### Create Train and Test Datasets

In [10]:
splits = df.randomSplit([0.9, 0.1 ], 12345)
train = splits[0]
test = splits[1]

### Build the Pipeline

In [11]:
#Use This for Pandas-Dataframe
pipeline = Pipeline(stages= [contractIndexer, assembler, featureIndexer, scaler, lr, predConverter])

In [12]:
##Use This for Spark Dataframe
# pipeline = Pipeline(stages= [genderIndexer, seniorIndexer, partnerIndexer, DependentsIndexer, phoneIndexer, multipleIndexer, internetIndexer, onlineSecurityIndexer, onlineBackupIndexer, deviceIndexer, techIndexer, streamingTVIndexer, streamingMoviesIndexer, contractIndexer, paperlessIndexer, paymentIndexer, assembler, featureIndexer, scaler, lr, predConverter])

### Build the Evaluator and Cross Validator

In [13]:
evaluator =  RegressionEvaluator(labelCol="Contract_Int", metricName="rmse")

cv = CrossValidator(estimator=pipeline,evaluator=evaluator,estimatorParamMaps=paramGrid,numFolds=3, parallelism=2)

### Train the Model

In [14]:
cvModel = cv.fit(train)

In [15]:
#stages[19] for Spark Dataframe ;  stages[4] for Pandas Dataframe 
treeModel = cvModel.bestModel.stages[4]
print("Learned classification tree model:\n",treeModel)
print("Best Params: \n", treeModel.explainParams())

Learned classification tree model:
 LogisticRegressionModel: uid=LogisticRegression_66bb12f0a9cd, numClasses=3, numFeatures=18
Best Params: 
 aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: scaledFeatures)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: Contract_Int)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes,

In [16]:
predictions = cvModel.transform(test)
predictions.select("prediction", "Contract_Int", "predictedLabel", "Contract", "features").show()

+----------+------------+--------------+--------------+--------------------+
|prediction|Contract_Int|predictedLabel|      Contract|            features|
+----------+------------+--------------+--------------+--------------------+
|       1.0|         1.0|      Two year|      Two year|[0.0,0.0,1.0,0.0,...|
|       0.0|         0.0|Month-to-month|Month-to-month|(18,[4,5,6,8,11,1...|
|       0.0|         0.0|Month-to-month|Month-to-month|(18,[0,2,4,5,6,7,...|
|       0.0|         2.0|Month-to-month|      One year|(18,[2,4,5,6,9,13...|
|       1.0|         2.0|      Two year|      One year|[0.0,0.0,0.0,1.0,...|
|       0.0|         2.0|Month-to-month|      One year|[1.0,0.0,1.0,1.0,...|
|       0.0|         0.0|Month-to-month|Month-to-month|[1.0,0.0,1.0,0.0,...|
|       0.0|         0.0|Month-to-month|Month-to-month|(18,[2,4,5,6,7,9,...|
|       0.0|         0.0|Month-to-month|Month-to-month|(18,[9,16,17],[0....|
|       0.0|         0.0|Month-to-month|Month-to-month|(18,[4,5,6,14,15,...|

In [17]:
new_df = predictions.groupBy("prediction").count()
new_df.show()
new_train_df = train.groupBy("Contract").count()
new_train_df.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|  305|
|       1.0|  115|
|       2.0|   10|
+----------+-----+

+--------------+-----+
|      Contract|count|
+--------------+-----+
|Month-to-month| 2348|
|      One year|  674|
|      Two year|  752|
+--------------+-----+



In [18]:
accuracy = evaluator.evaluate(predictions)
print("Test Error = " ,(1.0 - accuracy))

Test Error =  0.2782450328829461


In [22]:
predictionAndLabels = predictions.select("prediction", "Contract_Int").rdd.map(lambda p: [p[0], p[1]]) # Map to RDD prediction|label
metrics =  MulticlassMetrics(predictionAndLabels)
metrics_regression = RegressionMetrics(predictionAndLabels)

In [23]:
confusion = metrics.confusionMatrix()


In [24]:
labels = predictionAndLabels.map(lambda x: x[1]).distinct().collect()
for label in  labels:
  print("Class %f precision = %f\n" % (label , metrics.precision(label)))
  print("Class %f recall = %f\n" % (label, metrics.recall(label)))
  print("Class %f F1 score = %f\n" % (label, metrics.fMeasure( label)))


Class 0.000000 precision = 0.822951

Class 0.000000 recall = 0.976654

Class 0.000000 F1 score = 0.893238

Class 1.000000 precision = 0.739130

Class 1.000000 recall = 0.858586

Class 1.000000 F1 score = 0.794393

Class 2.000000 precision = 0.500000

Class 2.000000 recall = 0.067568

Class 2.000000 F1 score = 0.119048



In [25]:
print("Weighted precision = %s\n" % metrics.weightedPrecision)
print("Weighted recall = %s\n" % metrics.weightedRecall)
print("Weighted F1 score = %s\n" % metrics.weightedFMeasure)
print("Weighted false positive rate = %s\n" % metrics.weightedFalsePositiveRate)

Weighted precision = 0.7480750551144557

Weighted recall = 0.7930232558139534

Weighted F1 score = <bound method MulticlassMetrics.weightedFMeasure of <pyspark.mllib.evaluation.MulticlassMetrics object at 0x7eff35f4dfd0>>

Weighted false positive rate = 0.20984136904549094



In [26]:
print("Recall = %s" % metrics.recall(1.0))
print("Precision = %s" % metrics.precision(1.0))
print("Accuracy = %s" % metrics.accuracy) 
print("F1 = %s" % metrics.fMeasure(1.0))

Recall = 0.8585858585858586
Precision = 0.7391304347826086
Accuracy = 0.7930232558139535
F1 = 0.7943925233644858


In [33]:
print("Variance = %s" % metrics_regression.explainedVariance)
print("Absolute Error = %s" % metrics_regression.meanAbsoluteError)
print("Squared Error = %s" % metrics_regression.meanSquaredError)
print("R2 = %s" % metrics_regression.r2)
print("Root Mean Squared Error = %s" % metrics_regression.rootMeanSquaredError)

Variance = 0.3297404002163331
Absolute Error = 0.3116279069767442
Squared Error = 0.5209302325581395
R2 = 0.11503936935529824
Root Mean Squared Error = 0.7217549671170539


In [34]:
spark.stop()