# Random Forest

In [89]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import IndexToString, Normalizer, StringIndexer, VectorAssembler, VectorIndexer, StandardScaler
from pyspark.ml.classification import RandomForestClassifier, LinearSVC, OneVsRest, LogisticRegression, MultilayerPerceptronClassifier
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator
from helpers.helper_functions import translate_to_file_string
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.regression import LinearRegression


import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

#Change Column Names
def delete_space(df):
    names = df.schema.names
    for name in names:
        newName = name.replace(" ","")
        df = df.withColumnRenamed(name, newName)
    return df

In [90]:
inputFile = translate_to_file_string("../data/data.csv")

Spark session creation 

In [91]:
spark = (SparkSession
       .builder
       .appName("Modell_KNN")
       .getOrCreate())

DataFrame creation using an ifered Schema 

In [92]:
# create a DataFrame using an ifered Schema 
df = spark.read.option("header", "true") \
       .option("inferSchema", "true") \
       .option("delimiter", ";") \
       .csv(inputFile) 

Prepare training and test data.

In [93]:
df = delete_space(df)
df.printSchema()

df_orig = df

# #Create Pandas DataFrame
# df_pandas = df.toPandas()
# pandasCol = list(df_pandas)
# for col in pandasCol:
#     if df_pandas[col].dtypes=='object':
#         #ToInteger
#         df_pandas[col]= pd.factorize(df_pandas[col])[0]



# newCols = []
# for col in pandasCol:
#     if not col == "Tenure" and not col == "MonthlyCharges" and not col == "TotalCharges" and not col == "Contract":
#         newCols.append(col)

# df_pandas[newCols]=(df_pandas[newCols]-df_pandas[newCols].min())/(df_pandas[newCols].max()-df_pandas[newCols].min())
# df_pandas.info()

# HTML(df_pandas.head(5).to_html())

# df2 = spark.createDataFrame(df_pandas)

# featureCols = df2.columns.copy()
# featureCols.remove("CustomerID")
# featureCols.remove("Contract")
# print(featureCols)

# assembler =  VectorAssembler(outputCol="features", inputCols=featureCols)

# #Keep Nullvalues 
# assembler.setHandleInvalid("keep")

root
 |-- CustomerID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: double (nullable = true)



In [95]:
df = df.where("TotalCharges IS NOT NULL")
df = df.where("MonthlyCharges Between 22 AND 95")

In [96]:
IDIndexer = StringIndexer().setInputCol("CustomerID").setOutputCol("CustomerID_Int").fit(df)
genderIndexer = StringIndexer().setInputCol("Gender").setOutputCol("Gender_Int").fit(df)
seniorIndexer = StringIndexer().setInputCol("SeniorCitizen").setOutputCol("SeniorCitizen_Int").fit(df)
partnerIndexer = StringIndexer().setInputCol("Partner").setOutputCol("Partner_Int").fit(df)
DependentsIndexer = StringIndexer().setInputCol("Dependents").setOutputCol("Dependents_Int").fit(df)
tenureIndexer = StringIndexer().setInputCol("Tenure").setOutputCol("Tenure_Int").fit(df)
phoneIndexer = StringIndexer().setInputCol("PhoneService").setOutputCol("PhoneService_Int").fit(df)
multipleIndexer = StringIndexer().setInputCol("MultipleLines").setOutputCol("MultipleLines_Int").fit(df)
internetIndexer = StringIndexer().setInputCol("InternetService").setOutputCol("InternetService_Int").fit(df)
onlineSecurityIndexer = StringIndexer().setInputCol("OnlineSecurity").setOutputCol("OnlineSecurity_Int").fit(df)
onlineBackupIndexer = StringIndexer().setInputCol("OnlineBackup").setOutputCol("OnlineBackup_Int").fit(df)
deviceIndexer = StringIndexer().setInputCol("DeviceProtection").setOutputCol("DeviceProtection_Int").fit(df)
techIndexer = StringIndexer().setInputCol("TechSupport").setOutputCol("TechSupport_Int").fit(df)
streamingTVIndexer = StringIndexer().setInputCol("StreamingTV").setOutputCol("StreamingTV_Int").fit(df)
streamingMoviesIndexer = StringIndexer().setInputCol("StreamingMovies").setOutputCol("StreamingMovies_Int").fit(df)
contractIndexer = StringIndexer().setInputCol("Contract").setOutputCol("Contract_Int").fit(df)
paperlessIndexer = StringIndexer().setInputCol("PaperlessBilling").setOutputCol("PaperlessBilling_Int").fit(df)
paymentIndexer = StringIndexer().setInputCol("PaymentMethod").setOutputCol("PaymentMethod_Int").fit(df)
monthlyIndexer = StringIndexer().setInputCol("MonthlyCharges").setOutputCol("MonthlyCharges_Int").fit(df)
totalIndexer = StringIndexer().setInputCol("TotalCharges").setOutputCol("TotalCharges_Int").fit(df)

In [97]:
featureCols = df.columns.copy()
for col in featureCols:
    if not col == "Tenure" and not col == "MonthlyCharges" and not col == "TotalCharges":
        featureCols.remove(col)
        colname = col +"_Int"
        featureCols = featureCols + [colname]
    else:
        featureCols.remove(col)
        featureCols = featureCols + [col]

featureCols.remove("Contract_Int")
featureCols.remove("CustomerID_Int")
featureCols.remove("Gender")
featureCols = featureCols + ["Gender_Int"]

CustomerID
SeniorCitizen
Partner
Dependents
Tenure
PhoneService
MultipleLines
InternetService
OnlineSecurity
OnlineBackup
DeviceProtection
TechSupport
StreamingTV
StreamingMovies
Contract
PaperlessBilling
PaymentMethod
MonthlyCharges
TotalCharges
['SeniorCitizen_Int', 'Partner_Int', 'Dependents_Int', 'Tenure', 'PhoneService_Int', 'MultipleLines_Int', 'InternetService_Int', 'OnlineSecurity_Int', 'OnlineBackup_Int', 'DeviceProtection_Int', 'TechSupport_Int', 'StreamingTV_Int', 'StreamingMovies_Int', 'PaperlessBilling_Int', 'PaymentMethod_Int', 'MonthlyCharges', 'TotalCharges', 'Gender_Int']


In [98]:
assembler =  VectorAssembler(outputCol="features", inputCols=list(featureCols))
#, handleInvalid="keep"

In [99]:
featureIndexer = VectorIndexer(inputCol="features",outputCol="indexedFeatures", maxCategories=6)
#handleInvalid="keep" 

In [100]:
predConverter = IndexToString(inputCol="prediction",outputCol="predictedLabel",labels=contractIndexer.labels)

In [101]:
scaler = StandardScaler(inputCol="indexedFeatures", outputCol="scaledFeatures",withStd=True, withMean=False)

## Decision Tree

In [103]:
# dt = DecisionTreeClassifier(labelCol="Contract_Int", featuresCol="features")
# paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [ 10, 15 , 20 ]) \
# 				              .addGrid(dt.minInfoGain, [ 0.02, 0.01, 0.001]) \
# 				              .addGrid(dt.minInstancesPerNode, [5, 10, 15]) \
#                               .addGrid(dt.maxBins, [5, 6, 9]) \
# 				              .build()


## Random Forest

In [104]:
# dt = RandomForestClassifier(labelCol="Contract_Int", featuresCol="features", seed=12345)
# paramGrid = ParamGridBuilder().addGrid(dt.subsamplingRate, [ 1 ]) \
#                 .addGrid(dt.featureSubsetStrategy, [ 'sqrt' ]) \
#                 .addGrid(dt.numTrees, [50]) \
#                 .addGrid(dt.minInstancesPerNode, [10]) \
#                 .build()            
				              
#minInstancesPerNode=1350, featureSubsetStrategy='sqrt', subsamplingRate=1, seed= 12345, numTrees=850)

### SVM -> geht nicht (Da nur 2 Klassen)

In [105]:
# lsvc = LinearSVC(labelCol="Contract_Int",aggregationDepth=2, featuresCol="features" ) 
# #lsvc = LinearSVC(aggregationDepth=2) 
# paramGrid = ParamGridBuilder().addGrid(lsvc.maxIter, [50])\
#                                  .addGrid(lsvc.regParam, [0.5]) \
#                                  .addGrid(lsvc.standardization, [True]) \
#                                  .build()
# dt = OneVsRest(classifier=lsvc, labelCol="Contract_Int", featuresCol="features", rawPredictionCol="rawPrediction")

## Logistic Regression

In [106]:
# dt = LogisticRegression(featuresCol="features", labelCol="Contract_Int")
# paramGrid = ParamGridBuilder().addGrid(dt.maxIter, [100, 120 , 150])\
#                                  .addGrid(dt.regParam, [0.1, 0.3, 0.5]) \
#                                  .addGrid(dt.standardization, [True, False]) \
#                                  .addGrid(dt.elasticNetParam, [0, 1]) \
#                                  .build()

## Linear Regression

In [107]:
# lr = LinearRegression(featuresCol="features", labelCol="Contract_Int")
# paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [100, 120 , 150])\
#                                  .addGrid(lr.regParam, [0.1, 0.3, 0.5]) \
#                                  .addGrid(lr.standardization, [True, False]) \
#                                  .addGrid(lr.elasticNetParam, [0, 1]) \
#                                  .build()
# dt = OneVsRest(classifier=lr, labelCol="Contract_Int", featuresCol="features", rawPredictionCol="rawPrediction")

## KNN

In [108]:
dt = MultilayerPerceptronClassifier(featuresCol="scaledFeatures", labelCol="Contract_Int")   
paramGrid =  ParamGridBuilder().addGrid(dt.layers, [[ 18, 10, 5, 3 ]]) \
				.addGrid(dt.blockSize,  [128 ]) \
                .addGrid(dt.maxIter,[ 100, 1000 ] )\
				.addGrid(dt.stepSize, [0.003, 0.03, 0.3 ])\
				.addGrid(dt.tol, [ 0.05, 0.1, 0.2 ]) \
				.build()

In [109]:
splits = df.randomSplit([0.9, 0.1 ], 12345)
train = splits[0]
test = splits[1]

In [110]:
pipeline = Pipeline(stages= [genderIndexer, seniorIndexer, partnerIndexer, DependentsIndexer, phoneIndexer, multipleIndexer, internetIndexer, onlineSecurityIndexer, onlineBackupIndexer, deviceIndexer, techIndexer, streamingTVIndexer, streamingMoviesIndexer, contractIndexer, paperlessIndexer, paymentIndexer, assembler, featureIndexer, scaler, dt, predConverter])

In [111]:
#evaluator =  BinaryClassificationEvaluator(labelCol="Contract_Int", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
evaluator =  MulticlassClassificationEvaluator(labelCol="Contract_Int", metricName="f1")
#evaluator =  RegressionEvaluator(labelCol="Contract_Int", metricName="rmse")

In [112]:
cv = CrossValidator(estimator=pipeline,evaluator=evaluator,estimatorParamMaps=paramGrid,numFolds=10, parallelism=2)

In [113]:
cvModel = cv.fit(train)

In [114]:
treeModel = cvModel.bestModel.stages[19]
print("Learned classification tree model:\n",treeModel)
print("Best Params: \n", treeModel.explainParams())

Learned classification tree model:
 MultilayerPerceptronClassificationModel: uid=MultilayerPerceptronClassifier_795f43a160bb, numLayers=4, numClasses=3, numFeatures=18
Best Params: 
 blockSize: block size for stacking input data in matrices. Data is stacked within partitions. If block size is more than remaining data in a partition then it is adjusted to the size of this data. (default: 128, current: 128)
featuresCol: features column name. (default: features, current: scaledFeatures)
initialWeights: The initial weights of the model. (undefined)
labelCol: label column name. (default: label, current: Contract_Int)
layers: Sizes of layers from input layer to output layer E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 neurons and output layer of 10 neurons. (current: [18, 10, 5, 3])
maxIter: max number of iterations (>= 0). (default: 100, current: 100)
predictionCol: prediction column name. (default: prediction)
probabilityCol: Column name for predicted class conditi

In [115]:
predictions = cvModel.transform(test)
predictions.select("prediction", "Contract_Int", "predictedLabel", "Contract", "features", "scaledFeatures").show()

+----------+------------+--------------+--------------+--------------------+--------------------+
|prediction|Contract_Int|predictedLabel|      Contract|            features|      scaledFeatures|
+----------+------------+--------------+--------------+--------------------+--------------------+
|       2.0|         2.0|      One year|      One year|[0.0,0.0,0.0,27.0...|[0.0,0.0,0.0,1.11...|
|       1.0|         1.0|      Two year|      Two year|[0.0,1.0,1.0,25.0...|[0.0,2.0066441334...|
|       0.0|         0.0|Month-to-month|Month-to-month|(18,[3,5,8,9,12,1...|(18,[3,5,8,9,12,1...|
|       1.0|         0.0|      Two year|Month-to-month|(18,[1,3,6,9,11,1...|(18,[1,3,6,9,11,1...|
|       2.0|         2.0|      One year|      One year|[1.0,0.0,0.0,44.0...|[2.66845817562711...|
|       2.0|         1.0|      One year|      Two year|[0.0,1.0,1.0,48.0...|[0.0,2.0066441334...|
|       0.0|         0.0|Month-to-month|Month-to-month|(18,[1,2,3,6,12,1...|(18,[1,2,3,6,12,1...|
|       0.0|        

In [116]:
new_df = predictions.groupBy("prediction").count()
new_df.show()
new_train_df = train.groupBy("Contract").count()
new_train_df.show()
predictions.printSchema()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|  255|
|       1.0|   84|
|       2.0|   53|
+----------+-----+

+--------------+-----+
|      Contract|count|
+--------------+-----+
|Month-to-month| 2369|
|      One year|  665|
|      Two year|  778|
+--------------+-----+

root
 |-- CustomerID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- P

In [117]:
accuracy = evaluator.evaluate(predictions)
print("Test Error = " ,(1.0 - accuracy))

Test Error =  0.24070009196356956


In [118]:
predictionAndLabels = predictions.select("prediction", "Contract_Int").rdd.map(lambda p: [p[0], p[1]]) # Map to RDD prediction|label
metrics =  MulticlassMetrics(predictionAndLabels)

In [119]:
confusion = metrics.confusionMatrix()
print("Confusion matrix: \n" , confusion)

Confusion matrix: 
 DenseMatrix([[217.,   4.,  15.],
             [  4.,  59.,  10.],
             [ 34.,  21.,  28.]])


In [120]:
labels = predictionAndLabels.map(lambda x: x[1]).distinct().collect()
for label in  labels:
  print("Class %f precision = %f\n" % (label , metrics.precision(label)))
  print("Class %f recall = %f\n" % (label, metrics.recall(label)))
  print("Class %f F1 score = %f\n" % (label, metrics.fMeasure( label)))


Class 2.000000 precision = 0.528302

Class 2.000000 recall = 0.337349

Class 2.000000 F1 score = 0.411765

Class 1.000000 precision = 0.702381

Class 1.000000 recall = 0.808219

Class 1.000000 F1 score = 0.751592

Class 0.000000 precision = 0.850980

Class 0.000000 recall = 0.919492

Class 0.000000 F1 score = 0.883910



In [121]:
print("Weighted precision = %s\n" % metrics.weightedPrecision)
print("Weighted recall = %s\n" % metrics.weightedRecall)
print("Weighted F1 score = %s\n" % metrics.weightedFMeasure)
print("Weighted false positive rate = %s\n" % metrics.weightedFalsePositiveRate)

Weighted precision = 0.7549853027464355

Weighted recall = 0.7755102040816326

Weighted F1 score = <bound method MulticlassMetrics.weightedFMeasure of <pyspark.mllib.evaluation.MulticlassMetrics object at 0x7f387d3a1390>>

Weighted false positive rate = 0.178376002494803



In [122]:
print("Recall = %s" % metrics.recall(1.0))
print("Precision = %s" % metrics.precision(1.0))
print("Accuracy = %s" % metrics.accuracy) 
print("F1 = %s" % metrics.fMeasure(1.0))

Recall = 0.8082191780821918
Precision = 0.7023809523809523
Accuracy = 0.7755102040816326
F1 = 0.751592356687898


In [123]:
#spark.stop()