# Logistical Regression

In [600]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import IndexToString, Normalizer, StringIndexer, VectorAssembler, VectorIndexer, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import RegressionEvaluator
from helpers.helper_functions import translate_to_file_string
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics, RegressionMetrics
from pyspark.mllib.util import MLUtils
import pandas as pd
from IPython.display import display, HTML

#Change Column Names
def delete_space(df):
    names = df.schema.names
    for name in names:
        newName = name.replace(" ","")
        df = df.withColumnRenamed(name, newName)
    return df

In [601]:
inputFile = translate_to_file_string("../data/data.csv")

Spark session creation 

In [602]:
spark = (SparkSession
       .builder
       .appName("Modell_LogisticalRegression")
       .getOrCreate())

DataFrame creation using an ifered Schema 

In [603]:
# create a DataFrame using an ifered Schema 
df = spark.read.option("header", "true") \
       .option("inferSchema", "true") \
       .option("delimiter", ";") \
       .csv(inputFile) 

df = delete_space(df)
df_orig = df
df = df.where("MonthlyCharges Between 22 AND 95")
df = df.where("TotalCharges IS NOT NULL")

## Prepare training and test data.

### Use this for Pandas Dataframe

In [604]:

# #Create Pandas DataFrame
# df_pandas = df.toPandas()
# df_pandas_cat = df.toPandas()
# #Pandas Indexing Method to Integer or Category Datatype
# pandasCol = list(df_pandas)
# for col in pandasCol:
#     if df_pandas[col].dtypes=='object':
#         if not col == "Contract":
#             #Categorize
#             df_pandas_cat[col]= pd.Categorical(pd.factorize(df_pandas_cat[col])[0])
#             #ToInteger
#             df_pandas[col]= pd.factorize(df_pandas[col])[0]

# #Define whicht Columns should be normalized
# newCols = []
# for col in pandasCol:
#     if not col == "Contract":
#         newCols.append(col)

# #Normalize the selected Columns
# df_pandas[newCols]=(df_pandas[newCols]-df_pandas[newCols].min())/(df_pandas[newCols].max()-df_pandas[newCols].min())



# #Write Pandas Dataframe Back to Spark Dataframe
# df_temp = spark.createDataFrame(df_pandas)
# df = df_temp
# df.printSchema()
# # Create Indexer for Contract (Still needed)
# contractIndexer = StringIndexer().setInputCol("Contract").setOutputCol("Contract_Int").fit(df)

# #Create FeatureCols 
# featureCols = df.columns.copy()
# featureCols.remove("Contract")
# featureCols.remove("CustomerID")


###  Create Indexer

In [605]:
#Comment the Following if Pandas-Dataset is used
IDIndexer = StringIndexer().setInputCol("CustomerID").setOutputCol("CustomerID_Int").fit(df)
genderIndexer = StringIndexer().setInputCol("Gender").setOutputCol("Gender_Int").fit(df)
seniorIndexer = StringIndexer().setInputCol("SeniorCitizen").setOutputCol("SeniorCitizen_Int").fit(df)
partnerIndexer = StringIndexer().setInputCol("Partner").setOutputCol("Partner_Int").fit(df)
DependentsIndexer = StringIndexer().setInputCol("Dependents").setOutputCol("Dependents_Int").fit(df)
tenureIndexer = StringIndexer().setInputCol("Tenure").setOutputCol("Tenure_Int").fit(df)
phoneIndexer = StringIndexer().setInputCol("PhoneService").setOutputCol("PhoneService_Int").fit(df)
multipleIndexer = StringIndexer().setInputCol("MultipleLines").setOutputCol("MultipleLines_Int").fit(df)
internetIndexer = StringIndexer().setInputCol("InternetService").setOutputCol("InternetService_Int").fit(df)
onlineSecurityIndexer = StringIndexer().setInputCol("OnlineSecurity").setOutputCol("OnlineSecurity_Int").fit(df)
onlineBackupIndexer = StringIndexer().setInputCol("OnlineBackup").setOutputCol("OnlineBackup_Int").fit(df)
deviceIndexer = StringIndexer().setInputCol("DeviceProtection").setOutputCol("DeviceProtection_Int").fit(df)
techIndexer = StringIndexer().setInputCol("TechSupport").setOutputCol("TechSupport_Int").fit(df)
streamingTVIndexer = StringIndexer().setInputCol("StreamingTV").setOutputCol("StreamingTV_Int").fit(df)
streamingMoviesIndexer = StringIndexer().setInputCol("StreamingMovies").setOutputCol("StreamingMovies_Int").fit(df)
contractIndexer = StringIndexer().setInputCol("Contract").setOutputCol("Contract_Int").fit(df)
paperlessIndexer = StringIndexer().setInputCol("PaperlessBilling").setOutputCol("PaperlessBilling_Int").fit(df)
paymentIndexer = StringIndexer().setInputCol("PaymentMethod").setOutputCol("PaymentMethod_Int").fit(df)
monthlyIndexer = StringIndexer().setInputCol("MonthlyCharges").setOutputCol("MonthlyCharges_Int").fit(df)
totalIndexer = StringIndexer().setInputCol("TotalCharges").setOutputCol("TotalCharges_Int").fit(df)

In [606]:
#Comment this if Pandas Dataset is used
featureCols = df.columns.copy()
for col in featureCols:
    if not col == "Tenure" and not col == "MonthlyCharges" and not col == "TotalCharges":
        featureCols.remove(col)
        colname = col +"_Int"
        featureCols = featureCols + [colname]
    else:
        featureCols.remove(col)
        featureCols = featureCols + [col]

featureCols.remove("Contract_Int")
featureCols.remove("CustomerID_Int")
featureCols.remove("Gender")
featureCols = featureCols + ["Gender_Int"]

In [607]:
assembler =  VectorAssembler(outputCol="features", inputCols=list(featureCols))

featureIndexer = VectorIndexer(inputCol="features",outputCol="indexedFeatures", maxCategories=6) 

predConverter = IndexToString(inputCol="prediction",outputCol="predictedLabel",labels=contractIndexer.labels)

scaler = StandardScaler(inputCol="indexedFeatures", outputCol="scaledFeatures",withStd=True, withMean=False)

## Logistic Regression

In [608]:
lr = LogisticRegression(featuresCol="scaledFeatures", labelCol="Contract_Int")
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [ 100, 120 , 150 ])\
                                 .addGrid(lr.regParam, [ 0.1, 0.01 ]) \
                                 .addGrid(lr.elasticNetParam, [0 , 0.3 , 0.6 , 0.8]) \
                                 .build()
# paramGrid = ParamGridBuilder().build()

### Create Train and Test Datasets

In [609]:
splits = df.randomSplit([0.9, 0.1 ], 12345)
train = splits[0]
test = splits[1]

### Build the Pipeline

In [610]:
#Use This for Pandas-Dataframe
# pipeline = Pipeline(stages= [contractIndexer, assembler, featureIndexer, scaler, lr, predConverter])

In [611]:
##Use This for Spark Dataframe
pipeline = Pipeline(stages= [genderIndexer, seniorIndexer, partnerIndexer, DependentsIndexer, phoneIndexer, multipleIndexer, internetIndexer, onlineSecurityIndexer, onlineBackupIndexer, deviceIndexer, techIndexer, streamingTVIndexer, streamingMoviesIndexer, contractIndexer, paperlessIndexer, paymentIndexer, assembler, featureIndexer, scaler, lr, predConverter])

### Build the Evaluator and Cross Validator

In [612]:
evaluator =  RegressionEvaluator(labelCol="Contract_Int", metricName="rmse")

cv = CrossValidator(estimator=pipeline,evaluator=evaluator,estimatorParamMaps=paramGrid,numFolds=10, parallelism=2)

### Train the Model

In [613]:
cvModel = cv.fit(train)

In [614]:
#stages[19] for Spark Dataframe ;  stages[4] for Pandas Dataframe 
treeModel = cvModel.bestModel.stages[19]
print("Learned classification tree model:\n",treeModel)
print("Best Params: \n", treeModel.explainParams())

Learned classification tree model:
 LogisticRegressionModel: uid=LogisticRegression_8d77984b9fc6, numClasses=3, numFeatures=18
Best Params: 
 aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.3)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: scaledFeatures)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: Contract_Int)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes,

In [615]:
predictions = cvModel.transform(test)
predictions.select("prediction", "Contract_Int", "predictedLabel", "Contract", "features", "indexedFeatures").show()

+----------+------------+--------------+--------------+--------------------+--------------------+
|prediction|Contract_Int|predictedLabel|      Contract|            features|     indexedFeatures|
+----------+------------+--------------+--------------+--------------------+--------------------+
|       2.0|         2.0|      One year|      One year|[0.0,0.0,0.0,27.0...|[0.0,0.0,0.0,27.0...|
|       1.0|         1.0|      Two year|      Two year|[0.0,1.0,1.0,25.0...|[0.0,1.0,1.0,25.0...|
|       0.0|         0.0|Month-to-month|Month-to-month|(18,[3,5,8,9,12,1...|(18,[3,5,8,9,12,1...|
|       2.0|         0.0|      One year|Month-to-month|(18,[1,3,6,9,11,1...|(18,[1,3,6,9,11,1...|
|       2.0|         2.0|      One year|      One year|[1.0,0.0,0.0,44.0...|[1.0,0.0,0.0,44.0...|
|       2.0|         1.0|      One year|      Two year|[0.0,1.0,1.0,48.0...|[0.0,1.0,1.0,48.0...|
|       0.0|         0.0|Month-to-month|Month-to-month|(18,[1,2,3,6,12,1...|(18,[1,2,3,6,12,1...|
|       0.0|        

### Evaluation

In [616]:
predictionAndLabels = predictions.select("prediction", "Contract_Int").rdd.map(lambda p: [p[0], p[1]]) # Map to RDD prediction|label
metrics =  MulticlassMetrics(predictionAndLabels)

In [617]:
#Confusion Matrix
confusion = metrics.confusionMatrix()

In [618]:
#Kennzahlen pro Klasse
labels = predictionAndLabels.map(lambda x: x[1]).distinct().collect()
for label in  labels:
  print("Class %f precision = %f\n" % (label , metrics.precision(label)))
  print("Class %f recall = %f\n" % (label, metrics.recall(label)))
  print("Class %f F1 score = %f\n" % (label, metrics.fMeasure( label)))


Class 2.000000 precision = 0.534884

Class 2.000000 recall = 0.277108

Class 2.000000 F1 score = 0.365079

Class 1.000000 precision = 0.715909

Class 1.000000 recall = 0.863014

Class 1.000000 F1 score = 0.782609

Class 0.000000 precision = 0.839080

Class 0.000000 recall = 0.927966

Class 0.000000 F1 score = 0.881288



In [619]:
#Kennzahlen für das ganze Modell
print("Accuracy = %s\n" % metrics.accuracy) 
print("Test Error = %s\n" % (1 - metrics.accuracy))
print("Recall = %s\n" % metrics.weightedRecall)
print("Precision = %s\n" % metrics.weightedPrecision)
print("F1 = %s\n" % metrics.weightedFMeasure())

Accuracy = 0.7780612244897959

Test Error = 0.22193877551020413

Recall = 0.778061224489796

Precision = 0.751733931069694

F1 = 0.7536120548589773



In [620]:
spark.stop()