# PySpark MLlib: Logistic Regression

## Basic

In [1]:
from pyspark.sql import SparkSession

In [19]:
spark = SparkSession.builder.appName("log_reg_basic").getOrCreate()

In [20]:
df = spark.read.format("libsvm").load("./files/sample_libsvm_data.txt")
df.show()

24/06/09 21:44:03 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+


In [21]:
train_df, test_df = df.randomSplit([0.7, 0.3])

In [22]:
from pyspark.ml.classification import LogisticRegression

In [23]:
model = LogisticRegression().fit(train_df)

In [24]:
result = model.evaluate(test_df)

In [25]:
result.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[121,122,123...|[25.4030937027035...|[0.99999999999071...|       0.0|
|  0.0|(692,[124,125,126...|[27.1375820781175...|[0.99999999999836...|       0.0|
|  0.0|(692,[124,125,126...|[20.3607657754883...|[0.99999999856308...|       0.0|
|  0.0|(692,[126,127,128...|[25.9017561617222...|[0.99999999999436...|       0.0|
|  0.0|(692,[127,128,129...|[25.2670567872269...|[0.99999999998936...|       0.0|
|  0.0|(692,[127,128,129...|[21.8024816455057...|[0.99999999966013...|       0.0|
|  0.0|(692,[152,153,154...|[17.6969040624542...|[0.99999997937793...|       0.0|
|  0.0|(692,[153,154,155...|[30.1184893706196...|[0.99999999999991...|       0.0|
|  0.0|(692,[155,156,180...|[33.7502579117914...|[0.99999999999999...|       0.0|
|  0.0|(692,[181

In [26]:
result.accuracy

1.0

In [27]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [28]:
result = model.evaluate(test_df)

In [29]:
result.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[121,122,123...|[25.4030937027035...|[0.99999999999071...|       0.0|
|  0.0|(692,[124,125,126...|[27.1375820781175...|[0.99999999999836...|       0.0|
|  0.0|(692,[124,125,126...|[20.3607657754883...|[0.99999999856308...|       0.0|
|  0.0|(692,[126,127,128...|[25.9017561617222...|[0.99999999999436...|       0.0|
|  0.0|(692,[127,128,129...|[25.2670567872269...|[0.99999999998936...|       0.0|
|  0.0|(692,[127,128,129...|[21.8024816455057...|[0.99999999966013...|       0.0|
|  0.0|(692,[152,153,154...|[17.6969040624542...|[0.99999997937793...|       0.0|
|  0.0|(692,[153,154,155...|[30.1184893706196...|[0.99999999999991...|       0.0|
|  0.0|(692,[155,156,180...|[33.7502579117914...|[0.99999999999999...|       0.0|
|  0.0|(692,[181

In [30]:
prediction_and_label = result.predictions.select(["prediction", "label"])

In [32]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy").evaluate(prediction_and_label)

In [34]:
evaluator

1.0

## Real Data

In [127]:
spark = SparkSession.builder.appName("log_reg_real").getOrCreate()

In [128]:
df = spark.read.csv("./files/titanic.csv", header=True, inferSchema=True)
df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      

In [129]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [130]:
selected_df = df.select(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 
                         'Fare', 'Embarked', 'Survived'])

In [131]:
selected_df = selected_df.withColumnRenamed('Survived', 'label')
selected_df.show()

+------+------+----+-----+-----+-------+--------+-----+
|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|label|
+------+------+----+-----+-----+-------+--------+-----+
|     3|  male|22.0|    1|    0|   7.25|       S|    0|
|     1|female|38.0|    1|    0|71.2833|       C|    1|
|     3|female|26.0|    0|    0|  7.925|       S|    1|
|     1|female|35.0|    1|    0|   53.1|       S|    1|
|     3|  male|35.0|    0|    0|   8.05|       S|    0|
|     3|  male|NULL|    0|    0| 8.4583|       Q|    0|
|     1|  male|54.0|    0|    0|51.8625|       S|    0|
|     3|  male| 2.0|    3|    1| 21.075|       S|    0|
|     3|female|27.0|    0|    2|11.1333|       S|    1|
|     2|female|14.0|    1|    0|30.0708|       C|    1|
|     3|female| 4.0|    1|    1|   16.7|       S|    1|
|     1|female|58.0|    0|    0|  26.55|       S|    1|
|     3|  male|20.0|    0|    0|   8.05|       S|    0|
|     3|  male|39.0|    1|    5| 31.275|       S|    0|
|     3|female|14.0|    0|    0| 7.8542|       S

In [132]:
selected_df = selected_df.na.drop()

In [133]:
train_df, test_df = selected_df.randomSplit([0.7, 0.3])

In [134]:
train_df.show()

+------+------+----+-----+-----+--------+--------+-----+
|Pclass|   Sex| Age|SibSp|Parch|    Fare|Embarked|label|
+------+------+----+-----+-----+--------+--------+-----+
|     1|female| 2.0|    1|    2|  151.55|       S|    0|
|     1|female|14.0|    1|    2|   120.0|       S|    1|
|     1|female|15.0|    0|    1|211.3375|       S|    1|
|     1|female|16.0|    0|    1| 57.9792|       C|    1|
|     1|female|17.0|    1|    0|    57.0|       S|    1|
|     1|female|17.0|    1|    0|   108.9|       C|    1|
|     1|female|18.0|    0|    2|   79.65|       S|    1|
|     1|female|18.0|    2|    2| 262.375|       C|    1|
|     1|female|19.0|    0|    0|    30.0|       S|    1|
|     1|female|19.0|    0|    2| 26.2833|       S|    1|
|     1|female|21.0|    0|    0| 77.9583|       S|    1|
|     1|female|21.0|    2|    2| 262.375|       C|    1|
|     1|female|22.0|    0|    0|  151.55|       S|    1|
|     1|female|22.0|    0|    1|    55.0|       S|    1|
|     1|female|22.0|    0|    2

In [135]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [136]:
# sex_indexer = StringIndexer(inputCol="Sex", outputCol="sex_index")
# sex_encoder = OneHotEncoder(inputCol="sex_index", outputCol="sex_enc")

# selected_df = sex_indexer.fit(selected_df).transform(selected_df)
# selected_df = sex_encoder.fit(selected_df).transform(selected_df)

In [137]:
sex_indexer = StringIndexer(inputCol="Sex", outputCol="sex_index")
sex_encoder = OneHotEncoder(inputCol="sex_index", outputCol="sex_enc")

In [138]:
embarked_indexer = StringIndexer(inputCol="Embarked", outputCol="embarked_index")
embarked_encoder = OneHotEncoder(inputCol="embarked_index", outputCol="embarked_enc")

In [139]:
pclass_encoder = OneHotEncoder(inputCol="Pclass", outputCol="pclass_enc")
sibsp_encoder = OneHotEncoder(inputCol="SibSp", outputCol="sibsp_enc")
parch_encoder = OneHotEncoder(inputCol="Parch", outputCol="parch_enc")

In [140]:
assembler = VectorAssembler(inputCols=["sex_enc", "embarked_enc", "pclass_enc", "sibsp_enc", "parch_enc", "Age", "Fare"], outputCol="features")

In [141]:
log_reg = LogisticRegression()

In [142]:
from pyspark.ml import Pipeline

In [143]:
pipeline = Pipeline(stages=[sex_indexer, sex_encoder, embarked_indexer, embarked_encoder, pclass_encoder, sibsp_encoder, parch_encoder, assembler, log_reg])

In [144]:
pipeline = pipeline.fit(train_df)

                                                                                

In [145]:
result = pipeline.transform(test_df)

In [147]:
result.show()

+------+------+----+-----+-----+--------+--------+-----+---------+---------+--------------+-------------+-------------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|Pclass|   Sex| Age|SibSp|Parch|    Fare|Embarked|label|sex_index|  sex_enc|embarked_index| embarked_enc|   pclass_enc|    sibsp_enc|    parch_enc|            features|       rawPrediction|         probability|prediction|
+------+------+----+-----+-----+--------+--------+-----+---------+---------+--------------+-------------+-------------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|     1|female|16.0|    0|    0|    86.5|       S|    1|      1.0|(1,[],[])|           0.0|(2,[0],[1.0])|(3,[1],[1.0])|(5,[0],[1.0])|(6,[0],[1.0])|(19,[1,4,6,11,17,...|[-2.9109760106123...|[0.05161363915402...|       1.0|
|     1|female|16.0|    0|    1|    39.4|       S|    1|      1.0|(1,[],[])|           0.0|(2,[0],[1.0])|(3,[1],

In [148]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [149]:
auc = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label").evaluate(result)

                                                                                

In [150]:
auc

0.78125

## Project

In [174]:
spark = SparkSession.builder.appName("log_reg_project").getOrCreate()

In [175]:
df = spark.read.csv('./files/customer_churn.csv', header=True, inferSchema=True)

In [176]:
df.show()

+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|      Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|     

In [177]:
selected_df = df.select(["Age", "Total_Purchase", "Years", "Num_Sites", "Churn"])
selected_df.show()

+----+--------------+-----+---------+-----+
| Age|Total_Purchase|Years|Num_Sites|Churn|
+----+--------------+-----+---------+-----+
|42.0|       11066.8| 7.22|      8.0|    1|
|41.0|      11916.22|  6.5|     11.0|    1|
|38.0|      12884.75| 6.67|     12.0|    1|
|42.0|       8010.76| 6.71|     10.0|    1|
|37.0|       9191.58| 5.56|      9.0|    1|
|48.0|      10356.02| 5.12|      8.0|    1|
|44.0|      11331.58| 5.23|     11.0|    1|
|32.0|       9885.12| 6.92|      9.0|    1|
|43.0|       14062.6| 5.46|     11.0|    1|
|40.0|       8066.94| 7.11|     11.0|    1|
|30.0|      11575.37| 5.22|      8.0|    1|
|45.0|       8771.02| 6.64|     11.0|    1|
|45.0|       8988.67| 4.84|     11.0|    1|
|40.0|       8283.32|  5.1|     13.0|    1|
|41.0|       6569.87|  4.3|     11.0|    1|
|38.0|      10494.82| 6.81|     12.0|    1|
|45.0|       8213.41| 7.35|     11.0|    1|
|43.0|      11226.88| 8.08|     12.0|    1|
|53.0|       5515.09| 6.85|      8.0|    1|
|46.0|        8046.4| 5.69|     

In [178]:
selected_df = selected_df.withColumnRenamed("Churn", "label")
selected_df.show()

+----+--------------+-----+---------+-----+
| Age|Total_Purchase|Years|Num_Sites|label|
+----+--------------+-----+---------+-----+
|42.0|       11066.8| 7.22|      8.0|    1|
|41.0|      11916.22|  6.5|     11.0|    1|
|38.0|      12884.75| 6.67|     12.0|    1|
|42.0|       8010.76| 6.71|     10.0|    1|
|37.0|       9191.58| 5.56|      9.0|    1|
|48.0|      10356.02| 5.12|      8.0|    1|
|44.0|      11331.58| 5.23|     11.0|    1|
|32.0|       9885.12| 6.92|      9.0|    1|
|43.0|       14062.6| 5.46|     11.0|    1|
|40.0|       8066.94| 7.11|     11.0|    1|
|30.0|      11575.37| 5.22|      8.0|    1|
|45.0|       8771.02| 6.64|     11.0|    1|
|45.0|       8988.67| 4.84|     11.0|    1|
|40.0|       8283.32|  5.1|     13.0|    1|
|41.0|       6569.87|  4.3|     11.0|    1|
|38.0|      10494.82| 6.81|     12.0|    1|
|45.0|       8213.41| 7.35|     11.0|    1|
|43.0|      11226.88| 8.08|     12.0|    1|
|53.0|       5515.09| 6.85|      8.0|    1|
|46.0|        8046.4| 5.69|     

In [179]:
train_df, test_df = selected_df.randomSplit([0.7, 0.3])

In [180]:
assembler = VectorAssembler(inputCols=["Age", "Total_Purchase", "Years", "Num_Sites"], outputCol="features")
train_df = assembler.transform(train_df)
test_df = assembler.transform(test_df)

In [181]:
model = LogisticRegression()

In [182]:
model = model.fit(train_df)

In [183]:
result = model.evaluate(test_df)

In [184]:
result.predictions.select(["label", "prediction"]).show()

+-----+----------+
|label|prediction|
+-----+----------+
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
+-----+----------+


In [186]:
auc = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label").evaluate(result.predictions)

In [187]:
auc

0.7637049584231599