# Curso Big Data #8 - Logistic Regression

#### 1. Inicializamos la sesion en spark

In [1]:

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('logistic_regression').getOrCreate()

#### 2. Cargamos el dataset

In [2]:

df = spark.read.csv('C:/Users/pc/pruebas/customer_churn.csv', inferSchema=True, header=True)


In [3]:
df.show(5)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|  Cynthia Norton|37.0|    

In [4]:
df.createOrReplaceTempView("customer_churn")

sqlDF = spark.sql("SELECT * FROM customer_churn")
sqlDF.show(3)


+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
only showing top 3 rows



In [5]:
df.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                null|                null|0.16666666666666666|
| stddev|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.764835592035

#### 3. Transformamos el data set para poder ser usadao en spark

In [12]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Years', 'Num_Sites'],
                           outputCol='features')
final_df = assembler.transform(df).select('features', 'Churn')

#### 4. Train test split

In [8]:
train_data, test_data = final_df.randomSplit([0.7, 0.3])

#### 5. Creating the logistic regression model

In [9]:
from pyspark.ml.classification import LogisticRegression
classifier = LogisticRegression(featuresCol='features', labelCol='Churn', predictionCol='prediction')
fitted_classifier = classifier.fit(train_data)

#### 6. Evaluate

In [13]:
summary = fitted_classifier.summary
summary.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              Churn|         prediction|
+-------+-------------------+-------------------+
|  count|                607|                607|
|   mean|0.16474464579901152|0.12191103789126853|
| stddev| 0.3712558639736437| 0.3274528741849083|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [22]:
pred_vs_actual =fitted_classifier.evaluate(test_data)
pred_vs_actual.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              Churn|         prediction|
+-------+-------------------+-------------------+
|  count|                293|                293|
|   mean|0.17064846416382254|0.12286689419795221|
| stddev| 0.3768451263941418|0.32884600951751697|
|    min|                  0|                0.0|
|    max|                  1|                1.0|
+-------+-------------------+-------------------+



 #### 4. Evaluamos usando el testset


In [14]:
pred_vs_actual = fitted_classifier.evaluate(test_data)
pred_vs_actual.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|Churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,4....|    0|[5.20626170905429...|[0.99454776065714...|       0.0|
|[25.0,9672.03,5.4...|    0|[4.7200791919095,...|[0.99116429310038...|       0.0|
|[26.0,8787.39,5.4...|    1|[0.99066422373772...|[0.72921909900891...|       0.0|
|[28.0,8670.98,3.9...|    0|[8.01847410201920...|[0.99967078626575...|       0.0|
|[29.0,5900.78,5.5...|    0|[4.54584100136750...|[0.98950017136183...|       0.0|
|[29.0,8688.17,5.7...|    1|[3.10409672239031...|[0.95706141489491...|       0.0|
|[29.0,13240.01,4....|    0|[7.39812682137532...|[0.99938797598299...|       0.0|
|[29.0,13255.05,4....|    0|[4.70356938436280...|[0.99101852748948...|       0.0|
|[30.0,11575.37,5....|    1|[4.48552015195958...|[0.98885459674943...|       0.0|
|[30.0,12788.37,

In [15]:
# Evaluamos el area debajo de la curva ROC

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Churn')
evaluator.evaluate(pred_vs_actual.predictions)

0.7635390946502059

#### 5. Predecimos usando unlabeled data


In [29]:
# Loading the dataset

df.createOrReplaceTempView("new_customers")

df_unlabeled = sqlContext.sql('SELECT * FROM new_customers')

In [31]:
df_unlabeled.show(5)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|  Cynthia Norton|37.0|    

#### 5.1 Transformamos el dataframe en aceptable para PySpark

In [32]:

final_df_unlabeled = assembler.transform(df_unlabeled)

In [34]:
final_df_unlabeled.show(5)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+--------------------+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|            features|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+--------------------+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|[42.0,11066.8,7.2...|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|[41.0,11916.22,6....|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|[38.0,12884.75,6....|
|   Phillip White|42.0|       8010.76|  

#### 6. Creamos un nuevo modelo usando todo el data set

In [35]:
classifier_all = classifier.fit(final_df)
results = classifier_all.transform(final_df_unlabeled)

In [36]:
results.select('Company', 'prediction').show()

+--------------------+----------+
|             Company|prediction|
+--------------------+----------+
|          Harvey LLC|       0.0|
|          Wilson PLC|       1.0|
|Miller, Johnson a...|       1.0|
|           Smith Inc|       0.0|
|          Love-Jones|       0.0|
|        Kelly-Warren|       0.0|
|   Reynolds-Sheppard|       1.0|
|          Singh-Cole|       0.0|
|           Lopez PLC|       1.0|
|       Reed-Martinez|       1.0|
|Briggs, Lamb and ...|       0.0|
|    Figueroa-Maynard|       1.0|
|     Abbott-Thompson|       0.0|
|Smith, Kim and Ma...|       1.0|
|Snyder, Lee and M...|       0.0|
|      Sanders-Pierce|       1.0|
|Andrews, Adams an...|       1.0|
|Morgan, Phillips ...|       1.0|
|      Villanueva LLC|       0.0|
|Berry, Orr and Ca...|       0.0|
+--------------------+----------+
only showing top 20 rows

