# Step 1: Create the Spark Session object

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('log_reg').getOrCreate()

# Step 2: Read the dataset

In [3]:
df = spark.read.csv('Log_Reg_dataset.csv', inferSchema=True, header=True)

# Step 3: Exploratory Data Analysis

In [4]:
def shape(df):
    """Return the dimensions of ``df`` as a ``(rows, columns)`` tuple.

    The row count comes from ``df.count()`` and the column count from
    ``len(df.columns)``, mirroring the pandas ``DataFrame.shape`` idea.
    """
    n_rows = df.count()
    n_cols = len(df.columns)
    return (n_rows, n_cols)

In [5]:
shape(df)

(20000, 6)

In [6]:
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Repeat_Visitor: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Web_pages_viewed: integer (nullable = true)
 |-- Status: integer (nullable = true)



In [7]:
df.describe().show()

+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|summary| Country|              Age|   Repeat_Visitor|Platform| Web_pages_viewed|            Status|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|  count|   20000|            20000|            20000|   20000|            20000|             20000|
|   mean|    null|         28.53955|           0.5029|    null|           9.5533|               0.5|
| stddev|    null|7.888912950773227|0.500004090187782|    null|6.073903499824976|0.5000125004687693|
|    min|  Brazil|               17|                0|    Bing|                1|                 0|
|    max|Malaysia|              111|                1|   Yahoo|               29|                 1|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+



In [8]:
df.show(5)

+---------+---+--------------+--------+----------------+------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|
+---------+---+--------------+--------+----------------+------+
|    India| 41|             1|   Yahoo|              21|     1|
|   Brazil| 28|             1|   Yahoo|               5|     0|
|   Brazil| 40|             0|  Google|               3|     0|
|Indonesia| 31|             1|    Bing|              15|     1|
| Malaysia| 32|             0|  Google|              15|     1|
+---------+---+--------------+--------+----------------+------+
only showing top 5 rows



In [9]:
df.groupBy('Country').count().show()

+---------+-----+
|  Country|count|
+---------+-----+
| Malaysia| 1218|
|    India| 4018|
|Indonesia|12178|
|   Brazil| 2586|
+---------+-----+



In [10]:
df.groupBy('Platform').count().show()

+--------+-----+
|Platform|count|
+--------+-----+
|   Yahoo| 9859|
|    Bing| 4360|
|  Google| 5781|
+--------+-----+



In [11]:
df.groupBy('Status').count().show()

+------+-----+
|Status|count|
+------+-----+
|     1|10000|
|     0|10000|
+------+-----+



In [12]:
df.groupBy('Country').mean().show()

+---------+------------------+-------------------+---------------------+--------------------+
|  Country|          avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|         avg(Status)|
+---------+------------------+-------------------+---------------------+--------------------+
| Malaysia|27.792282430213465| 0.5730706075533661|   11.192118226600986|  0.6568144499178982|
|    India|27.976854156296664| 0.5433051269288203|   10.727227476356397|  0.6212045793927327|
|Indonesia| 28.43159796354081| 0.5207751683363442|    9.985711939563148|  0.5422893742814913|
|   Brazil|30.274168600154677|  0.322892498066512|    4.921113689095128|0.038669760247486466|
+---------+------------------+-------------------+---------------------+--------------------+



In [13]:
df.groupBy('Status').mean().show()

+------+--------+-------------------+---------------------+-----------+
|Status|avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|avg(Status)|
+------+--------+-------------------+---------------------+-----------+
|     1| 26.5435|             0.7019|              14.5617|        1.0|
|     0| 30.5356|             0.3039|               4.5449|        0.0|
+------+--------+-------------------+---------------------+-----------+



# Step 4: Feature Engineering

Convert the categorical variables into numerical form using `StringIndexer` and `OneHotEncoder`, then combine all the input features into a single vector with Spark's `VectorAssembler`.

In [14]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

In [15]:
# Fit a StringIndexer on the Platform column, then append the numeric
# label index to the DataFrame as Platform_Num.
search_engine_indexer = StringIndexer(
    inputCol='Platform',
    outputCol='Platform_Num',
).fit(df)
df = search_engine_indexer.transform(df)

In [16]:
df.show(5)

+---------+---+--------------+--------+----------------+------+------------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|Platform_Num|
+---------+---+--------------+--------+----------------+------+------------+
|    India| 41|             1|   Yahoo|              21|     1|         0.0|
|   Brazil| 28|             1|   Yahoo|               5|     0|         0.0|
|   Brazil| 40|             0|  Google|               3|     0|         1.0|
|Indonesia| 31|             1|    Bing|              15|     1|         2.0|
| Malaysia| 32|             0|  Google|              15|     1|         1.0|
+---------+---+--------------+--------+----------------+------+------------+
only showing top 5 rows



In [17]:
df.groupBy('Platform').count().orderBy('count', ascending=False).show(5)

+--------+-----+
|Platform|count|
+--------+-----+
|   Yahoo| 9859|
|  Google| 5781|
|    Bing| 4360|
+--------+-----+



In [18]:
df.groupBy('Platform_Num').count().orderBy('count', ascending=False).show(5)

+------------+-----+
|Platform_Num|count|
+------------+-----+
|         0.0| 9859|
|         1.0| 5781|
|         2.0| 4360|
+------------+-----+



In [19]:
# Represent each Platform_Num value as a one-hot encoded vector.
from pyspark.ml.feature import OneHotEncoder

search_engine_encoder = OneHotEncoder(inputCol='Platform_Num', outputCol='Platform_Vector')
# Spark >= 3.0 turned OneHotEncoder into an Estimator that must be fitted
# before it can transform; on Spark 2.x it is a plain Transformer with no
# fit(). Handle both so the cell runs regardless of the installed version.
if hasattr(search_engine_encoder, 'fit'):
    search_engine_encoder_model = search_engine_encoder.fit(df)
else:
    search_engine_encoder_model = search_engine_encoder
df = search_engine_encoder_model.transform(df)

In [20]:
df.show(3)

+-------+---+--------------+--------+----------------+------+------------+---------------+
|Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|Platform_Num|Platform_Vector|
+-------+---+--------------+--------+----------------+------+------------+---------------+
|  India| 41|             1|   Yahoo|              21|     1|         0.0|  (2,[0],[1.0])|
| Brazil| 28|             1|   Yahoo|               5|     0|         0.0|  (2,[0],[1.0])|
| Brazil| 40|             0|  Google|               3|     0|         1.0|  (2,[1],[1.0])|
+-------+---+--------------+--------+----------------+------+------------+---------------+
only showing top 3 rows



In [22]:
df.groupBy('Platform_Vector').count().orderBy('count', ascending=False).show()

+---------------+-----+
|Platform_Vector|count|
+---------------+-----+
|  (2,[0],[1.0])| 9859|
|  (2,[1],[1.0])| 5781|
|      (2,[],[])| 4360|
+---------------+-----+



In [24]:
# Apply the same indexing step to the Country column: fit a StringIndexer
# and append the numeric label index as Country_Num.
country_indexer = StringIndexer(
    inputCol='Country',
    outputCol='Country_Num',
).fit(df)
df = country_indexer.transform(df)

In [25]:
df.groupBy('Country').count().orderBy('count', ascending=False).show(5)

+---------+-----+
|  Country|count|
+---------+-----+
|Indonesia|12178|
|    India| 4018|
|   Brazil| 2586|
| Malaysia| 1218|
+---------+-----+



In [26]:
df.groupBy('Country_Num').count().orderBy('count', ascending=False).show(5)

+-----------+-----+
|Country_Num|count|
+-----------+-----+
|        0.0|12178|
|        1.0| 4018|
|        2.0| 2586|
|        3.0| 1218|
+-----------+-----+



In [27]:
# One-hot encode Country_Num into Country_Vector.
country_encoder = OneHotEncoder(inputCol='Country_Num', outputCol='Country_Vector')
# Spark >= 3.0 turned OneHotEncoder into an Estimator that must be fitted
# before it can transform; on Spark 2.x it is a plain Transformer with no
# fit(). Handle both so the cell runs regardless of the installed version.
if hasattr(country_encoder, 'fit'):
    country_encoder_model = country_encoder.fit(df)
else:
    country_encoder_model = country_encoder
df = country_encoder_model.transform(df)

In [28]:
df.select(['Country', 'Country_Num', 'Country_Vector']).show(3)

+-------+-----------+--------------+
|Country|Country_Num|Country_Vector|
+-------+-----------+--------------+
|  India|        1.0| (3,[1],[1.0])|
| Brazil|        2.0| (3,[2],[1.0])|
| Brazil|        2.0| (3,[2],[1.0])|
+-------+-----------+--------------+
only showing top 3 rows



In [33]:
df.groupBy('Country_Vector').count().orderBy('count', ascending=False).show()

+--------------+-----+
|Country_Vector|count|
+--------------+-----+
| (3,[0],[1.0])|12178|
| (3,[1],[1.0])| 4018|
| (3,[2],[1.0])| 2586|
|     (3,[],[])| 1218|
+--------------+-----+



In [34]:
df.columns

['Country',
 'Age',
 'Repeat_Visitor',
 'Platform',
 'Web_pages_viewed',
 'Status',
 'Platform_Num',
 'Platform_Vector',
 'Country_Num',
 'Country_Vector']

In [35]:
# Columns that are concatenated, in this order, into the single
# 'features' vector consumed by the model.
feature_columns = [
    'Platform_Vector',
    'Country_Vector',
    'Age',
    'Repeat_Visitor',
    'Web_pages_viewed',
]
df_assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
df = df_assembler.transform(df)

In [36]:
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Repeat_Visitor: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Web_pages_viewed: integer (nullable = true)
 |-- Status: integer (nullable = true)
 |-- Platform_Num: double (nullable = false)
 |-- Platform_Vector: vector (nullable = true)
 |-- Country_Num: double (nullable = false)
 |-- Country_Vector: vector (nullable = true)
 |-- features: vector (nullable = true)



In [38]:
df.select(['features', 'Status']).show(10)

+--------------------+------+
|            features|Status|
+--------------------+------+
|[1.0,0.0,0.0,1.0,...|     1|
|[1.0,0.0,0.0,0.0,...|     0|
|(8,[1,4,5,7],[1.0...|     0|
|(8,[2,5,6,7],[1.0...|     1|
|(8,[1,5,7],[1.0,3...|     1|
|(8,[1,4,5,7],[1.0...|     0|
|(8,[1,4,5,7],[1.0...|     0|
|(8,[1,2,5,7],[1.0...|     0|
|(8,[0,2,5,7],[1.0...|     0|
|(8,[2,5,6,7],[1.0...|     1|
+--------------------+------+
only showing top 10 rows



In [39]:
model_df = df.select(['features', 'Status'])

# Step 5: Splitting the Dataset

In [41]:
# 75/25 train/test split. A fixed seed keeps the split, and therefore every
# metric computed further down, stable across "Restart & Run All".
# NOTE: the exact row counts shown below will differ from a previous
# unseeded run until the notebook is re-executed.
train_df, test_df = model_df.randomSplit([.75, .25], seed=42)
shape(train_df), shape(test_df)

((15001, 2), (4999, 2))

In [42]:
# Check that the training set is balanced across both classes.
train_df.groupBy('Status').count().show()

+------+-----+
|Status|count|
+------+-----+
|     1| 7472|
|     0| 7529|
+------+-----+



In [43]:
test_df.groupBy('Status').count().show()

+------+-----+
|Status|count|
+------+-----+
|     1| 2528|
|     0| 2471|
+------+-----+



# Step 6: Build and Train Logistic Regression Model

In [44]:
from pyspark.ml.classification import LogisticRegression

In [45]:
log_reg = LogisticRegression(labelCol='Status').fit(train_df)

In [46]:
# Score the training set, then inspect rows that are actual positives
# (Status == 1) and were also predicted positive.
train_results = log_reg.evaluate(train_df).predictions
true_positive_rows = train_results.filter(
    (train_results['Status'] == 1) & (train_results['prediction'] == 1)
)
true_positive_rows.select(['Status', 'prediction', 'probability']).show(10)

+------+----------+--------------------+
|Status|prediction|         probability|
+------+----------+--------------------+
|     1|       1.0|[0.30709238167482...|
|     1|       1.0|[0.17206932221585...|
|     1|       1.0|[0.17206932221585...|
|     1|       1.0|[0.17206932221585...|
|     1|       1.0|[0.08880484143428...|
|     1|       1.0|[0.08880484143428...|
|     1|       1.0|[0.08880484143428...|
|     1|       1.0|[0.08880484143428...|
|     1|       1.0|[0.04370517002470...|
|     1|       1.0|[0.04370517002470...|
+------+----------+--------------------+
only showing top 10 rows



# Step 7: Evaluate Logistic Regression Model on Test Data

In [48]:
results = log_reg.evaluate(test_df).predictions
results.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Status: integer (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [49]:
results.select(['Status', 'prediction']).show(10)

+------+----------+
|Status|prediction|
+------+----------+
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
|     0|       0.0|
+------+----------+
only showing top 10 rows



## Confusion Matrix

In [51]:
# Confusion-matrix cells on the test predictions:
#   tp: actual 1, predicted 1    tn: actual 0, predicted 0
#   fp: actual 0, predicted 1    fn: actual 1, predicted 0
actual = results['Status']
predicted = results['prediction']
tp = results.filter((actual == 1) & (predicted == 1)).count()
tn = results.filter((actual == 0) & (predicted == 0)).count()
fp = results.filter((actual == 0) & (predicted == 1)).count()
fn = results.filter((actual == 1) & (predicted == 0)).count()

## Accuracy

In [53]:
# Accuracy = correctly classified rows / all rows.
# Cast to float BEFORE dividing (as recall and precision do) so the result is
# not truncated by integer division on interpreters where int / int is an int.
accuracy = float(tp + tn) / results.count()
accuracy

0.9363872774554911

## Recall

In [54]:
# Recall = TP / (TP + FN): the share of actual positives the model found.
actual_positives = tp + fn
recall = float(tp) / actual_positives
recall

0.9291930379746836

## Precision

In [56]:
# Precision = TP / (TP + FP): the share of predicted positives that are correct.
predicted_positives = tp + fp
precision = float(tp) / predicted_positives
precision

0.9441318327974276