# <center> Introduction to Spark In-memory Computing via Python PySpark </center>

In [None]:
import sys
import os
import pyspark

# Spark-related environment variables to report. Each name is listed once
# (SPARK_ROOT used to be printed twice).
SPARK_ENV_VARS = (
    'SPARK_ROOT',
    'SPARK_CONFIG_FILE',
    'SPARK_MASTER_HOST',
    'SPARK_MASTER_PORT',
    'SPARK_MASTER_WEBUI_PORT',
)

for env_name in SPARK_ENV_VARS:
    # .get() keeps the cell from aborting with KeyError on the first unset
    # variable, so every missing setting shows up in a single run.
    print(os.environ.get(env_name, '<{} is not set>'.format(env_name)))

## Application: spam filtering



|     | viagra  | learning  | the | dating | prince | spam?   |
| --- | ------- | --------- | --- | ------ | ------ | ------- | 
| X1  | 1       |  0        |  1  |  0     | 0      | Y1 = 1  |
| X2  | 0       |  1        |  1  |  0     | 0      | Y2 = -1 | 
| X3  | 0       |  0        |  0  |  0     | 1      | Y3 = 1  |


- Instances X1, X2, X3 belong to the instance space X (data points)
  - Binary or real-valued feature vector X of word occurrences
  - `d` features (words and other things, d is approximately 100,000)
- Class Y
  - Spam = 1
  - Ham  = -1


## Support Vector Machine

 
- Originally developed by Vapnik and collaborators as a linear classifier.
- Could be modified to support non-linear classification by mapping into high-dimensional spaces.

## Simple example

- https://spark.apache.org/docs/2.4.5/api/python/pyspark.mllib.html#pyspark.mllib.classification.SVMModel
- https://spark.apache.org/docs/2.4.5/mllib-data-types.html#labeled-point
- https://spark.apache.org/docs/2.4.5/api/python/pyspark.sql.html#pyspark.sql.DataFrame (look for randomSplit)

In [None]:
# Wrap the SparkContext `sc` in a SparkSession; `spark.read` is used later to load the CSV.
# NOTE(review): `sc` is not defined in this notebook -- presumably the PySpark
# kernel/launcher provides it; confirm before running this standalone.
spark = pyspark.sql.SparkSession(sc)

In [None]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD

In [None]:
# Toy training set of one-feature points: label 0.0 for feature 0.0,
# label 1.0 for features 1.0, 2.0 and 3.0.
data = [
    LabeledPoint(label, [feature])
    for label, feature in ((0.0, 0.0), (1.0, 1.0), (1.0, 2.0), (1.0, 3.0))
]

# Distribute the points as an RDD and fit a linear SVM with 10 SGD iterations.
svm = SVMWithSGD.train(sc.parallelize(data), iterations=10)

# Classify a single feature vector with the fitted model.
svm.predict([1.0])

### Question: Can you predict whether a client will subscribe to a term deposit (target column `deposit`)?

### Problems:
- What data format should the bank data be converted to?
- How to handle categorical data?

In [None]:
# Load bank.csv into a DataFrame: the first line is treated as the header and
# column types are inferred from the data rather than all read as strings.
df = spark.read.csv('bank.csv', header = True, inferSchema = True)
# Show the resulting column names and types.
df.printSchema()

In [None]:
# Peek at the first five rows.
df.take(5)

In [None]:
# Collect the distinct values of the 'deposit' column and print them.
unique_deposit = df.select('deposit').distinct().collect()
print(unique_deposit)

In [None]:
# Distinct values of the 'marital' column.
df.select('marital').distinct().collect()

### Do we want all columns?

In [None]:
# Keep the DataFrame's column names in order; dataPrep later uses a column's
# position in this list to index into each row.
columns = df.columns
print(columns)

### How do we build categorical data?

In [None]:
# Build an integer encoding for every column treated as categorical:
#     cat_dictionary[column][value] -> integer code
cat_dictionary = {}
cat_columns = [
    'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
    'day', 'month', 'campaign', 'pdays', 'previous', 'poutcome', 'deposit',
]

for c in cat_columns:
    # distinct() makes no promise about row order, so number the values only
    # after sorting them. Otherwise the codes -- including which 'deposit'
    # value becomes label 0 and which becomes label 1 -- can differ from one
    # run to the next.
    unique_c = sorted(
        (row[c] for row in df.select(c).distinct().collect()),
        # Nulls sort last; the tuple key avoids comparing None with a value.
        key=lambda v: (v is None, v),
    )
    cat_dictionary[c] = {v: i for i, v in enumerate(unique_c)}

In [None]:
# Display the value -> code mapping built for each categorical column.
cat_dictionary

### Generate LabeledPoints

In [None]:
def dataPrep(r):
    """Convert one row of the bank data into a LabeledPoint.

    The 'deposit' column supplies the label (its integer code from
    cat_dictionary). Every other column contributes one feature, in the
    order given by the module-level ``columns`` list: columns named in
    ``cat_columns`` contribute their integer code from cat_dictionary, all
    remaining columns contribute the row's raw value.

    Args:
        r: a row indexable by position, with fields in the same order as
            ``columns``.

    Returns:
        LabeledPoint(label, features).

    Raises:
        KeyError: if a categorical value has no entry in cat_dictionary.
    """
    key = 0
    value = []
    # enumerate() hands us each position directly; the previous
    # columns.index(c) lookup re-searched the list for every field of every
    # row and would pick the wrong position if a column name repeated.
    for i, c in enumerate(columns):
        if c == 'deposit':
            key = cat_dictionary[c][r[i]]
        elif c in cat_columns:
            value.append(cat_dictionary[c][r[i]])
        else:
            value.append(r[i])
    return LabeledPoint(key, value)

In [None]:
# Look at the first row as it appears in the DataFrame's underlying RDD.
df.rdd.take(1)

In [None]:
# Try dataPrep on a single row before mapping it over the whole data set.
dataPrep(df.rdd.take(1)[0])

In [None]:
# Apply dataPrep to every row. Despite the df_ prefix, the result of
# df.rdd.map(...) is an RDD (of dataPrep's LabeledPoint results), not a DataFrame.
df_clean = df.rdd.map(dataPrep)
# Inspect the first 20 converted points.
df_clean.take(20)

### Training and Testing sets

In [None]:
# Randomly split the points with weights 0.8 / 0.2 and seed 1234.
# df_svm[0] is used for training and df_svm[1] for testing further down.
df_svm = df_clean.randomSplit([0.8, 0.2], 1234)

In [None]:
# Print the number of points in the full data set, then in each of the two splits.
for subset in (df_clean, df_svm[0], df_svm[1]):
    print(subset.count())

### Training

In [None]:
# Fit the SVM on the first split (df_svm[0]) with 200 SGD iterations.
svm_bank = SVMWithSGD.train(df_svm[0], iterations=200)

### Testing

In [None]:
def testPrediction(p):
    """Check the svm_bank prediction for one labeled point.

    Args:
        p: an object with ``features`` and ``label`` attributes.

    Returns:
        ("correct", 1) when svm_bank's prediction for ``p.features`` equals
        ``p.label``, otherwise ("incorrect", 1). The constant 1 lets the
        pairs be summed per key to obtain counts.
    """
    outcome = "correct" if svm_bank.predict(p.features) == p.label else "incorrect"
    return (outcome, 1)

In [None]:
# Score every point of the second split (df_svm[1]), then sum the 1s per key
# to get the number of "correct" and "incorrect" predictions.
df_results = df_svm[1].map(testPrediction).reduceByKey(lambda x, y: x + y)
df_results.collect()

## Challenge

- To be completed at home
- Can you change the column combinations to make this prediction better?