# Basic Machine Learning & More. Session I

In [1]:
import findspark
import pyspark
import time
import operator
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark import SparkFiles
import pandas as pd


conf = SparkConf()
conf.setMaster("local")
conf.setAppName("spark-basic")
sc = SparkContext(conf = conf)

url = "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv"
sc.addFile(url)
sqlContext = SQLContext(sc)

### Waking up & PySpark

In [None]:
def WhatIsThisDoing0(x):
    if x == 1:
        return 1
    else:
        return x * WhatIsThisDoing0(x-1)

3 * 2 * 1

In [None]:
def WhatIsThisDoing1(x):
    if x == 0:
        return 0
    elif x == 1:
        return 1
    else:
        return WhatIsThisDoing1(x-1) + WhatIsThisDoing1(x-2)

In [None]:
def WhatIsThisDoing2(x):
    a, b = 0, 1
    for i in range(x):
        a, b = b, a + b
    return a

In [None]:
dicty = {0:0, 1:1}

def WhatIsThisDoing3(x):
    if not x in dicty:
        dicty[x] = WhatIsThisDoing3(x-1) + WhatIsThisDoing3(x-2)
    return dicty[x]

In [None]:
def WhatIsThisDoing5(df, A, B):
    for a in A: 
        df = df.withColumn(a, df[a].cast(B))
    return df 

WhatIsThisDoing5(mydata, ["col1","col2", "col3"], "asFloat")

### How would we connect pyspark to a SQL Database (not-working examples)

Question: SQL vs NoSQL?

#### JDBC

In [None]:
from pyspark import SparkContext, SparkConf, SQLContext

appName = "PySpark SQL Server Example - via JDBC"
master = "local"
conf = SparkConf() \
    .setAppName(appName) \
    .setMaster(master) \
    .set("spark.driver.extraClassPath","sqljdbc_7.2/enu/mssql-jdbc-7.2.1.jre8.jar") # LOCAL Driver
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession

database = "test"
table = "dbo.Employees"
user = "zeppelin"
password  = "zeppelin"

jdbcDF = spark.read.format("jdbc") \
    .option("url", f"jdbc:sqlserver://localhost:1433;databaseName={database}") \ #LOCAL Database
    .option("dbtable", "Employees") \
    .option("user", user) \
    .option("password", password) \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .load()

jdbcDF.show()

#### Windows Server

In [None]:
#pip install pymssql

from pyspark import SparkContext, SparkConf, SQLContext
import _mssql
import pandas as pd

appName = "PySpark SQL Server Example - via pymssql"
master = "local"

conf = SparkConf().setAppName(appName).setMaster(master) 
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession

database = "test"
table = "dbo.Employees"
user = "zeppelin"
password  = "zeppelin"

conn = _mssql.connect(server='localhost:1433', user=user, password=password,database=database)
query = f"SELECT EmployeeID, EmployeeName, Position FROM {table}"
conn.execute_query(query)
rs = [ row for row in conn ]
pdf = pd.DataFrame(rs)
sparkDF = spark.createDataFrame(pdf)
sparkDF.show()
conn.close()


#### ODBC

In [None]:
#pip install pyodbc


from pyspark import SparkContext, SparkConf, SQLContext
import pyodbc
import pandas as pd

appName = "PySpark SQL Server Example - via ODBC"
master = "local"
conf = SparkConf() \
    .setAppName(appName) \
    .setMaster(master) 
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession

database = "test"
table = "dbo.Employees"
user = "zeppelin"
password  = "zeppelin"

conn = pyodbc.connect(f'DRIVER={{ODBC Driver 13 for SQL Server}};SERVER=localhost,1433;DATABASE={database};UID={user};PWD={password}')
query = f"SELECT EmployeeID, EmployeeName, Position FROM {table}"
pdf = pd.read_sql(query, conn)
sparkDF =  spark.createDataFrame(pdf)
sparkDF.show()

Some prior questions...

- What is Machine Learning?
- What's the difference between ML and AI?
- What's the difference between ML and BI?
- Which areas of ML do you know?
- How would you define a pipeline?

## A Machine Learning Pipeline. Full Example

## Cricket Example

In [2]:
my_data = sqlContext.read.csv('data\ind-ban-comment.csv',header=True,inferSchema=True)
my_data.printSchema()

root
 |-- Batsman: integer (nullable = true)
 |-- Batsman_Name: string (nullable = true)
 |-- Bowler: integer (nullable = true)
 |-- Bowler_Name: string (nullable = true)
 |-- Commentary: string (nullable = true)
 |-- Detail: string (nullable = true)
 |-- Dismissed: integer (nullable = true)
 |-- Id: integer (nullable = true)
 |-- Isball: boolean (nullable = true)
 |-- Isboundary: integer (nullable = true)
 |-- Iswicket: integer (nullable = true)
 |-- Over: double (nullable = true)
 |-- Runs: integer (nullable = true)
 |-- Timestamp: string (nullable = true)



Defining manually the Schema

In [3]:
import pyspark.sql.types as tp

my_schema = tp.StructType([
    tp.StructField(name= 'Batsman',      dataType= tp.IntegerType(),   nullable= True),
    tp.StructField(name= 'Batsman_Name', dataType= tp.StringType(),    nullable= True),
    tp.StructField(name= 'Bowler',       dataType= tp.IntegerType(),   nullable= True),
    tp.StructField(name= 'Bowler_Name',  dataType= tp.StringType(),    nullable= True),
    tp.StructField(name= 'Commentary',   dataType= tp.StringType(),    nullable= True),
    tp.StructField(name= 'Detail',       dataType= tp.StringType(),    nullable= True),
    tp.StructField(name= 'Dismissed',    dataType= tp.IntegerType(),   nullable= True),
    tp.StructField(name= 'Id',           dataType= tp.IntegerType(),   nullable= True),
    tp.StructField(name= 'Isball',       dataType= tp.BooleanType(),   nullable= True), 
    tp.StructField(name= 'Isboundary',   dataType= tp.IntegerType(),   nullable= True), # Should be Boolean, but...
    tp.StructField(name= 'Iswicket',     dataType= tp.IntegerType(),   nullable= True), # Should be Boolean, but...
    tp.StructField(name= 'Over',         dataType= tp.DoubleType(),    nullable= True),
    tp.StructField(name= 'Runs',         dataType= tp.IntegerType(),   nullable= True),
    tp.StructField(name= 'Timestamp',    dataType= tp.TimestampType(), nullable= True)    
])

my_data = sqlContext.read.csv('data\ind-ban-comment.csv',header= True,schema= my_schema)
my_data.printSchema()

root
 |-- Batsman: integer (nullable = true)
 |-- Batsman_Name: string (nullable = true)
 |-- Bowler: integer (nullable = true)
 |-- Bowler_Name: string (nullable = true)
 |-- Commentary: string (nullable = true)
 |-- Detail: string (nullable = true)
 |-- Dismissed: integer (nullable = true)
 |-- Id: integer (nullable = true)
 |-- Isball: boolean (nullable = true)
 |-- Isboundary: integer (nullable = true)
 |-- Iswicket: integer (nullable = true)
 |-- Over: double (nullable = true)
 |-- Runs: integer (nullable = true)
 |-- Timestamp: timestamp (nullable = true)



In [5]:
my_data = my_data.drop(*['Batsman', 'Bowler', 'Id']) # Dropping unnecessary columns 

Exercise: Find a way to check how many nulls we find in each column. Plus: Are you able to do it in one line?

In [6]:
### Your Code
from pyspark.sql.functions import isnan, when, count, col
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

NameError: name 'df' is not defined

### Encoding our text variables

In [7]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# create object of StringIndexer class and specify input and output column
SI_batsman = StringIndexer(inputCol='Batsman_Name',outputCol='Batsman_Index')
SI_bowler = StringIndexer(inputCol='Bowler_Name',outputCol='Bowler_Index')

# transform the data
my_data = SI_batsman.fit(my_data).transform(my_data)
my_data = SI_bowler.fit(my_data).transform(my_data)

# view the transformed data
my_data.select('Batsman_Name', 'Batsman_Index', 'Bowler_Name', 'Bowler_Index').show(10)

+-----------------+-------------+------------------+------------+
|     Batsman_Name|Batsman_Index|       Bowler_Name|Bowler_Index|
+-----------------+-------------+------------------+------------+
|   Mohammed Shami|         18.0| Mustafizur Rahman|         0.0|
|Bhuvneshwar Kumar|         16.0| Mustafizur Rahman|         0.0|
|   Mohammed Shami|         18.0| Mustafizur Rahman|         0.0|
|Bhuvneshwar Kumar|         16.0| Mustafizur Rahman|         0.0|
|         MS Dhoni|          7.0| Mustafizur Rahman|         0.0|
|         MS Dhoni|          7.0| Mustafizur Rahman|         0.0|
|         MS Dhoni|          7.0| Mustafizur Rahman|         0.0|
|         MS Dhoni|          7.0|Mohammad Saifuddin|         8.0|
|         MS Dhoni|          7.0|Mohammad Saifuddin|         8.0|
|         MS Dhoni|          7.0|Mohammad Saifuddin|         8.0|
+-----------------+-------------+------------------+------------+
only showing top 10 rows



Be aware that OneHotEncoder is an old version of OneHotEncoderEstimator!

In [8]:
# create object and specify input and output column
OHE = OneHotEncoder(inputCols=['Batsman_Index', 'Bowler_Index'],outputCols=['Batsman_OHE', 'Bowler_OHE'])

# transform the data
my_data = OHE.fit(my_data).transform(my_data)

# view and transform the data
my_data.select('Batsman_Name', 'Batsman_Index', 'Batsman_OHE', 'Bowler_Name', 'Bowler_Index', 'Bowler_OHE').show(10)

# Linear regression <- Betas depend on scale
# RF <- Importance Metrics

+-----------------+-------------+---------------+------------------+------------+--------------+
|     Batsman_Name|Batsman_Index|    Batsman_OHE|       Bowler_Name|Bowler_Index|    Bowler_OHE|
+-----------------+-------------+---------------+------------------+------------+--------------+
|   Mohammed Shami|         18.0|(19,[18],[1.0])| Mustafizur Rahman|         0.0|(11,[0],[1.0])|
|Bhuvneshwar Kumar|         16.0|(19,[16],[1.0])| Mustafizur Rahman|         0.0|(11,[0],[1.0])|
|   Mohammed Shami|         18.0|(19,[18],[1.0])| Mustafizur Rahman|         0.0|(11,[0],[1.0])|
|Bhuvneshwar Kumar|         16.0|(19,[16],[1.0])| Mustafizur Rahman|         0.0|(11,[0],[1.0])|
|         MS Dhoni|          7.0| (19,[7],[1.0])| Mustafizur Rahman|         0.0|(11,[0],[1.0])|
|         MS Dhoni|          7.0| (19,[7],[1.0])| Mustafizur Rahman|         0.0|(11,[0],[1.0])|
|         MS Dhoni|          7.0| (19,[7],[1.0])| Mustafizur Rahman|         0.0|(11,[0],[1.0])|
|         MS Dhoni|          7

What is this output showing?

In [9]:
from pyspark.ml.feature import VectorAssembler

# specify the input and output columns of the vector assembler
assembler = VectorAssembler(inputCols=['Isboundary', # Needs to be transformed to Integer
                                       'Iswicket', # Needs to be transformed to Integer
                                       'Over',
                                       'Runs',
                                       'Batsman_Index',
                                       'Bowler_Index',
                                       'Batsman_OHE',
                                       'Bowler_OHE'],
                           outputCol='vector')

# fill the null values
my_data = my_data.fillna(0)

final_data = assembler.transform(my_data)
final_data.select('vector').show()

+--------------------+
|              vector|
+--------------------+
|(36,[1,2,4,24,25]...|
|(36,[1,2,3,4,22,2...|
|(36,[2,3,4,24,25]...|
|(36,[2,3,4,22,25]...|
|(36,[1,2,4,13,25]...|
|(36,[2,4,13,25],[...|
|(36,[2,4,13,25],[...|
|(36,[2,3,4,5,13,3...|
|(36,[0,2,3,4,5,13...|
|(36,[2,4,5,13,33]...|
|(36,[2,4,5,13,33]...|
|(36,[0,2,3,4,5,13...|
|(36,[2,3,4,5,13,3...|
|(36,[2,4,22,25],[...|
|(36,[2,3,4,13,25]...|
|(36,[2,4,13,25],[...|
|(36,[2,3,4,22,25]...|
|(36,[1,2,4,19,25]...|
|(36,[2,3,4,13,25]...|
|(36,[2,3,4,5,13,3...|
+--------------------+
only showing top 20 rows



But... Where are the Pipelines?! 

Let's see our **first toy example** ! Comment the following code

In [None]:
from pyspark.ml import Pipeline

sample_df = spark.createDataFrame([
    (1, 'L101', 'R'),
    (2, 'L201', 'C'),
    (3, 'D111', 'R'),
    (4, 'F210', 'R'),
    (5, 'D110', 'C')
], ['id', 'category_1', 'category_2'])

sample_df.show()

stage_1 = StringIndexer(inputCol= 'category_1', outputCol= 'category_1_index')
stage_2 = StringIndexer(inputCol= 'category_2', outputCol= 'category_2_index') # 1,2,1,1,2
stage_3 = OneHotEncoder(inputCols=['category_2_index'], outputCols=['category_2_OHE']) # 1,0,1,1,0 / # 0,1,0,0,1

pipeline = Pipeline(stages=[stage_1, stage_2, stage_3])

pipeline_model = pipeline.fit(sample_df)
sample_df_updated = pipeline_model.transform(sample_df)

sample_df_updated.show()

Draw a schema of the pipeline that transforms the original data into the final one

Rawdata.py -> preprocessdata.py -> ([listfeatures], int (number missings)) featuring.py  ([listfeatures2], int (number missings), int listnewfeat) -> 

Let's do the same with a-bit-more-complex **second toy example**! Machine Learning appears!

In [None]:
from pyspark.ml.classification import LogisticRegression

sample_data_train = sqlContext.createDataFrame([
    (2.0, 'A', 'S10', 40, 1.0),
    (1.0, 'X', 'E10', 25, 1.0),
    (4.0, 'X', 'S20', 10, 0.0),
    (3.0, 'Z', 'S10', 20, 0.0),
    (4.0, 'A', 'E10', 30, 1.0),
    (2.0, 'Z', 'S10', 40, 0.0),
    (5.0, 'X', 'D10', 10, 1.0),
], ['feature_1', 'feature_2', 'feature_3', 'feature_4', 'label'])

sample_data_train.show()

In [None]:

if wantencoding == True:
    stage_1 = StringIndexer(inputCol= 'feature_2', outputCol= 'feature_2_index')
else:
    stage_1 = StringIndexer(inputCol= 'feature_2_mod', outputCol= 'feature_2_mod_index')



stage_2 = StringIndexer(inputCol= 'feature_3', outputCol= 'feature_3_index')

stage_3 = OneHotEncoder(inputCols=[stage_1.getOutputCol(), stage_2.getOutputCol()], 
                                 outputCols= ['feature_2_encoded', 'feature_3_encoded'])

stage_4 = VectorAssembler(inputCols=['feature_1', 'feature_2_encoded', 'feature_3_encoded', 'feature_4'],
                          outputCol='features')

stage_5 = LogisticRegression(featuresCol='features',labelCol='label')

regression_pipeline = Pipeline(stages= [stage_1, stage_2, stage_3, stage_4, stage_5])

model = regression_pipeline.fit(sample_data_train)

sample_data_train = model.transform(sample_data_train)

sample_data_train.select('features', 'label', 'rawPrediction', 'probability', 'prediction').show()

Draw a schema of the pipeline that transforms the original data into the final one

And, FINALLY, we can use the pipeline for a real test example! Check it!

In [None]:
sample_data_test = sqlContext.createDataFrame([
    (3.0, 'Z', 'S10', 40),
    (1.0, 'X', 'E10', 20),
    (4.0, 'A', 'S20', 10),
    (3.0, 'A', 'S10', 20),
    (4.0, 'X', 'D10', 30),
    (1.0, 'Z', 'E10', 20),
    (4.0, 'A', 'S10', 30),
], ['feature_1', 'feature_2', 'feature_3', 'feature_4'])

sample_data_test = model.transform(sample_data_test) #this model was trained with the train data

sample_data_test.select('features', 'rawPrediction', 'probability', 'prediction').show()

### Exercise 1. (More BI)

In [None]:
df = sqlContext.read.csv(SparkFiles.get("adult_data.csv"), header=True, inferSchema= True)

In [None]:
df

In [None]:
df.printSchema()

Check what would have happened if we didn't infer the Schema!

Exercise: Comment the following code! Why do we need it? What are A and B?

In [4]:
from pyspark.sql.types import *

df_wrong = sqlContext.read.csv(SparkFiles.get("adult_data.csv"), header=True, inferSchema= False)

def WhatIsThisDoing(df, A, B):
    for a in A: 
        df = df.withColumn(a, df[a].cast(B))
    return df 

CONTI_FEATURES  = ['age', 'fnlwgt','capital-gain', 'educational-num', 'capital-loss', 'hours-per-week']

df_wrong = WhatIsThisDoing(df_wrong, CONTI_FEATURES, FloatType())

df_wrong.printSchema()

root
 |-- x: string (nullable = true)
 |-- age: float (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: float (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: float (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: float (nullable = true)
 |-- capital-loss: float (nullable = true)
 |-- hours-per-week: float (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [8]:
df_wrong.select(["education","educational-num"]).take(20)

[Row(education='11th', educational-num=7.0),
 Row(education='HS-grad', educational-num=9.0),
 Row(education='Assoc-acdm', educational-num=12.0),
 Row(education='Some-college', educational-num=10.0),
 Row(education='Some-college', educational-num=10.0),
 Row(education='10th', educational-num=6.0),
 Row(education='HS-grad', educational-num=9.0),
 Row(education='Prof-school', educational-num=15.0),
 Row(education='Some-college', educational-num=10.0),
 Row(education='7th-8th', educational-num=4.0),
 Row(education='HS-grad', educational-num=9.0),
 Row(education='Bachelors', educational-num=13.0),
 Row(education='HS-grad', educational-num=9.0),
 Row(education='HS-grad', educational-num=9.0),
 Row(education='HS-grad', educational-num=9.0),
 Row(education='Masters', educational-num=14.0),
 Row(education='Some-college', educational-num=10.0),
 Row(education='HS-grad', educational-num=9.0),
 Row(education='HS-grad', educational-num=9.0),
 Row(education='Doctorate', educational-num=16.0)]

Brush up: Create a summary of the number of people by education level, ordered

In [None]:
### YOUR CODE HERE

Check if there are some native countries with few observations. Discuss how valuable are these rows! Should we delete them?

In [None]:
### YOUR CODE HERE

#### Preparing the data. 
Exercise. Feature Engineering, data cleaning and summarizing the data

Do your best :)

In [None]:
### YOUR CODE HERE

## Exercise 2. (more ML)

Adapt the Cricket Example to the Pipeline structure! Add some data cleaning, relabeling, feature engineering, scaling...And merge it up! After that, explore some machine learning algorithm to predict one of the meaningful events of the dataframe (isball, Runs...), using a train/test split and evaluate the result

In [None]:
### YOUR CODE HERE

We were following these amazing tutorials: 
 - https://www.guru99.com/pyspark-tutorial.html  
 - https://www.analyticsvidhya.com/blog/2019/11/build-machine-learning-pipelines-pyspark/