# ML Basics

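This notebook walks through a few machine-learning basics in Spark ML: vectors, summary statistics, train/test splits, vector assemblers, string indexing, one-hot encoding, and feature scaling.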

In [1]:
# Initialize the Spark session
import os
import sys

# Make the parent of the current working directory importable (added at most
# once); presumably that is where the `init_spark` helper module lives.
top_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if top_dir not in sys.path:
    sys.path.append(top_dir)

from init_spark import init_spark

spark = init_spark()
spark  # last expression of the cell: rendered as the cell output

Initializing Spark...
Spark found in :  /Users/sujee/spark
Spark config:
	 spark.app.name=TestApp
	spark.master=local[*]
	executor.memory=2g
	spark.sql.warehouse.dir=/var/folders/lp/qm_skljd2hl4xtps5vw0tdgm0000gn/T/tmp9s92_c44
	some_property=some_value
Spark UI running on port 4040


## Step 1: Vectors

In [2]:
from pyspark.ml.linalg import Vectors

# Dense vector: every component is given explicitly
v1 = Vectors.dense([3, 2, 1])
print(v1)

# Sparse vector of size 10: only indices 0 and 9 carry values (100 and 200)
v2 = Vectors.sparse(10, [0, 9], [100, 200])
print(v2)
# Expand the sparse vector to its full-length array form
print(v2.toArray())


[3.0,2.0,1.0]
(10,[0,9],[100.0,200.0])
[100.   0.   0.   0.   0.   0.   0.   0.   0. 200.]


## Step 2: Describe Data
`describe()` is a quick way to get summary statistics for a data set.

In [3]:
# Load the admissions CSV: first row is the header, column types are inferred
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/data/college-admissions/admission-data.csv")
)
df.show()

# Summary statistics (count, mean, stddev, min, max) for all columns
df.describe().show()

# Summary statistics for a single column: gre
df.describe("gre").show()

+-----+---+----+----+
|admit|gre| gpa|rank|
+-----+---+----+----+
|    0|380|3.61|   3|
|    1|660|3.67|   3|
|    1|800| 4.0|   1|
|    0|640|3.19|   4|
|    0|520|2.93|   4|
|    1|760| 3.0|   2|
|    0|560|2.98|   1|
|    0|400|3.08|   2|
|    0|540|3.39|   3|
|    1|700|3.92|   2|
|    1|800| 4.0|   4|
|    0|440|3.22|   1|
|    1|760| 4.0|   1|
|    1|700|3.08|   2|
|    1|700| 4.0|   1|
|    0|480|3.44|   3|
|    1|780|3.87|   4|
|    0|360|2.56|   3|
|    1|800|3.75|   2|
|    0|540|3.81|   1|
+-----+---+----+----+
only showing top 20 rows

+-------+-------------------+------------------+------------------+-----------------+
|summary|              admit|               gre|               gpa|             rank|
+-------+-------------------+------------------+------------------+-----------------+
|  count|                100|               100|               100|              100|
|   mean|               0.43|             600.0| 3.390699999999998|             2.52|
| stddev|0.49756

In [4]:
## Same summary statistics, converted to pandas for a nicer rendering
(
    df
    .describe()
    .toPandas()
)

Unnamed: 0,summary,admit,gre,gpa,rank
0,count,100.0,100.0,100.0,100.0
1,mean,0.43,600.0,3.390699999999998,2.52
2,stddev,0.497569851956243,124.46248065545332,0.3971877275408833,1.019803902718557
3,min,0.0,300.0,2.42,1.0
4,max,1.0,800.0,4.0,4.0


## Step 3: Split Dataset into Training & Testing
Run the following cell a few times, and observe the test / train sets.
Each run will have different data for train/test.

Q : How can we always get the same data for training and test?
Hint: pass any fixed integer as the seed   
`df.randomSplit(weights, seed)`

In [5]:
## Create a single-column ('id') DataFrame from a range starting at 1
df = spark.range(start=1, end=100)
df.show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
| 20|
+---+
only showing top 20 rows



In [6]:
## TODO : split the data 70% for training and 30% for testing
##    - first weight given to randomSplit is : 0.7  (representing 70%)
##    - second weight given to randomSplit is : 0.3  (representing 30%)
## No seed is passed here, so re-running the cell gives a different split.

train, test = df.randomSplit(weights=[0.7, 0.3])

print("----training data set-----")
print("count: ", train.count())
train.show()

print("----testing data set-----")
print("count: ", test.count())
test.show()

## There should be NO common rows between training and test
common = train.intersect(test)
print("----common data set-----")
print("count: ", common.count())
common.show()

----training data set-----
count:  67
+---+
| id|
+---+
|  1|
|  2|
|  4|
|  5|
|  6|
|  7|
|  8|
| 10|
| 11|
| 12|
| 13|
| 14|
| 16|
| 17|
| 19|
| 20|
| 21|
| 22|
| 23|
| 25|
+---+
only showing top 20 rows

----testing data set-----
count:  32
+---+
| id|
+---+
|  3|
|  9|
| 15|
| 18|
| 24|
| 31|
| 32|
| 34|
| 36|
| 37|
| 38|
| 39|
| 41|
| 46|
| 49|
| 50|
| 55|
| 61|
| 63|
| 65|
+---+
only showing top 20 rows

----common data set-----
count:  0
+---+
| id|
+---+
+---+



In [7]:
## Now split a 'real world' dataset: the college admissions data

dataset = spark.read.csv(
    "/data/college-admissions/admission-data.csv",
    header=True,
    inferSchema=True,
)

## TODO : split training 80%,  testing 20%
## Hint : the weights are 0.8  and 0.2
training, test = dataset.randomSplit(weights=[0.8, 0.2])

print("----training data set-----")
print("count: ", training.count())
training.show()

print("----testing data set-----")
print("count: ", test.count())
test.show()

----training data set-----
count:  82
+-----+---+----+----+
|admit|gre| gpa|rank|
+-----+---+----+----+
|    0|300|2.92|   4|
|    0|360|2.56|   3|
|    0|360|3.14|   1|
|    0|380|2.91|   4|
|    0|380|2.94|   3|
|    0|380|3.61|   3|
|    0|400|3.05|   2|
|    0|400|3.31|   3|
|    0|400|3.35|   3|
|    0|400|3.65|   2|
|    0|440|2.48|   4|
|    0|440|3.13|   4|
|    0|480|3.39|   4|
|    0|480|3.57|   2|
|    0|500|2.71|   2|
|    0|500|2.97|   4|
|    0|500|3.17|   3|
|    0|520|2.93|   4|
|    0|520|2.98|   2|
|    0|520|3.29|   1|
+-----+---+----+----+
only showing top 20 rows

----testing data set-----
count:  18
+-----+---+----+----+
|admit|gre| gpa|rank|
+-----+---+----+----+
|    0|400|3.08|   2|
|    0|440|3.22|   1|
|    0|480|3.44|   3|
|    0|500|3.31|   3|
|    0|520| 2.9|   3|
|    0|560|2.98|   1|
|    0|560|3.36|   3|
|    0|580|3.69|   1|
|    0|600|3.48|   2|
|    0|640|3.52|   4|
|    0|640| 4.0|   3|
|    1|460|3.45|   3|
|    1|620| 4.0|   1|
|    1|660|3.63|   

In [10]:
## TODO : check how each split is distributed over the 'admit' column
## Hint : groupBy('admit')
print("training data split")
(
    training
    .groupBy("admit")
    .count()
    .show()
)

print("testing data split")
(
    test
    .groupBy("admit")
    .count()
    .show()
)

training data split
+-----+-----+
|admit|count|
+-----+-----+
|    1|   36|
|    0|   46|
+-----+-----+

testing data split
+-----+-----+
|admit|count|
+-----+-----+
|    1|    7|
|    0|   11|
+-----+-----+



## Step 4: Vector Assemblers

In [11]:
from pyspark.ml.feature import VectorAssembler

# Reload the admissions data (header row present, column types inferred)
df = spark.read.csv(
    "/data/college-admissions/admission-data.csv",
    header=True,
    inferSchema=True,
)
df.show()

+-----+---+----+----+
|admit|gre| gpa|rank|
+-----+---+----+----+
|    0|380|3.61|   3|
|    1|660|3.67|   3|
|    1|800| 4.0|   1|
|    0|640|3.19|   4|
|    0|520|2.93|   4|
|    1|760| 3.0|   2|
|    0|560|2.98|   1|
|    0|400|3.08|   2|
|    0|540|3.39|   3|
|    1|700|3.92|   2|
|    1|800| 4.0|   4|
|    0|440|3.22|   1|
|    1|760| 4.0|   1|
|    1|700|3.08|   2|
|    1|700| 4.0|   1|
|    0|480|3.44|   3|
|    1|780|3.87|   4|
|    0|360|2.56|   3|
|    1|800|3.75|   2|
|    0|540|3.81|   1|
+-----+---+----+----+
only showing top 20 rows



In [12]:
## Combine the gre, gpa and rank columns into one vector column
## named 'features'
assembler = VectorAssembler(
    inputCols=["gre", "gpa", "rank"],
    outputCol="features",
)
feature_vector = assembler.transform(df)
feature_vector.show(n=40)

+-----+---+----+----+----------------+
|admit|gre| gpa|rank|        features|
+-----+---+----+----+----------------+
|    0|380|3.61|   3|[380.0,3.61,3.0]|
|    1|660|3.67|   3|[660.0,3.67,3.0]|
|    1|800| 4.0|   1| [800.0,4.0,1.0]|
|    0|640|3.19|   4|[640.0,3.19,4.0]|
|    0|520|2.93|   4|[520.0,2.93,4.0]|
|    1|760| 3.0|   2| [760.0,3.0,2.0]|
|    0|560|2.98|   1|[560.0,2.98,1.0]|
|    0|400|3.08|   2|[400.0,3.08,2.0]|
|    0|540|3.39|   3|[540.0,3.39,3.0]|
|    1|700|3.92|   2|[700.0,3.92,2.0]|
|    1|800| 4.0|   4| [800.0,4.0,4.0]|
|    0|440|3.22|   1|[440.0,3.22,1.0]|
|    1|760| 4.0|   1| [760.0,4.0,1.0]|
|    1|700|3.08|   2|[700.0,3.08,2.0]|
|    1|700| 4.0|   1| [700.0,4.0,1.0]|
|    0|480|3.44|   3|[480.0,3.44,3.0]|
|    1|780|3.87|   4|[780.0,3.87,4.0]|
|    0|360|2.56|   3|[360.0,2.56,3.0]|
|    1|800|3.75|   2|[800.0,3.75,2.0]|
|    0|540|3.81|   1|[540.0,3.81,1.0]|
|    0|500|3.17|   3|[500.0,3.17,3.0]|
|    1|660|3.63|   2|[660.0,3.63,2.0]|
|    0|600|2.82|   4|[600

## Step 5: String Indexers

In [13]:
# Build a small pandas DataFrame with a categorical 'color' column
import pandas as pd

df_pd = pd.DataFrame(
    {
        "id": list(range(1, 8)),
        "color": ["red", "white", "blue", "blue", "white", "yellow", "blue"],
    }
)
df_pd

Unnamed: 0,id,color
0,1,red
1,2,white
2,3,blue
3,4,blue
4,5,white
5,6,yellow
6,7,blue


In [14]:
# Turn the pandas DataFrame into a Spark DataFrame
df_spark = spark.createDataFrame(data=df_pd)
df_spark.show()

+---+------+
| id| color|
+---+------+
|  1|   red|
|  2| white|
|  3|  blue|
|  4|  blue|
|  5| white|
|  6|yellow|
|  7|  blue|
+---+------+



In [15]:
# Index the string column 'color' with a StringIndexer
from pyspark.ml.feature import IndexToString, StringIndexer

str_indexer = StringIndexer(
    inputCol="color",
    outputCol="colorIndex",
)

# fit() produces the indexer model; transform() adds the 'colorIndex' column
model = str_indexer.fit(df_spark)
indexed = model.transform(df_spark)
indexed.show()


+---+------+----------+
| id| color|colorIndex|
+---+------+----------+
|  1|   red|       2.0|
|  2| white|       1.0|
|  3|  blue|       0.0|
|  4|  blue|       0.0|
|  5| white|       1.0|
|  6|yellow|       3.0|
|  7|  blue|       0.0|
+---+------+----------+



## Step 6: Reverse String Indexer

In [16]:
from pyspark.ml.feature import IndexToString

# Map the numeric 'colorIndex' values back to string labels in 'originalColor'
converter = IndexToString(
    inputCol="colorIndex",
    outputCol="originalColor",
)
converted = converter.transform(indexed)
converted.show()


+---+------+----------+-------------+
| id| color|colorIndex|originalColor|
+---+------+----------+-------------+
|  1|   red|       2.0|          red|
|  2| white|       1.0|        white|
|  3|  blue|       0.0|         blue|
|  4|  blue|       0.0|         blue|
|  5| white|       1.0|        white|
|  6|yellow|       3.0|       yellow|
|  7|  blue|       0.0|         blue|
+---+------+----------+-------------+



## Step 7: One Hot Encoding

In [17]:
# Step 1 : create a pandas df and then a spark df
import pandas as pd

df2_pd = pd.DataFrame({"id":[1,2,3,4,5,6,7], 
                      "status":['married', 'single', 'single', 'divorced', 'married' ,'single', 'married' ]})
# The bare `df2_pd` expression that used to follow was removed: only the last
# expression of a cell is rendered, so mid-cell it was a dead statement.
df2_spark = spark.createDataFrame(df2_pd)
df2_spark.show()

+---+--------+
| id|  status|
+---+--------+
|  1| married|
|  2|  single|
|  3|  single|
|  4|divorced|
|  5| married|
|  6|  single|
|  7| married|
+---+--------+



In [18]:
## Step 2 : convert the categorical 'status' column to numeric indexes

from pyspark.ml.feature import OneHotEncoder
from pyspark.sql.functions import exp

# First run a StringIndexer (class imported in the Step 5 cell above)
string_indexer = StringIndexer(
    inputCol="status",
    outputCol="statusIndex",
)
model = string_indexer.fit(df2_spark)
indexed = model.transform(df2_spark)
indexed.show()



+---+--------+-----------+
| id|  status|statusIndex|
+---+--------+-----------+
|  1| married|        1.0|
|  2|  single|        0.0|
|  3|  single|        0.0|
|  4|divorced|        2.0|
|  5| married|        1.0|
|  6|  single|        0.0|
|  7| married|        1.0|
+---+--------+-----------+



In [19]:
## Step 3 : encode the indexes into a vector

# dropLast=False keeps one vector slot per category (3 slots for 3 statuses)
encoder = OneHotEncoder(inputCol="statusIndex", outputCol="statusVector", dropLast=False)

# Spark 2.x: OneHotEncoder is a Transformer and is applied directly.
# Spark 3.x: OneHotEncoder is an Estimator and must be fit before transforming.
if hasattr(encoder, "fit"):
    encoded = encoder.fit(indexed).transform(indexed)
else:
    encoded = encoder.transform(indexed)
encoded.show()

# View dense vectors in pandas
encoded_pd = encoded.toPandas()
print(encoded_pd)

+---+--------+-----------+-------------+
| id|  status|statusIndex| statusVector|
+---+--------+-----------+-------------+
|  1| married|        1.0|(3,[1],[1.0])|
|  2|  single|        0.0|(3,[0],[1.0])|
|  3|  single|        0.0|(3,[0],[1.0])|
|  4|divorced|        2.0|(3,[2],[1.0])|
|  5| married|        1.0|(3,[1],[1.0])|
|  6|  single|        0.0|(3,[0],[1.0])|
|  7| married|        1.0|(3,[1],[1.0])|
+---+--------+-----------+-------------+

   id    status  statusIndex     statusVector
0   1   married          1.0  (0.0, 1.0, 0.0)
1   2    single          0.0  (1.0, 0.0, 0.0)
2   3    single          0.0  (1.0, 0.0, 0.0)
3   4  divorced          2.0  (0.0, 0.0, 1.0)
4   5   married          1.0  (0.0, 1.0, 0.0)
5   6    single          0.0  (1.0, 0.0, 0.0)
6   7   married          1.0  (0.0, 1.0, 0.0)


## Step 8:  Scaling Data

### 8.1: StandardScaler
[Standard Scaler documentation](https://spark.apache.org/docs/2.2.0/ml-features.html#standardscaler)

In [20]:
# Step 1: create a pandas df and then spark df
import pandas as pd
from pyspark.ml.feature import VectorAssembler 


df_pd = pd.DataFrame({"home_runs": [ 30,  22,  17,  12, 44,   38,  40], 
                      "salary_in_k":[ 700, 450,340, 250, 1200, 800, 950 ]})
# The bare `df_pd` expression that used to follow was removed: only the last
# expression of a cell is rendered, so mid-cell it was a dead statement.
df_spark = spark.createDataFrame(df_pd)
df_spark.show()



+---------+-----------+
|home_runs|salary_in_k|
+---------+-----------+
|       30|        700|
|       22|        450|
|       17|        340|
|       12|        250|
|       44|       1200|
|       38|        800|
|       40|        950|
+---------+-----------+



In [21]:
## Step 2 : combine home_runs and salary_in_k into one 'features' vector
assembler = VectorAssembler(
    inputCols=["home_runs", "salary_in_k"],
    outputCol="features",
)
feature_vector = assembler.transform(df_spark)
feature_vector.show(n=40)

+---------+-----------+-------------+
|home_runs|salary_in_k|     features|
+---------+-----------+-------------+
|       30|        700| [30.0,700.0]|
|       22|        450| [22.0,450.0]|
|       17|        340| [17.0,340.0]|
|       12|        250| [12.0,250.0]|
|       44|       1200|[44.0,1200.0]|
|       38|        800| [38.0,800.0]|
|       40|        950| [40.0,950.0]|
+---------+-----------+-------------+



In [22]:
## Step 3 : Scale data
from pyspark.ml.feature import StandardScaler

# withStd=True / withMean=False: scale by standard deviation, no mean-centering
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withStd=True,
    withMean=False,
)
scalerModel = scaler.fit(feature_vector)
scaledData = scalerModel.transform(feature_vector)
# truncate=False so the full scaled vectors are printed
scaledData.show(n=10, truncate=False)

+---------+-----------+-------------+---------------------------------------+
|home_runs|salary_in_k|features     |scaled_features                        |
+---------+-----------+-------------+---------------------------------------+
|30       |700        |[30.0,700.0] |[2.4359938288234506,2.03376119068933]  |
|22       |450        |[22.0,450.0] |[1.7863954744705304,1.3074179083002835]|
|17       |340        |[17.0,340.0] |[1.3803965029999552,0.987826864049103] |
|12       |250        |[12.0,250.0] |[0.9743975315293802,0.7263432823890463]|
|44       |1200       |[44.0,1200.0]|[3.5727909489410608,3.4864477554674225]|
|38       |800        |[38.0,800.0] |[3.0855921831763706,2.324298503644948] |
|40       |950        |[40.0,950.0] |[3.2479917717646005,2.760104473078376] |
+---------+-----------+-------------+---------------------------------------+



### 8.2: MinMaxScaler
[MinMaxScaler docs](https://spark.apache.org/docs/2.1.0/ml-features.html#minmaxscaler)

In [23]:
## Step 4 : Try a MinMaxScaler
from pyspark.ml.feature import MinMaxScaler

## TODO : define a MinMaxScaler with  min=1  and max=100
mmScaler = MinMaxScaler(
    min=1,
    max=100,
    inputCol="features",
    outputCol="scaled_features2",
)
scaledModel2 = mmScaler.fit(feature_vector)
scaledData2 = scaledModel2.transform(feature_vector)
# truncate=False so the full scaled vectors are printed
scaledData2.show(n=10, truncate=False)

+---------+-----------+-------------+-----------------------------+
|home_runs|salary_in_k|features     |scaled_features2             |
+---------+-----------+-------------+-----------------------------+
|30       |700        |[30.0,700.0] |[56.6875,47.89473684210526]  |
|22       |450        |[22.0,450.0] |[31.9375,21.842105263157894] |
|17       |340        |[17.0,340.0] |[16.46875,10.378947368421054]|
|12       |250        |[12.0,250.0] |[1.0,1.0]                    |
|44       |1200       |[44.0,1200.0]|[100.0,100.0]                |
|38       |800        |[38.0,800.0] |[81.4375,58.31578947368421]  |
|40       |950        |[40.0,950.0] |[87.625,73.94736842105263]   |
+---------+-----------+-------------+-----------------------------+

