# ML Basics

We are going to go over a few ML Basics to get the basic concepts.

## Vectors

In [None]:
from pyspark.ml.linalg import Vectors

# Dense vector: every component is listed explicitly.
v1 = Vectors.dense(3, 2, 1)
print(v1)

# Sparse vector of size 10: only indices 0 and 9 carry values (100 and 200).
sparse_size = 10
v2 = Vectors.sparse(sparse_size, (0, 9), (100, 200))
print(v2)
# toArray() expands the sparse vector into its full array form.
print(v2.toArray())


## Describe Data
`describe()` is a quick way to get summary statistics for a data set.

In [None]:
# Load the admissions CSV; the first row is the header and column types are inferred.
admissions_path = "/data/college-admissions/admission-data.csv"
df = spark.read.csv(admissions_path, header=True, inferSchema=True)
df.show()

# Summary statistics for all columns.
summary_all = df.describe()
summary_all.show()

# Summary statistics for a single column: gre.
summary_gre = df.describe('gre')
summary_gre.show()

## Split Dataset into Training & Testing
Run the following cell a few times, and observe the test / train sets.
Each run will have different data for train/test.

Q : How can we always get the same data for training and test?
Hint : set the seed value to any fixed integer   
`df.randomSplit(weights, seed)`

In [None]:
## Build a DataFrame from an integer range (spark.range excludes the end value)
df = spark.range(start=1, end=100)
df.show()

In [None]:
## TODO : let's split 70% for training and 30% for testing
##    - first weight in the list passed to randomSplit is : 0.7  (representing 70%)
##    - second weight in the list passed to randomSplit is : 0.3  (representing 30%)
##    - replace each ??? below with the corresponding weight

(train, test) = df.randomSplit([???, ???])
print("----training data set-----")
print("count: ", train.count())
train.show()

print("----testing data set-----")
print("count: ", test.count())
test.show()

## There should be NO common data between training and test.
## intersect() keeps only the rows that appear in both DataFrames,
## so the count printed below is expected to be 0.
common = train.intersect(test)
print("----common data set-----")
print("count: ", common.count())
common.show()

In [None]:
## now let's split a 'real world dataset'

## Load the admissions CSV (header row present, column types inferred)
dataset = spark.read.csv("/data/college-admissions/admission-data.csv",\
                         header=True, inferSchema=True)

## TODO : split training 80%,  testing 20%
## Hint : the weights in the list are 0.8  and 0.2 (replace each ??? below)
(training, test) = dataset.randomSplit([???, ???])
print("----training data set-----")
print("count: ", training.count())
training.show()

print("----testing data set-----")
print("count: ", test.count())
test.show()

In [None]:
## TODO : evaluate how the data is split by 'admit' column
## Hint : groupBy('admit')  (replace each "???" below with the column name)
## The counts per group show how the 'admit' values are distributed in each split.
print("training data split")
training.groupBy("???").count().show()

print("testing data split")
test.groupBy("???").count().show()

## Vector Assemblers

In [None]:
from pyspark.ml.feature import VectorAssembler

# Reload the admissions data (header row present, column types inferred).
df = spark.read.csv(
    "/data/college-admissions/admission-data.csv",
    header=True,
    inferSchema=True,
)
df.show()

In [None]:
## Combine the gre, gpa and rank columns into a single vector column.
## The new column is named 'features'.
feature_columns = ["gre", "gpa", "rank"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
feature_vector = assembler.transform(df)
# Display up to 40 rows, including the new 'features' column.
feature_vector.show(40)

## String Indexers

In [None]:
# Build a small pandas DataFrame with an id and a categorical 'color' column
import pandas as pd

ids = [1, 2, 3, 4, 5, 6, 7]
colors = ['red', 'white', 'blue', 'blue', 'white', 'yellow', 'blue']
df_pd = pd.DataFrame({"id": ids, "color": colors})
df_pd

In [None]:
# Turn the pandas DataFrame into a Spark DataFrame and display it
df_spark = spark.createDataFrame(data=df_pd)
df_spark.show()

In [None]:
# Index the 'color' strings into a numeric 'colorIndex' column
from pyspark.ml.feature import IndexToString, StringIndexer

str_indexer = StringIndexer().setInputCol("color").setOutputCol("colorIndex")

# fit() produces the indexer model; transform() appends the index column
model = str_indexer.fit(df_spark)
indexed = model.transform(df_spark)
indexed.show()


## Reverse String Indexer

In [None]:
from pyspark.ml.feature import IndexToString

# Map the numeric 'colorIndex' values back to strings in 'originalColor'
converter = IndexToString(
    inputCol="colorIndex",
    outputCol="originalColor",
)
converted = converter.transform(indexed)
converted.show()


## One Hot Encoding

In [None]:
# Step 1 : build a pandas DataFrame with a categorical 'status' column,
#          then convert it to a Spark DataFrame
import pandas as pd

person_ids = [1, 2, 3, 4, 5, 6, 7]
statuses = ['married', 'single', 'single', 'divorced', 'married', 'single', 'married']
df2_pd = pd.DataFrame({"id": person_ids, "status": statuses})
df2_pd  # not the last expression in the cell, so the notebook does not render it
df2_spark = spark.createDataFrame(df2_pd)
df2_spark.show()

In [None]:
## Step 2 : convert  categorical data to indexes 

# StringIndexer is imported here as well, so this section also runs when the
# earlier "String Indexers" cells were skipped (e.g. after a kernel restart).
from pyspark.ml.feature import OneHotEncoder, StringIndexer
# NOTE(review): `exp` is not used in the visible cells — confirm before removing.
from pyspark.sql.functions import exp

# first String Indexer: map each distinct 'status' string to a numeric 'statusIndex'
string_indexer = StringIndexer(inputCol="status", outputCol="statusIndex")
model = string_indexer.fit(df2_spark)
indexed = model.transform(df2_spark)
indexed.show()



In [None]:
## Step 3 : encode the indexes into a vector

# dropLast=False keeps one vector slot per category (no category is dropped).
encoder = OneHotEncoder(inputCol="statusIndex", outputCol="statusVector", dropLast=False)

# In Spark 3.x OneHotEncoder is an Estimator and must be fit() before it can
# transform; in Spark 2.x it is a plain Transformer with no fit(). Supporting
# both keeps this cell working on either version.
encoder_model = encoder.fit(indexed) if hasattr(encoder, "fit") else encoder
encoded = encoder_model.transform(indexed)
encoded.show()

# View the encoded rows as a pandas DataFrame
# (toPandas() keeps the vector objects as-is; it does not densify them)
encoded_pd = encoded.toPandas()
print(encoded_pd)

##  Scaling Data

### StandardScaler
[StandardScaler documentation](https://spark.apache.org/docs/2.2.0/ml-features.html#standardscaler)

In [None]:
# Step 1: build a pandas DataFrame of home runs and salaries, then a Spark DataFrame
import pandas as pd
from pyspark.ml.feature import VectorAssembler

home_runs = [30, 22, 17, 12, 44, 38, 40]
salary_in_k = [700, 450, 340, 250, 1200, 800, 950]
df_pd = pd.DataFrame({"home_runs": home_runs, "salary_in_k": salary_in_k})
df_pd  # not the last expression in the cell, so the notebook does not render it
df_spark = spark.createDataFrame(df_pd)
df_spark.show()



In [None]:
## Step 2 : assemble both numeric columns into one 'features' vector column
input_columns = ["home_runs", "salary_in_k"]
assembler = VectorAssembler(inputCols=input_columns, outputCol="features")
feature_vector = assembler.transform(df_spark)
feature_vector.show(40)

In [None]:
## Step 3 : Scale data
from pyspark.ml.feature import StandardScaler

# withStd=True scales by standard deviation; withMean=False leaves the data uncentered
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withStd=True,
    withMean=False,
)
scalerModel = scaler.fit(feature_vector)
scaledData = scalerModel.transform(feature_vector)
# truncate=False prints the full vectors instead of cutting them off
scaledData.show(10, truncate=False)

### MinMaxScaler
[MinMaxScaler docs](https://spark.apache.org/docs/2.1.0/ml-features.html#minmaxscaler)

In [None]:
## Step 4 : Try a MinMaxScaler
from pyspark.ml.feature import MinMaxScaler

## TODO : define minMaxScaler with  min=1  and max=100
##    - replace each ??? below with the corresponding value
##    - the rescaled vectors are written to the 'scaled_features2' column
mmScaler = MinMaxScaler(min=???, max=???, inputCol="features", outputCol="scaled_features2")
scaledModel2 = mmScaler.fit(feature_vector)
scaledData2 = scaledModel2.transform(feature_vector)
## show up to 10 rows without truncating the vector columns
scaledData2.show(10, False)