# ML Basics

This notebook walks through a few machine-learning (ML) building blocks in Spark to introduce the basic concepts.

## Vectors

In [None]:
from pyspark.ml.linalg import Vectors

# Dense vector: all three components are given explicitly.
v1 = Vectors.dense([3, 2, 1])
print(v1)

# Sparse vector of size 10: only positions 0 and 9 are given values
# (100 and 200 respectively).
v2 = Vectors.sparse(10, [0, 9], [100, 200])
print(v2)
# toArray() expands the sparse vector so every position is visible.
print(v2.toArray())

## TODO
# Declare a sparse vector that has a length of 100,
# with every 10th element filled with a (random) value.

## Split Dataset into Training & Testing
Run the following cell a few times and observe the train / test sets.
Each run will produce different data for train and test.

Q: How can we always get the same data for training and test?  
Hint: set the seed value to any integer:   
`df.randomSplit(weights, seed)`

In [None]:
def _report(title, frame):
    """Print a banner for *title*, then the row count of *frame*, then call frame.show()."""
    print(f"----{title} data set-----")
    print("count: ", frame.count())
    frame.show()


df = spark.range(1, 100)
df.show()

# No seed is passed to randomSplit, so repeated runs give different splits.
train, test = df.randomSplit([0.7, 0.3])
_report("training", train)
_report("testing", test)

# Rows that appear in both splits.
common = train.intersect(test)
_report("common", common)

In [None]:
# Load the admissions CSV, taking column names from the header row and
# letting Spark infer the column types.
dataset = spark.read.csv(
    "/data/college-admissions/admission-data.csv",
    header=True,
    inferSchema=True,
)

# 80/20 split; no seed is passed, so the split changes from run to run.
training, test = dataset.randomSplit([0.8, 0.2])

# Same report for each split: banner, row count, then the rows.
for title, part in (("training", training), ("testing", test)):
    print(f"----{title} data set-----")
    print("count: ", part.count())
    part.show()
    if title == "training":
        print()

## Vector Assemblers

In [None]:
from pyspark.ml.feature import VectorAssembler

# Read the admissions CSV with a header row and inferred column types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/data/college-admissions/admission-data.csv")
)
df.show()

# Combine the "gre", "gpa" and "rank" columns into one "features" column.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=["gre", "gpa", "rank"],
)
feature_vector = assembler.transform(dataset=df)
feature_vector.show(n=40)



## String Indexers

In [None]:
# Build a small pandas DataFrame: seven rows with an id and a color label.
import pandas as pd

df_pd = pd.DataFrame(
    {
        "id": list(range(1, 8)),
        "color": ["red", "white", "blue", "blue", "white", "yellow", "blue"],
    }
)
df_pd

In [None]:
# Turn the pandas DataFrame `df_pd` into a Spark DataFrame and display it.
df_spark = spark.createDataFrame(data=df_pd)
df_spark.show()

In [None]:
# Run the String Indexer on the "color" column.
from pyspark.ml.feature import IndexToString, StringIndexer

# Reads the "color" column and writes its index to "colorIndex".
str_indexer = StringIndexer(
    outputCol="colorIndex",
    inputCol="color",
)

# Fit on the data, then apply the fitted model to the same DataFrame.
model = str_indexer.fit(dataset=df_spark)
indexed = model.transform(dataset=df_spark)
indexed.show()



### Reverse String Indexer

In [None]:
# Imported here so this cell does not rely on the import made in the
# String Indexer cell.
from pyspark.ml.feature import IndexToString

# Convert the "colorIndex" column back into strings in "originalColor".
# NOTE: `indexed` must still be the String Indexer output that contains
# "colorIndex"; the One Hot Encoding section rebinds `indexed`, so run this
# cell before that one.
converter = IndexToString(inputCol="colorIndex", outputCol="originalColor")
converted = converter.transform(indexed)
converted.show()


## One Hot Encoding

In [None]:
# Build a pandas DataFrame of seven rows (id + marital status) and
# convert it to a Spark DataFrame.
import pandas as pd

df2_pd = pd.DataFrame(
    {
        "id": list(range(1, 8)),
        "status": [
            "married", "single", "single", "divorced",
            "married", "single", "married",
        ],
    }
)
df2_pd
df2_spark = spark.createDataFrame(data=df2_pd)
df2_spark.show()

In [None]:
# StringIndexer is imported here as well so the cell runs on its own.
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql.functions import exp  # not used in this cell

# Step 1: index the "status" strings into "statusIndex".
# NOTE: this rebinds `model` and `indexed` from the String Indexer section.
string_indexer = StringIndexer(inputCol="status", outputCol="statusIndex")
model = string_indexer.fit(df2_spark)
indexed = model.transform(df2_spark)
indexed.show()

# Step 2: one-hot encode "statusIndex" into "statusVector".
encoder = OneHotEncoder(inputCol="statusIndex", outputCol="statusVector", dropLast=False)
# In Spark 3.x OneHotEncoder is an Estimator and has to be fitted before it
# can transform; in Spark 2.x it is a plain Transformer with no fit().
# Support both so the cell runs on either version.
if hasattr(encoder, "fit"):
    encoded = encoder.fit(indexed).transform(indexed)
else:
    encoded = encoder.transform(indexed)
encoded.show()

# View the encoded rows in pandas
encoded_pd = encoded.toPandas()
print(encoded_pd)

## Standard Scaler

In [None]:
# Build a pandas DataFrame of home runs and salaries, convert it to Spark,
# and assemble both columns into a single "features" column.
# NOTE: this rebinds df_pd, df_spark, assembler and feature_vector used above.
import pandas as pd
from pyspark.ml.feature import VectorAssembler

df_pd = pd.DataFrame(
    {
        "home_runs": [30, 22, 17, 12, 44, 38, 40],
        "salary_in_k": [700, 450, 340, 250, 1200, 800, 950],
    }
)
df_pd
df_spark = spark.createDataFrame(data=df_pd)
df_spark.show()

assembler = VectorAssembler(
    outputCol="features",
    inputCols=["home_runs", "salary_in_k"],
)
feature_vector = assembler.transform(dataset=df_spark)
feature_vector.show(n=40)



In [None]:
from pyspark.ml.feature import StandardScaler

# Scale the "features" column into "scaled_features" with withStd=True and
# withMean=False.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaled_features",
)

# Fit on the assembled data, then transform that same data.
scalerModel = scaler.fit(dataset=feature_vector)
scaledData = scalerModel.transform(dataset=feature_vector)

# Show up to 10 rows without truncating the column contents.
scaledData.show(n=10, truncate=False)