# ML Basics

## Vectors

In [None]:
from pyspark.ml.linalg import Vectors

# Dense vector built from the three values 3, 2, 1.
v1 = Vectors.dense([3, 2, 1])
print(v1)

# Sparse vector of size 10: index 0 holds 100 and index 9 holds 200.
v2 = Vectors.sparse(10, {0: 100, 9: 200})
print(v2)
# Print the same vector again, converted to a full array.
print(v2.toArray())

## TODO
# Declare a sparse vector that has a length of 100,
# with every 10th element filled with a (random) value.

## Split Dataset into Training & Testing
Run the following cell a few times and observe the train / test sets.
Each run will produce different data for train and test.

Q: How can we always get the same data for training and testing?
Hint: Set the seed value to any integer.  
`df.randomSplit(weights, seed)`

In [None]:
def _report(banner, frame):
    """Print a banner line, then the row count of `frame`, then show it."""
    print(banner)
    print("count: ", frame.count())
    frame.show()


df = spark.range(1, 100)
df.show()

# No seed is passed, so every run of this cell yields a different split.
train, test = df.randomSplit([0.7, 0.3])
_report("----training data set-----", train)
_report("----testing data set-----", test)

# Rows present in both splits.
common = train.intersect(test)
_report("----common data set-----", common)

In [None]:
# Load the admissions CSV, using its first row as the header and
# letting Spark infer the column types.
dataset = spark.read.csv(
    "/data/college-admissions/admission-data.csv",
    header=True,
    inferSchema=True,
)

# Split with weights 0.8 / 0.2; no seed is given.
training, test = dataset.randomSplit([0.8, 0.2])

# Report each split the same way: banner, row count, then the rows.
for banner, split in (
    ("----training data set-----", training),
    ("----testing data set-----", test),
):
    print(banner)
    print("count: ", split.count())
    split.show()

## Correlation Matrix

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

# Four feature vectors of size 4, mixing sparse and dense representations.
feature_vectors = [
    Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),
    Vectors.dense([4.0, 5.0, 0.0, 3.0]),
    Vectors.dense([6.0, 7.0, 0.0, 8.0]),
    Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),
]
# Each DataFrame row is a 1-tuple holding a single "features" vector.
data = [(vec,) for vec in feature_vectors]

df = spark.createDataFrame(data, ["features"])
df.show()

# Correlation with no method argument (labelled Pearson in the output below).
r1 = Correlation.corr(df, "features").head()
print(f"Pearson correlation matrix:\n{r1[0]}")

# Same computation, explicitly requesting the Spearman method.
r2 = Correlation.corr(df, "features", "spearman").head()
print(f"Spearman correlation matrix:\n{r2[0]}")

## Hypothesis Testing

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest

# Each row pairs a label with a two-element feature vector.
data = [
    (0.0, Vectors.dense([0.5, 10.0])),
    (0.0, Vectors.dense([1.5, 20.0])),
    (1.0, Vectors.dense([1.5, 30.0])),
    (0.0, Vectors.dense([3.5, 30.0])),
    (0.0, Vectors.dense([3.5, 40.0])),
    (1.0, Vectors.dense([3.5, 40.0])),
]
df = spark.createDataFrame(data, ["label", "features"])
df.show()

# Run the chi-square test of the features against the label and
# take the single result row.
r = ChiSquareTest.test(df, "features", "label").head()
print(f"pValues: {r.pValues}")
print(f"degreesOfFreedom: {r.degreesOfFreedom}")
print(f"statistics: {r.statistics}")