# Preprocessing

## Normalising datasets

In [None]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

# Build a small demo DataFrame: an integer id plus a dense 3-element feature
# vector per row. The column names are passed as the schema (second argument).
features_df = spark.createDataFrame(
    [
        (1, Vectors.dense([10.0, 10000.0, 1.0])),
        (2, Vectors.dense([20.0, 30000.0, 2.01])),
        (3, Vectors.dense([30.0, 40000.0, 3.01])),
    ],
    ["id", "features"],
)

# Print the first row of the DataFrame. An explicit print is needed because a
# bare expression in the middle of a cell is not displayed.
print(features_df.take(1))

# Fit a min-max scaler on the "features" column and write the rescaled vectors
# to a new "sfeatures" column (the scaler's default output range is [0, 1]).
feature_scaler = MinMaxScaler(inputCol="features", outputCol="sfeatures")
smodel = feature_scaler.fit(features_df)
sfeatures_df = smodel.transform(features_df)

# Look at the whole dataset: original and scaled vectors side by side.
sfeatures_df.select("features", "sfeatures").show()

## Standardisation
Standardisation rescales each feature to mean = 0 and variance = 1.

In [None]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors

# Standardise the "features" vectors: centre each feature on its mean
# (withMean) and scale it to unit standard deviation (withStd). The result is
# written to the "sfeatures" column.
feature_stand_scaler = StandardScaler(
    inputCol="features",
    outputCol="sfeatures",
    withMean=True,
    withStd=True,
)

# Learn the per-feature statistics, then apply them to the same DataFrame.
stand_smodel = feature_stand_scaler.fit(features_df)
stand_sfeatures_df = stand_smodel.transform(features_df)

# Fetch the first row (the value is not used here), then display the table.
stand_sfeatures_df.take(1)
stand_sfeatures_df.show()

## Bucketize

In [None]:
from pyspark.ml.feature import Bucketizer
# Bucket boundaries: five split points define four buckets, with open-ended
# first and last buckets so every value falls into one of them.
splits = [float("-inf"), -10.0, 0.0, 10.0, float("inf")]

# One single-column row per sample value.
b_data = [(value,) for value in (-800.0, -10.5, -1.7, 0.0, 8.2, 90.1)]
b_df = spark.createDataFrame(b_data, ["features"])

# Map each value in "features" to the index of its bucket in "bfeatures".
bucketizer = Bucketizer(
    splits=splits,
    inputCol="features",
    outputCol="bfeatures",
)
bucketed_df = bucketizer.transform(b_df)
bucketed_df.show()

## TF-IDF
### Tokeniser

In [None]:
from pyspark.ml.feature import Tokenizer

# Small corpus of example sentences, one per row, keyed by an integer id.
sentences_df = spark.createDataFrame(
    [
        (1, "This is an introduction to Spark MLlib"),
        (2, "MLlib includes libraries for classification and regression"),
        (3, "It also contains supporting tools for pipelines"),
    ],
    ["id", "sentence"],
)

# Split each sentence into a list of tokens stored in the "words" column.
sent_token = Tokenizer(inputCol="sentence", outputCol="words")
sent_tokenized_df = sent_token.transform(sentences_df)
sent_tokenized_df.show()

### TF - IDF

In [None]:
from pyspark.ml.feature import HashingTF, IDF

# Term frequencies: hash each token in "words" into one of numFeatures slots.
# numFeatures is the number of features (hash buckets) you want to keep track of.
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)

sent_hfTF_df = hashingTF.transform(sent_tokenized_df)

# The IDF input column must match the HashingTF output column name exactly,
# including case, otherwise the schema check in fit() can fail.
idf = IDF(inputCol="rawFeatures", outputCol="idf_features")
idfModel = idf.fit(sent_hfTF_df)

# Adds "idf_features": the term-frequency vector rescaled by the learned IDF
# weights. The raw term frequencies stay available in "rawFeatures".
tfidf_df = idfModel.transform(sent_hfTF_df)

You get the term-frequency (TF) vector and the IDF-scaled vector in two separate columns.

You will probably need to do some further work to get the TF-IDF score for each individual word: https://www.geeksforgeeks.org/understanding-tf-idf-term-frequency-inverse-document-frequency/

# Clustering

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

# Combine the three numeric input columns into a single "features" vector
# column. NOTE(review): cluster_df is not created in the cells shown here; it
# is assumed to be loaded earlier with columns col1, col2 and col3.
vectorAssembler = VectorAssembler(
    inputCols=["col1", "col2", "col3"],
    outputCol="features",
)
vcluster_df = vectorAssembler.transform(cluster_df)
vcluster_df.show()

The vector assembler creates a single feature-vector column, which is the input the k-means algorithm works with.

## Kmeans clustering model

In [None]:
# Configure k-means with 3 clusters and a fixed seed, then fit it on the
# assembled feature vectors.
kmeans = KMeans().setK(3).setSeed(1)
kmodel = kmeans.fit(vcluster_df)

Choose the number of clusters, set the seed, then fit the data with k-means. You may notice a couple of warning messages here. They only indicate that the BLAS library (a basic linear algebra library) could not be loaded. This has no effect on the outcome; BLAS is only used to speed up some linear algebra operations.

In [None]:
# Retrieve the fitted cluster centres and display them as the cell output.
centers = kmodel.clusterCenters()
centers

## Hierarchical Clustering

In [None]:
from pyspark.ml.clustering import BisectingKMeans
# Configure bisecting k-means with 3 clusters and a fixed seed.
bkmeans = BisectingKMeans().setK(3).setSeed(1)

# Fit on the same assembled feature vectors and display the cluster centres.
bkmodel = bkmeans.fit(vcluster_df)
bkcenters = bkmodel.clusterCenters()
bkcenters