### **Create Vectors**

In [2]:
from pyspark.ml.linalg import Vectors

# Dense representation: all three entries are stored, zeros included.
dv = Vectors.dense([1.0, 0.0, 3.0])

# Sparse representation of the same vector: size 3, with only the
# non-zero entries given as an {index: value} mapping.
sv = Vectors.sparse(3, {0: 1.0, 2: 3.0})

In [3]:
dv

DenseVector([1.0, 0.0, 3.0])

In [4]:
sv

SparseVector(3, {0: 1.0, 2: 3.0})

### **Create Matrices**

In [5]:
from pyspark.ml.linalg import Matrices

# Dense matrix with 2 rows and 3 columns. The values are listed column by
# column, so the list below encodes:
#   3.0  0.0  0.0
#   1.0  1.5  2.0
dm = Matrices.dense(2, 3, [3.0, 1.0, 0.0, 1.5, 0.0, 2.0])

# The same matrix in sparse form: only the non-zero entries are stored.
sm = Matrices.sparse(
    2, 3,
    [0, 2, 3, 4],    # column pointers: column j owns entries ptr[j] .. ptr[j+1]-1
    [0, 1, 1, 1],    # row index of each stored entry
    [3, 1, 1.5, 2],  # stored values, in column order
)

In [6]:
dm

DenseMatrix(2, 3, [3.0, 1.0, 0.0, 1.5, 0.0, 2.0], False)

In [7]:
sm

SparseMatrix(2, 3, [0, 2, 3, 4], [0, 1, 1, 1], [3.0, 1.0, 1.5, 2.0], False)

## **Vector Assembler**
VectorAssembler is a transformer that combines a given list of columns into a single vector column.

In [None]:
# NOTE(review): Vectors is not used in this cell, and this import comes from
# the RDD-based pyspark.mllib package rather than pyspark.ml -- confirm whether
# it is still needed.
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Path of the input CSV file.
# (The closing quote used to be a typographic quote, which made this cell
# fail with a SyntaxError.)
inputPath = "data/exampleDataAssembler.csv"

# Create a DataFrame from the input data: the first line of the file is the
# header and the column types are inferred from the content.
# Assumes `spark` is an active SparkSession provided by the environment.
inputDF = spark.read.load(
    inputPath, format="csv", header=True, inferSchema=True
)

# Create a VectorAssembler that combines columns colB and colC.
# The new vector column is called features.
myVectorAssembler = VectorAssembler(
    inputCols=["colB", "colC"], outputCol="features"
)

# Apply myVectorAssembler on the input DataFrame
transformedDF = myVectorAssembler.transform(inputDF)

## **Standard Scaler**
StandardScaler is an Estimator that returns a Transformer. StandardScalerModel transforms a vector column of an input DataFrame normalizing each “feature” of the input vector column to have unit standard deviation and/or zero mean.

In [None]:
# NOTE(review): Vectors is not used in this cell, and this import comes from
# the RDD-based pyspark.mllib package rather than pyspark.ml -- confirm whether
# it is still needed.
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

# Path of the input CSV file.
# (The closing quote used to be a typographic quote, which made this cell
# fail with a SyntaxError.)
inputPath = "data/exampleDataAssembler.csv"

# Create a DataFrame from the input data: the first line of the file is the
# header and the column types are inferred from the content.
# Assumes `spark` is an active SparkSession provided by the environment.
inputDF = spark.read.load(
    inputPath, format="csv", header=True, inferSchema=True
)

# Create a VectorAssembler that combines columns colB and colC.
# The new vector column is called features.
myVectorAssembler = VectorAssembler(
    inputCols=["colB", "colC"], outputCol="features"
)

# Apply myVectorAssembler on the input DataFrame
transformedDF = myVectorAssembler.transform(inputDF)

# Create a StandardScaler to scale the content of features; the scaled
# vectors are written to the new column scaledFeatures.
myScaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics by fitting the StandardScaler.
# Before normalizing the content of the data we need to compute the mean and
# standard deviation of the analyzed data.
scalerModel = myScaler.fit(transformedDF)

# Apply the fitted scalerModel on the input column features
scaledDF = scalerModel.transform(transformedDF)

## **String Indexer**
Frequently the input data are characterized by categorical attributes. The Spark MLlib classification and regression algorithms work only with numerical values. Categorical columns must be mapped to double values.

StringIndexerModel encodes a string column of “labels” to a column of “label indices”.


In [8]:
# NOTE(review): Vectors is not used in this cell, and this import comes from
# the RDD-based pyspark.mllib package rather than pyspark.ml -- confirm whether
# it is still needed.
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import StringIndexer

# Small example DataFrame: a numeric id and a string category per row.
# Assumes `spark` is an active SparkSession provided by the environment.
df = spark.createDataFrame(
    [(1, "a"), (2, "b"), (3, "c"), (4, "c"), (5, "a")],
    ["id", "category"],
)

# Estimator that encodes the string column category as numeric label
# indices, stored in the new column categoryIndex.
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")

# Fitting analyzes the data and learns the mapping string -> index.
indexerModel = indexer.fit(df)

# The fitted model applies that mapping to the category column.
indexedDF = indexerModel.transform(df)

In [11]:
df.show()

+---+--------+
| id|category|
+---+--------+
|  1|       a|
|  2|       b|
|  3|       c|
|  4|       c|
|  5|       a|
+---+--------+



In [13]:
indexedDF.show()

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  1|       a|          0.0|
|  2|       b|          2.0|
|  3|       c|          1.0|
|  4|       c|          1.0|
|  5|       a|          0.0|
+---+--------+-------------+



## IndexToString
IndexToString, which is symmetrical to StringIndexer, is a Transformer that maps a column of “label indices” back to a column containing the original “labels” as strings.

In [1]:
# NOTE(review): Vectors is not used in this cell, and this import comes from
# the RDD-based pyspark.mllib package rather than pyspark.ml -- confirm whether
# it is still needed.
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import IndexToString

# Small example DataFrame: a numeric id and a string category per row.
# Assumes `spark` is an active SparkSession provided by the environment.
df = spark.createDataFrame(
    [(1, "a"), (2, "b"), (3, "c"), (4, "c"), (5, "a")],
    ["id", "category"],
)

# Estimator that encodes the string column category as numeric label
# indices, stored in the new column categoryIndex.
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")

# Fitting analyzes the data and learns the mapping string -> index.
indexerModel = indexer.fit(df)

# The fitted model applies that mapping to the category column.
indexedDF = indexerModel.transform(df)

# Transformer for the reverse direction: it maps the numeric column
# categoryIndex back to strings in the new column originalCategory, using
# the labels learned by indexerModel.
converter = IndexToString(
    inputCol="categoryIndex",
    outputCol="originalCategory",
    labels=indexerModel.labels,
)

# Apply converter on the input column categoryIndex
reconvertedDF = converter.transform(indexedDF)

In [2]:
df.show()

+---+--------+
| id|category|
+---+--------+
|  1|       a|
|  2|       b|
|  3|       c|
|  4|       c|
|  5|       a|
+---+--------+



In [4]:
indexedDF.show()

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  1|       a|          0.0|
|  2|       b|          2.0|
|  3|       c|          1.0|
|  4|       c|          1.0|
|  5|       a|          0.0|
+---+--------+-------------+



In [3]:
reconvertedDF.show()

+---+--------+-------------+----------------+
| id|category|categoryIndex|originalCategory|
+---+--------+-------------+----------------+
|  1|       a|          0.0|               a|
|  2|       b|          2.0|               b|
|  3|       c|          1.0|               c|
|  4|       c|          1.0|               c|
|  5|       a|          0.0|               a|
+---+--------+-------------+----------------+

