### Feature Extraction Methods in Spark for Text Data and Numeric Data
___
___

In [None]:
# Importing relevant libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [None]:
# Obtain the Spark session used by the cells below.
spark = (
    SparkSession.builder
    .appName('Basic Feature Extraction')
    .getOrCreate()
)

In [None]:
# Build a small labelled text data set: one (label, sentence) pair per row.
sentenceData = spark.createDataFrame(
    [
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    ["label", "sentence"],
)

In [None]:
# Print the column names and types of the DataFrame created above.
sentenceData.printSchema()

### Text Feature Manipulations

In [None]:
# TF-IDF in three stages; the stages are configured first, then applied in order.

# Stage 1: split each sentence into a list of words.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
# Stage 2: hash the words into a 20-bucket term-frequency vector.
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
# Stage 3: rescale the raw term frequencies with inverse document frequency.
idf = IDF(inputCol="rawFeatures", outputCol='features')

wordsData = tokenizer.transform(sentenceData)
featurizedData = hashingTF.transform(wordsData)

# IDF is an estimator: it has to be fitted on the corpus before it can transform.
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Show the label next to its final TF-IDF feature vector.
rescaledData.select("label", "features").show()

### Word2Vec: words-to-vector method
___

In [None]:
from pyspark.ml.feature import Word2Vec

In [None]:
# Build the Word2Vec input: one row per document, each holding a list of words.
# Each tuple has a single element (note the trailing comma) so the DataFrame
# gets exactly one column, "text".
# Typos in the sample sentences ("HI", "Saprk", "regressions") are corrected.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "),),
    ("I wish java could use case classes".split(" "),),
    ("Logistic regression models are neat".split(" "),),
], ["text"])

In [None]:
# Learn a 3-dimensional embedding for every word (minCount=0 keeps all words),
# then transform each document in the "text" column into a "result" vector.
word2vect = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2vect.fit(documentDF)
result = model.transform(documentDF)

# Each collected row is a (text, result) pair; print the words with their vector.
for text, vector in result.collect():
    joined_words = ",  ".join(text)
    print("Text: [%s] => \n Vector: %s\n" % (joined_words, str(vector)))

### One-hot encoding
___

In [None]:
# The class is spelled `OneHotEncoder`; `oneHotEncoder` does not exist in
# pyspark.ml.feature and would raise ImportError.
from pyspark.ml.feature import OneHotEncoder

# Two columns of category indices (already numeric) to be one-hot encoded.
df = spark.createDataFrame([
    (0.0, 1.0),
    (1.0, 0.0),
    (2.0, 1.0),
    (0.0, 2.0),
    (0.0, 1.0),
    (2.0, 0.0),
], ["categoryIndex1", "categoryIndex2"])

# Several columns are encoded at once, so the plural `inputCols`/`outputCols`
# parameters are required; the singular forms accept a single column name only.
encoder = OneHotEncoder(
    inputCols=["categoryIndex1", "categoryIndex2"],
    outputCols=["categoryvec1", "categoryvec2"],
)

# Fit to learn the number of categories per column, then encode the data.
model = encoder.fit(df)
encoded = model.transform(df)

# Display the encoded DataFrame (the fitted model itself has no `show()`).
encoded.show()

### Standard Scaler 
___

In [None]:
from pyspark.ml.feature import StandardScaler
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one,
# so this cell can also be run on its own.
spark = (
    SparkSession.builder
    .appName('Basic Feature Extraction')
    .getOrCreate()
)

In [None]:
# Load the sample data set stored in LIBSVM format (path is relative to the
# working directory).
dataframe = (
    spark.read
    .format('libsvm')
    .load("data/mllib/sample_libsvm_data.txt")
)

# Scale each feature by its standard deviation (withStd=True) without
# subtracting the mean (withMean=False).
scaler = StandardScaler(
    inputCol='features',
    outputCol="scaledFeatures",
    withStd=True,
    withMean=False,
)

# Fitting computes the summary statistics; transforming applies the scaling.
scalerModel = scaler.fit(dataframe)
scaledData = scalerModel.transform(dataframe)
scaledData.show()