# Chapter 5: Bot-Or-Not Feature Extraction


In [1]:
from pyspark.sql import SparkSession 
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, LinearRegressionModel
from pyspark.ml.fpm import FPGrowth, FPGrowthModel


from pyspark.ml.evaluation import RegressionEvaluator

# Build (or reuse) a local SparkSession for this notebook.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("Pipelines")
    .getOrCreate()
)

In [2]:
# Load the parquet dataset stored at "classified_train_data" into a Spark DataFrame.
df_train = spark.read.parquet("classified_train_data")

In [4]:
# Replace nulls in the 'bot' column with 0; no other column is named in the fill map.
df_train = df_train.fillna({'bot':0})


In [6]:
# Read the description-only training set and replace null labels with 0.
data = (
    spark.read
    .parquet('train_data_only_description')
    .fillna({'label': 0})
)


In [18]:
from pyspark.ml.feature import Tokenizer

# Split each account description into a "words" array column.
tokenizer = Tokenizer(
    inputCol="description",
    outputCol="words",
)
wordsData = tokenizer.transform(data)

# Text Feature Extraction
## TF-IDF feature extraction
Leveraging text-processing techniques to try to extract meaningful features from the account descriptions.

In [19]:
# Inspect the distinct values of the label column before building features.
# NOTE(review): the recorded output lists 0, 1, 3, 10, 13 and 19, so the label
# is not strictly binary — confirm this is expected for a bot-or-not target.
wordsData.select('label').distinct().show()

+-----+
|label|
+-----+
|    1|
|   13|
|    3|
|   19|
|   10|
|    0|
+-----+



In [16]:
# Step 1 of TF-IDF: term frequencies computed with HashingTF.
# CountVectorizer is an alternative that also yields term-frequency vectors.

from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# numFeatures=20 produces 20-dimensional vectors (see the "(20,[...])" output).
hashingTF = HashingTF(
    inputCol="words",
    outputCol="frequencyFeatures",
    numFeatures=20,
)
featurizedData = hashingTF.transform(wordsData)

# Preview the first five label / term-frequency pairs.
(
    featurizedData
    .select("label", "frequencyFeatures")
    .show(5)
)

+-----+--------------------+
|label|   frequencyFeatures|
+-----+--------------------+
|    1|(20,[0,2,3,4,5,7,...|
|    0|(20,[3,13,16,17],...|
|    0|(20,[1,2,4,5,6,7,...|
|    0|(20,[0,1,4,5,7,8,...|
|    1|(20,[0,1,3,4,5,6,...|
+-----+--------------------+
only showing top 5 rows



In [17]:
# Step 2 of TF-IDF: fit an IDF model on the term-frequency vectors, then use it
# to rescale them into the final "features" column.

idf = IDF(
    inputCol="frequencyFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Preview the label / TF-IDF feature pairs (default row count).
(
    rescaledData
    .select("label", "features")
    .show()
)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|(20,[0,2,3,4,5,7,...|
|    0|(20,[3,13,16,17],...|
|    0|(20,[1,2,4,5,6,7,...|
|    0|(20,[0,1,4,5,7,8,...|
|    1|(20,[0,1,3,4,5,6,...|
|    0|(20,[5,11,15,16],...|
|    0|(20,[5,6,8,10,12,...|
|    0|(20,[2,6,8,12,16,...|
|    0|(20,[1,3,7,8,9,10...|
|   13|(20,[1,3,4,5,7,8,...|
|    1|(20,[1,4,5,7,8,9,...|
|    0|(20,[0,1,2,8,10,1...|
|    0|(20,[0,1,2,5,7,11...|
|    0|(20,[2,5,6,7,8,9,...|
|    0|(20,[0,4,5,9,11,1...|
|    0|(20,[3,10,15,16],...|
|    0|(20,[0,1,3,4,6,7,...|
|    0|(20,[0,1,3,4,5,8,...|
|    0|(20,[0,2,4,5,7,11...|
|    0|(20,[0,1,3,4,17,1...|
+-----+--------------------+
only showing top 20 rows



## N-Gram feature extraction

In [20]:
from pyspark.ml.feature import NGram

# Build 2-grams from the tokenized "words" column into an "ngrams" column.
ngram = NGram(
    n=2,
    inputCol="words",
    outputCol="ngrams",
)

ngramDataFrame = ngram.transform(wordsData)

# truncate=False prints each n-gram list in full rather than shortened.
(
    ngramDataFrame
    .select("ngrams")
    .show(truncate=False)
)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ngrams                                                                                                                                                                                                                                                                                                                                           |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [21]:
# Print the parameter help so its embedded newlines render as separate lines;
# as a bare expression the cell shows the string's repr with literal "\n" escapes.
print(ngram.explainParams())

'inputCol: input column name. (current: words)\nn: number of elements per n-gram (>=1) (default: 2, current: 2)\noutputCol: output column name. (default: NGram_2cf15b34fc22__output, current: ngrams)'