In [1]:
from pyspark.sql import SparkSession 

spark = SparkSession.builder \
    .master('local[*]') \
    .appName("Intro") \
    .getOrCreate()



In [2]:
df = spark.createDataFrame([
    (0, "Hi I think pyspark is cool ","happy"),
    (1, "All I want is a pyspark cluster","indifferent"),
    (2, "I finally understand how ML works","fulfill"),
    (3, "Yet another sentence about pyspark and ML","indifferent"),
    (4, "Why didn't I know about mllib before","sad"),
    (5, "Yes, I can","happy")
], ["id", "sentence", "sentiment"])

In [3]:
df

DataFrame[id: bigint, sentence: string, sentiment: string]

# Start Featurization process

# Leverage algo to target Imbalanced Data: 
Similar featurization process as before, only now we have one label for every data entry

In [4]:
# stop words:
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer

remover = StopWordsRemover()
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tokenized = tokenizer.transform(df)

remover.setInputCol("words")
remover.setOutputCol("clean_words")

df = remover.transform(tokenized)

In [5]:
from pyspark.ml.feature import NGram

ngram = NGram(n=3, inputCol="clean_words", outputCol="ngrams")
test = ngram.transform(df)
test.show(5,truncate=False)

+---+-----------------------------------------+-----------+-------------------------------------------------+-------------------------------------+---------------------------------------------------------------------+
|id |sentence                                 |sentiment  |words                                            |clean_words                          |ngrams                                                               |
+---+-----------------------------------------+-----------+-------------------------------------------------+-------------------------------------+---------------------------------------------------------------------+
|0  |Hi I think pyspark is cool               |happy      |[hi, i, think, pyspark, is, cool]                |[hi, think, pyspark, cool]           |[hi think pyspark, think pyspark cool]                               |
|1  |All I want is a pyspark cluster          |indifferent|[all, i, want, is, a, pyspark, cluster]          |[want, pyspark, clu

In [6]:
df = test

In [7]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml import Pipeline

hashtf = HashingTF(numFeatures=2**16, inputCol="ngrams", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=2) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "sentiment", outputCol = "label")

# example of how pipelines can help us orchestrate the featurization! 
pipeline = Pipeline(stages=[hashtf, idf, label_stringIdx])
pipelineFit = pipeline.fit(test)
train_df = pipelineFit.transform(test)
train_df.select("features","label").show(5,truncate=False)

# now our train data is ready to be used to produce classifier

+-----------------------------------------+-----+
|features                                 |label|
+-----------------------------------------+-----+
|(65536,[16887,26010],[0.0,0.0])          |0.0  |
|(65536,[57587],[0.0])                    |1.0  |
|(65536,[34782,39758],[0.0,0.0])          |2.0  |
|(65536,[11730,34744,49304],[0.0,0.0,0.0])|1.0  |
|(65536,[],[])                            |3.0  |
+-----------------------------------------+-----+
only showing top 5 rows



# Using GBTClassifier with specifiying strategy

In [17]:
from pyspark.ml.classification import RandomForestClassifier

# Train a RandomForestClassifier model with a dedicate feature subset strategy
rf = RandomForestClassifier(labelCol="label", featuresCol="features",featureSubsetStrategy="log2")
model = rf.fit(train_df)

In [18]:
# Make predictions.
predictions = model.transform(train_df)

# Select example rows to display.
predictions

DataFrame[id: bigint, sentence: string, sentiment: string, words: array<string>, clean_words: array<string>, ngrams: array<string>, tf: vector, features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double]

In [19]:
predictions.printSchema()

root
 |-- id: long (nullable = true)
 |-- sentence: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- clean_words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ngrams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tf: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [20]:
predictions.select("rawPrediction","label","probability","prediction").show(5,truncate=True)

+--------------------+-----+--------------------+----------+
|       rawPrediction|label|         probability|prediction|
+--------------------+-----+--------------------+----------+
|[6.90649350649350...|  0.0|[0.34532467532467...|       1.0|
|[6.90649350649350...|  1.0|[0.34532467532467...|       1.0|
|[6.90649350649350...|  2.0|[0.34532467532467...|       1.0|
|[6.90649350649350...|  1.0|[0.34532467532467...|       1.0|
|[6.90649350649350...|  3.0|[0.34532467532467...|       1.0|
+--------------------+-----+--------------------+----------+
only showing top 5 rows

