In [54]:
import pyspark
from pyspark.sql import SQLContext
sc = pyspark.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
import pandas as pd
import re
from pyspark.sql import SparkSession
from snorkel.labeling import PandasLFApplier,LFAnalysis
spark = SparkSession.builder.appName('pandasToSparkDF').getOrCreate()
pd_dev = pd.read_csv("review_dev_labelled.csv",header=0, index_col=0)
df_dev = spark.createDataFrame(pd_dev)

In [55]:
#Try to apply Labeling function
import pyspark.sql.functions as F
from snorkel.labeling import LabelModel
from snorkel.labeling.apply.spark import SparkLFApplier
import pandas as pd
import numpy as np
#from snorkel.labeling import ,LFAnalysis
from snorkel.labeling import LFAnalysis
from pyspark.sql import Row
from snorkel.labeling.lf import labeling_function
#from snorkel.labeling.lf.nlp_spark import spark_nlp_labeling_function
from snorkel.preprocess import preprocessor

ABSTAIN = -1
NEGATIVE = 0
POSITIVE = 1

## Pre-processing

In [56]:
dataset = df_dev.select("cool", "funny", "stars", "useful", "label")
cols = dataset.columns
dataset.show()

+----+-----+-----+------+-----+
|cool|funny|stars|useful|label|
+----+-----+-----+------+-----+
|   1|    0|    2|     0|    1|
|   0|    0|    2|     0|    1|
|   2|    0|    4|     4|    0|
|   0|    0|    5|     0|    0|
|   0|    0|    3|     0|    1|
|   1|    0|    4|     2|    0|
|   0|    0|    4|     1|    0|
|   1|    1|    2|     2|    1|
|   0|    0|    5|     3|    0|
|   0|    0|    4|     0|    0|
|   0|    0|    5|     0|    0|
|   0|    0|    5|     0|    0|
|   0|    0|    5|     0|    0|
|   0|    0|    4|     0|    1|
|   0|    0|    4|     0|    1|
|   1|    1|    1|     1|    1|
|   0|    0|    5|     0|    0|
|   0|    0|    5|     0|    0|
|   0|    0|    4|     1|    0|
|   0|    0|    5|     1|    0|
+----+-----+-----+------+-----+
only showing top 20 rows



In [57]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
categoricalColumns = ["cool", "funny", "stars", "useful"]
stages = [] # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    # encoder = OneHotEncoderEstimator(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

In [58]:
# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="label", outputCol="label1")
stages += [label_stringIdx]

In [59]:
# Transform all features into a vector using VectorAssembler
#numericCols = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
assemblerInputs = [c + "classVec" for c in categoricalColumns]# + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [60]:
partialPipeline = Pipeline().setStages(stages)
pipelineModel = partialPipeline.fit(dataset)
preppedDataDF = pipelineModel.transform(dataset)

In [61]:
selectedcols = ["label1", "features"] + cols
dataset = preppedDataDF.select(selectedcols)

In [62]:
### Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=123)
print(trainingData.count())
print(testData.count())

348
152


## Logistic Regression

In [63]:
from pyspark.ml.classification import LogisticRegression
# Create initial LogisticRegression model
lr = LogisticRegression(labelCol="label1", featuresCol="features", maxIter=10)

# Train model with Training Data
lrModel = lr.fit(trainingData)

In [64]:
# Make predictions on test data using the transform() method.
# LogisticRegression.transform() will only use the 'features' column.
predictions = lrModel.transform(testData)

In [65]:
selected = predictions.select("label1", "prediction", "funny", "stars")
display(selected)

DataFrame[label1: double, prediction: double, funny: bigint, stars: bigint]

In [66]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print(evaluator.evaluate(predictions))
print(evaluator.getMetricName())

0.8771006463527239
areaUnderROC


In [67]:
# This is the explanation of all the parameters in case we want to tune them
#print(lr.explainParams())

## Decision Tree Classifier

In [68]:
from pyspark.ml.classification import DecisionTreeClassifier

# Create initial Decision Tree Model
dt = DecisionTreeClassifier(labelCol="label1", featuresCol="features", maxDepth=3)

# Train model with Training Data
dtModel = dt.fit(trainingData)

# Print details of the model
print("numNodes = ", dtModel.numNodes)
print("depth = ", dtModel.depth)
display(dtModel)

numNodes =  9
depth =  3


DecisionTreeClassificationModel (uid=DecisionTreeClassifier_507e36d241f1) of depth 3 with 9 nodes

In [69]:
# Make predictions on test data using the Transformer.transform() method.
predictions = dtModel.transform(testData)

# Print the metric and the score
print(evaluator.getMetricName())
print(evaluator.evaluate(predictions))

areaUnderROC
0.8995383194829177


### Tune Hyperparameters of the Decision Tree

In [46]:
# Get default impurity measure
dt.getImpurity()

# This is in case we want to change to entropy and test it
#dt.setImpurity("Entropy")
#dtModel = dt.fit(trainingData)
#predictions = dtModel.transform(testData)
#print(evaluator.getMetricName())
#print(evaluator.evaluate(predictions))

'gini'

In [47]:
# Create ParamGrid for Cross Validation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1, 2, 6, 10])
             .addGrid(dt.maxBins, [20, 40, 80])
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Run cross validations
cvModel = cv.fit(trainingData)

# Print the details of the best model
print("numNodes = ", cvModel.bestModel.numNodes)
print("depth = ", cvModel.bestModel.depth)

numNodes =  5
depth =  2


In [48]:
# Use test set to measure the accuracy of our model on new data
predictions = cvModel.transform(testData)

# cvModel uses the best model found from the Cross Validation
# Evaluate best model
print(evaluator.getMetricName())
print(evaluator.evaluate(predictions))

areaUnderROC
0.897045244690674


In [30]:
# Check whats happening when you apply it on the training data
pr = cvModel.transform(trainingData)

evaluator.evaluate(pr)

0.8927702004201821

## Random Forest Classifier

In [70]:
from pyspark.ml.classification import RandomForestClassifier

# Create an initial RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

# Train model with Training Data
rfModel = rf.fit(trainingData)

In [71]:
# Make predictions on test data using the Transformer.transform() method.
predictions = rfModel.transform(testData)

# Print the metric and the score
print(evaluator.getMetricName())
print(evaluator.evaluate(predictions))

areaUnderROC
0.9005540166204986


### Tuning Hyperparameters of Random Forest Classifier

In [72]:
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [2, 4, 6])
             .addGrid(rf.maxBins, [20, 60])
             .addGrid(rf.numTrees, [5, 20])
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Run cross validations
cvModel = cv.fit(trainingData)

In [74]:
# Use test set here so we can measure the accuracy of our model on new data
predictions = cvModel.transform(testData)

# cvModel uses the best model found from the Cross Validation
# Evaluate best model
evaluator.evaluate(predictions)

0.907202216066482

In [75]:
# Check whats happening when you apply it on the training data
pr = cvModel.transform(trainingData)

evaluator.evaluate(pr)

0.9248507686664224

In [82]:
# Feature Importances
## WARNING! The columns are one-hot encoded therefore it does not make sense now
list(zip(cols, rfModel.featureImportances))

[('cool', 0.010644985107737726),
 ('funny', 0.025520850177026804),
 ('stars', 0.00419381054476834),
 ('useful', 0.01384711783173082),
 ('label', 0.008741266700017783)]

In [79]:
rfModel.featureImportances

SparseVector(36, {0: 0.0106, 1: 0.0255, 2: 0.0042, 3: 0.0138, 4: 0.0087, 5: 0.0026, 6: 0.0013, 7: 0.0007, 8: 0.0036, 10: 0.012, 11: 0.0108, 12: 0.0188, 13: 0.0057, 14: 0.0071, 15: 0.0013, 16: 0.0032, 17: 0.0055, 18: 0.0026, 19: 0.4271, 20: 0.056, 21: 0.0932, 22: 0.1832, 23: 0.0279, 24: 0.0158, 25: 0.0198, 26: 0.0006, 27: 0.0118, 28: 0.0018, 29: 0.0142, 30: 0.0101, 33: 0.0005})