In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=e0a49aaa5160f3b48a71c636c5ff9dc81cba8bd2ac49beb322c758cf783a8956
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [9]:
# Import modules
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString, SQLTransformer
from pyspark.ml.classification import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType, BooleanType, DoubleType

In [7]:
# Define input path and constants
inputData = 'Lab09.csv'

In [10]:
# Load the data
# Create a DataFrame from Reviews.csv
spark = SparkSession.builder.getOrCreate()
reviews = spark.read.load(inputData,\
                     format="csv",\
                     header=True,\
                     inferSchema=True)

In [12]:
reviews.show(5)

+---+---------+------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+
| Id|ProductId|UserId|         ProfileName|HelpfulnessNumerator|HelpfulnessDenominator|Score|      Time|             Summary|                Text|
+---+---------+------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+
|  1|       B1|    A2|          delmartian|                   1|                     1|    5|1303862400|Good Quality Dog ...|I have bought sev...|
|  2|       B1|    A4|              dll pa|                   0|                     0|    4|1346976000|   Not as Advertised|"Product arrived ...|
|  3|       B1|    A5|"Natalia Corres "...|                   1|                     1|    1|1219017600|"""Delight"" says...|"This is a confec...|
|  4|       B2|    A1|                Karl|                   3|                     3|    3|1307923200|      Cough Me

In [13]:
# Select only the records with HelpfulnessDenominator>0 (i.e., rated reviews)
reviews = reviews.filter('HelpfulnessDenominator>0')

In [14]:
reviews.show(5)

+---+---------+------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+
| Id|ProductId|UserId|         ProfileName|HelpfulnessNumerator|HelpfulnessDenominator|Score|      Time|             Summary|                Text|
+---+---------+------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+
|  1|       B1|    A2|          delmartian|                   1|                     1|    5|1303862400|Good Quality Dog ...|I have bought sev...|
|  3|       B1|    A5|"Natalia Corres "...|                   1|                     1|    1|1219017600|"""Delight"" says...|"This is a confec...|
|  4|       B2|    A1|                Karl|                   3|                     3|    3|1307923200|      Cough Medicine|If you are lookin...|
|  9|       B4|    A4|            R. James|                   1|                     1|    5|1322006400|          Yay 

In [81]:
# Create and compute the value of Column label for the selected rated reviews
spark.udf.register('assignLabel', lambda num, den: 1.0 if (num/den)>0.9 else 0.0, DoubleType())
reviews.createOrReplaceTempView("reviews")
labeledDF = spark.sql('SELECT *, HelpfulnessNumerator/HelpfulnessDenominator as Ratio, \
                   assignLabel(HelpfulnessNumerator, HelpfulnessDenominator) AS label FROM reviews')

In [82]:
labeledDF.show(5)

+---+---------+------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+-----+-----+
| Id|ProductId|UserId|         ProfileName|HelpfulnessNumerator|HelpfulnessDenominator|Score|      Time|             Summary|                Text|Ratio|label|
+---+---------+------+--------------------+--------------------+----------------------+-----+----------+--------------------+--------------------+-----+-----+
|  1|       B1|    A2|          delmartian|                   1|                     1|    5|1303862400|Good Quality Dog ...|I have bought sev...|  1.0|  1.0|
|  3|       B1|    A5|"Natalia Corres "...|                   1|                     1|    1|1219017600|"""Delight"" says...|"This is a confec...|  1.0|  1.0|
|  4|       B2|    A1|                Karl|                   3|                     3|    3|1307923200|      Cough Medicine|If you are lookin...|  1.0|  1.0|
|  9|       B4|    A4|            R. James|   

In [83]:
# Split the dataframe with Column label in training and test set
df = labeledDF.select('Ratio','label')
(reviews_train, reviews_test) = df.randomSplit([0.75, 0.25], seed=10)

In [85]:
# Create/Define the preprocessing steps and the classification algorithm you want to use
# and the content of the pipeline that is used to train the model on reviews_train and apply it on reviews_test
# Implement a first solution with one single values in features: text length
from pyspark.ml.classification import DecisionTreeClassifier
assembler = VectorAssembler(inputCols=['Ratio'], outputCol='features')
dt = DecisionTreeClassifier()
dt.setImpurity('gini')
pipeline = Pipeline().setStages([assembler,dt])

In [86]:
# Fit/Train the model
model = pipeline.fit(reviews_train)

In [87]:
# Apply the model on the test set
predictions = model.transform(reviews_test).cache()

In [88]:
# Compute statistics
# Accuracy, F1, weighted recall, weighted precision
evaluatorAcc = MulticlassClassificationEvaluator(labelCol="label" , predictionCol= "prediction", metricName = "accuracy")
evaluatorF1 = MulticlassClassificationEvaluator(labelCol="label" , predictionCol= "prediction", metricName = "f1")
evaluatorRecall = MulticlassClassificationEvaluator(labelCol="label" , predictionCol= "prediction", metricName = "weightedRecall")
evaluatorPrecision = MulticlassClassificationEvaluator(labelCol="label" , predictionCol= "prediction", metricName = "weightedPrecision")

print("Accuracy:", evaluatorAcc.evaluate(predictions))
print("F1:", evaluatorF1.evaluate(predictions))
print("Weighted Recall:", evaluatorRecall.evaluate(predictions))
print("Weighted Precision:", evaluatorPrecision.evaluate(predictions))

Accuracy: 1.0
F1: 1.0
Weighted Recall: 1.0
Weighted Precision: 1.0


In [89]:
#  Compute the confusion matrix
#                     Predicted
#  Actual       Useful   Useless
#  Useful          A        B
#  Useless          C        D

A = predictions.filter("prediction=1 and label=1").count()
B = predictions.filter("prediction=0 and label=1").count()
C = predictions.filter("prediction=1 and label=0").count()
D = predictions.filter("prediction=0 and label=0").count()

print("                       Predicted")
print("  Actual \t Useful\tUseless")
print("  Useful \t "+str(A)+ "\t\t"+str(B))
print("  Useless \t "+str(C)+ "\t\t"+str(D))

                       Predicted
  Actual 	 Useful	Useless
  Useful 	 3		0
  Useless 	 0		0


In [91]:
# Precision and recall for the two classes
# Useful
if A+C==0:
    print("Precision(Useful): undefined")
else:
    print("Precision(Useful):"+str(A/(A+C)))


print("Recall(Useful):"+str(A/(A+B)))

# Useless
if B+D==0:
    print("Precision(Useless): undefined")
else:
    print("Precision(Useless):"+str(D/(B+D)))


Precision(Useful):1.0
Recall(Useful):1.0
Precision(Useless): undefined
