In [2]:
from pyspark.sql import SparkSession, Row
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.mllib.evaluation import BinaryClassificationMetrics

from freamon.adapters.pyspark.provenance import trace_provenance

spark = SparkSession.builder \
    .master("local") \
    .getOrCreate()


with trace_provenance() as tr:
    
    products = tr.read_csv(spark, "datasets/reviews/products.csv")
    categories = tr.read_csv(spark, "datasets/reviews/categories.csv")

    filtered_products = products \
      .filter(lambda row: 'playstation' in row['product_title'].lower()) \
      .select(['product_id', 'product_title', 'category_id'])\
      .withColumn('title_length', lambda row: len(row['product_title']))

    products_with_categories = filtered_products \
      .join(categories, left_on='category_id', right_on='id') \
      .select(['product_title', 'category', 'title_length', 'category_id'])

    train, test = products_with_categories.randomSplit(0.8, 42)

    train.cache()
    test.cache()

    train = train.withColumn('label', lambda row: int(len(row['product_title']) >= 30))
    test = train.withColumn('label', lambda row: int(len(row['product_title']) < 30))


    tokenizer = Tokenizer(inputCol="product_title", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), numFeatures=100, outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = tr.make_pipeline(stages=[tokenizer, hashingTF, lr])

    model = pipeline.fit(train)
    predictions = model.transform(test)

22/07/03 06:47:13 WARN Utils: Your hostname, MacBook-Pro-4.local resolves to a loopback address: 127.0.0.1; using 192.168.178.19 instead (on interface en0)
22/07/03 06:47:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/07/03 06:47:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/07/03 06:47:21 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , product_id, product_parent, product_title, category_id
 Schema: _c0, product_id, product_parent, product_title, category_id
Expected: _c0 but found: 
CSV file: file:///Users/ssc/projects/freamon-opt/datasets/reviews/products.csv


[Stage 2:>                                                          (0 + 1) / 2]

22/07/03 06:47:23 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , id, category
 Schema: _c0, id, category
Expected: _c0 but found: 
CSV file: file:///Users/ssc/projects/freamon-opt/datasets/reviews/categories.csv


                                                                                

In [3]:
predictions.select(['probability', 'prediction']).collect()

[Row(probability=DenseVector([0.9979, 0.0021]), prediction=0.0),
 Row(probability=DenseVector([0.998, 0.002]), prediction=0.0),
 Row(probability=DenseVector([0.998, 0.002]), prediction=0.0),
 Row(probability=DenseVector([0.998, 0.002]), prediction=0.0),
 Row(probability=DenseVector([0.998, 0.002]), prediction=0.0),
 Row(probability=DenseVector([0.9979, 0.0021]), prediction=0.0),
 Row(probability=DenseVector([0.0013, 0.9987]), prediction=1.0),
 Row(probability=DenseVector([0.0019, 0.9981]), prediction=1.0),
 Row(probability=DenseVector([0.998, 0.002]), prediction=0.0),
 Row(probability=DenseVector([0.0016, 0.9984]), prediction=1.0),
 Row(probability=DenseVector([0.0024, 0.9976]), prediction=1.0),
 Row(probability=DenseVector([0.0032, 0.9968]), prediction=1.0),
 Row(probability=DenseVector([0.0018, 0.9982]), prediction=1.0),
 Row(probability=DenseVector([0.0015, 0.9985]), prediction=1.0),
 Row(probability=DenseVector([0.0017, 0.9983]), prediction=1.0),
 Row(probability=DenseVector([0.001

In [5]:
from freamon.adapters.pyspark.provenance import SingletonProvStore

SingletonProvStore().test_features_and_predictions.toPandas()

Unnamed: 0,features,label,probability,prediction
0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,"[0.9979020985377153, 0.0020979014622847236]",0.0
1,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,"[0.9979848318675658, 0.0020151681324341864]",0.0
2,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,"[0.9979848318675658, 0.0020151681324341864]",0.0
3,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,"[0.9979848318675658, 0.0020151681324341864]",0.0
4,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,"[0.9979848318675658, 0.0020151681324341864]",0.0
5,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,"[0.9979020985377153, 0.0020979014622847236]",0.0
6,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",0,"[0.0012749951337742667, 0.9987250048662257]",1.0
7,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",0,"[0.0019010657428531691, 0.9980989342571468]",1.0
8,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,"[0.9979848318675658, 0.0020151681324341864]",0.0
9,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,"[0.0016183673312031048, 0.9983816326687969]",1.0


22/07/03 07:46:17 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1086630 ms exceeds timeout 120000 ms
22/07/03 07:46:18 WARN SparkContext: Killing executors is not supported by current scheduler.
