In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.mllib.evaluation import BinaryClassificationMetrics

from freamon.provenance.pyspark.provenance import trace_provenance

spark = SparkSession.builder \
    .master("local") \
    .getOrCreate()


with trace_provenance() as tr:
    
    products = tr.read_csv(spark, "datasets/reviews/products.csv")
    categories = tr.read_csv(spark, "datasets/reviews/categories.csv")

    filtered_products = products\
      .filter(lambda row: 'playstation' in row['product_title'].lower())\
      .select(['product_id', 'product_title', 'category_id'])\
      .withColumn('title_length', lambda row: len(row['product_title']))

    products_with_categories = filtered_products\
      .join(categories, left_on='category_id', right_on='id')\
      .select(['product_title', 'category', 'title_length', 'category_id'])

    train, test = products_with_categories.randomSplit(0.8, 42)

    train.cache()
    test.cache()

    train = train.withColumn('label', lambda row: int(len(row['product_title']) >= 30))
    test = train.withColumn('label', lambda row: int(len(row['product_title']) < 30))

    print(train.count())
    print(test.count())

    tokenizer = Tokenizer(inputCol="product_title", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), numFeatures=100, outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = tr.make_pipeline(stages=[tokenizer, hashingTF, lr])

    model = pipeline.fit(train)
    predictions = model.transform(test)

22/06/30 16:41:42 WARN Utils: Your hostname, MacBook-Pro-4.local resolves to a loopback address: 127.0.0.1; using 192.168.178.19 instead (on interface en0)
22/06/30 16:41:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/06/30 16:41:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/30 16:41:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , product_id, product_parent, product_title, category_id
 Schema: _c0, product_id, product_parent, product_title, category_id
Expected: _c0 but found: 
CSV file: file:///Users/ssc/projects/freamon-opt/datasets/reviews/products.csv


[Stage 2:>                                                          (0 + 1) / 2]

22/06/30 16:41:49 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , id, category
 Schema: _c0, id, category
Expected: _c0 but found: 
CSV file: file:///Users/ssc/projects/freamon-opt/datasets/reviews/categories.csv
16


                                                                                

16


In [2]:
predictions.select(['probability', 'prediction']).collect()

[Row(probability=DenseVector([0.9979, 0.0021]), prediction=0.0),
 Row(probability=DenseVector([0.998, 0.002]), prediction=0.0),
 Row(probability=DenseVector([0.998, 0.002]), prediction=0.0),
 Row(probability=DenseVector([0.998, 0.002]), prediction=0.0),
 Row(probability=DenseVector([0.998, 0.002]), prediction=0.0),
 Row(probability=DenseVector([0.9979, 0.0021]), prediction=0.0),
 Row(probability=DenseVector([0.0013, 0.9987]), prediction=1.0),
 Row(probability=DenseVector([0.0019, 0.9981]), prediction=1.0),
 Row(probability=DenseVector([0.998, 0.002]), prediction=0.0),
 Row(probability=DenseVector([0.0016, 0.9984]), prediction=1.0),
 Row(probability=DenseVector([0.0024, 0.9976]), prediction=1.0),
 Row(probability=DenseVector([0.0032, 0.9968]), prediction=1.0),
 Row(probability=DenseVector([0.0018, 0.9982]), prediction=1.0),
 Row(probability=DenseVector([0.0015, 0.9985]), prediction=1.0),
 Row(probability=DenseVector([0.0017, 0.9983]), prediction=1.0),
 Row(probability=DenseVector([0.001

In [3]:
from freamon.provenance.pyspark.provenance import SingletonProvStore

SingletonProvStore().sources[0].collect()

                                                                                

[(Row(_c0='0', product_id='B013PURRZW', product_parent='603406193', product_title='Madden NFL 16 - Xbox One Digital Code', category_id='0'),
  {(2, 0)}),
 (Row(_c0='1', product_id='B00F4CEHNK', product_parent='341969535', product_title='Xbox Live Gift Card', category_id='0'),
  {(2, 1)}),
 (Row(_c0='2', product_id='B00DNHLFQA', product_parent='951665344', product_title='Command & Conquer The Ultimate Collection [Instant Access]', category_id='0'),
  {(2, 2)}),
 (Row(_c0='3', product_id='B004RMK5QG', product_parent='395682204', product_title='Playstation Plus Subscription', category_id='0'),
  {(2, 3)}),
 (Row(_c0='4', product_id='B00G9BNLQE', product_parent='640460561', product_title='Saints Row IV - Enter The Dominatrix [Online Game Code]', category_id='0'),
  {(2, 4)}),
 (Row(_c0='5', product_id='B00IMIL498', product_parent='621922192', product_title='Double Dragon: Neon [Online Game Code]', category_id='0'),
  {(2, 5)}),
 (Row(_c0='6', product_id='B00S00IJH8', product_parent='215163