In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.mllib.evaluation import BinaryClassificationMetrics

from freamon.adapters.pyspark.provenance import trace_provenance, from_trace

# Obtain (or reuse) a Spark session bound to the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)


with trace_provenance() as tr:
    # Both input tables are read through the tracer handle `tr`.
    products = tr.read_csv(spark, "datasets/reviews/products.csv")
    categories = tr.read_csv(spark, "datasets/reviews/categories.csv")

    # Keep three product columns and derive the character count of the title.
    # Disabled optional step, kept for reference:
    #   .filter(lambda row: 'playstation' in row['product_title'].lower())
    filtered_products = (
        products
        .select(['product_id', 'product_title', 'category_id'])
        .withColumn('title_length', lambda row: len(row['product_title']))
    )

    # Attach the category name by matching products.category_id to categories.id,
    # then narrow to the columns used downstream.
    products_with_categories = (
        filtered_products
        .join(categories, left_on='category_id', right_on='id')
        .select(['product_title', 'category', 'title_length', 'category_id'])
    )

    # NOTE(review): arguments are presumably (train fraction, seed) -- confirm
    # against the freamon adapter's randomSplit signature.
    train, test = products_with_categories.randomSplit(0.8, 42)

    train.cache()
    test.cache()

    def _label_from_title(row):
        """Return 1 when the product title has at least 30 characters, else 0."""
        return int(len(row['product_title']) >= 30)

    # The same labelling rule is applied to both splits.
    train = train.withColumn('label', _label_from_title)
    test = test.withColumn('label', _label_from_title)

    # Tokenize titles -> hash tokens into 100 features -> logistic regression.
    tokenizer = Tokenizer(inputCol="product_title", outputCol="words")
    hashingTF = HashingTF(
        inputCol=tokenizer.getOutputCol(),
        numFeatures=100,
        outputCol="features",
    )
    lr = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = tr.make_pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit on the training split and score the held-out split.
    model = pipeline.fit(train)
    predictions = model.transform(test)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/02 11:56:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/02 11:56:54 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , product_id, product_parent, product_title, category_id
 Schema: _c0, product_id, product_parent, product_title, category_id
Expected: _c0 but found: 
CSV file: file:///Users/ssc/projects/freamon-opt/datasets/reviews/products.csv


[Stage 2:>                                                          (0 + 1) / 2]

22/09/02 11:56:56 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , id, category
 Schema: _c0, id, category
Expected: _c0 but found: 
CSV file: file:///Users/ssc/projects/freamon-opt/datasets/reviews/categories.csv
22/09/02 11:56:56 WARN BlockManager: Task 4 already completed, not releasing lock for rdd_47_0


                                                                                

In [2]:
# Create the view generator from the trace `tr` recorded in the first cell and
# the active Spark session; it is queried via `test_view` in the next cell.
view_gen = from_trace(tr, spark)

[Stage 33:>                                                         (0 + 1) / 1]                                                                                

In [3]:
# Request the test view with the two category columns as slicing attributes,
# including true labels and predictions but omitting the feature columns.
df = view_gen.test_view(
    sliceable_by=['category', 'category_id'],
    with_features=False,
    with_y_true=True,
    with_y_pred=True,
)

# Bare last expression so the notebook renders the result.
df

                                                                                

Unnamed: 0,category,category_id,y_true,y_pred
0,Digital_Video_Games,0,0,0.0
1,Digital_Video_Games,0,0,0.0
2,Digital_Video_Games,0,1,1.0
3,Digital_Video_Games,0,1,1.0
4,Digital_Video_Games,0,1,1.0
...,...,...,...,...
2140,Digital_Software,1,1,1.0
2141,Digital_Software,1,0,0.0
2142,Digital_Software,1,0,1.0
2143,Digital_Software,1,0,0.0


In [4]:
# Display the result of count() on the training split (the labelled `train`
# produced in the first cell).
train.count()

8787