In [14]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF, ChiSqSelector, StringIndexer
from pyspark.ml import Pipeline

In [15]:
# Obtain the Spark session for this notebook under the application name "task2".
session_builder = SparkSession.builder.appName("task2")
spark = session_builder.getOrCreate()

# Load the review development set from HDFS as a DataFrame.
df = spark.read.json(
    "hdfs:///user/dic25_shared/amazon-reviews/full/reviews_devset.json"
)

# Preview the first record as a quick sanity check of the loaded columns.
df.head()

Row(asin='0981850006', category='Patio_Lawn_and_Garde', helpful=[6, 7], overall=5.0, reviewText="This was a gift for my other husband.  He's making us things from it all the time and we love the food.  Directions are simple, easy to read and interpret, and fun to make.  We all love different kinds of cuisine and Raichlen provides recipes from everywhere along the barbecue trail as he calls it. Get it and just open a page.  Have at it.  You'll love the food and it has provided us with an insight into the culture that produced it. It's all about broadening horizons.  Yum!!", reviewTime='12 3, 2009', reviewerID='A2VNYWOPJ13AFP', reviewerName='Amazon Customer "carringt0n"', summary='Delish', unixReviewTime=1259798400)

In [16]:
# --- Tokenization ---------------------------------------------------------
# The pattern describes the delimiters: runs of whitespace, digits and the
# listed punctuation/symbol characters. Review text is split on those runs.
tokenizer = RegexTokenizer(
    pattern=r"[\s\d()\[\]{}\.\!\?,;:+=\-_\"'`~#@&\*\%€\$§\\/]+",
    inputCol="reviewText",
    outputCol="tokens",
)

# --- Stop-word removal ----------------------------------------------------
# The stop-word list lives on HDFS; it is pulled to the driver as a Python
# list so it can be handed to StopWordsRemover.
stopwords_rdd = spark.sparkContext.textFile(
    "hdfs:///user/e01652446/input/stopwords.txt"
)
stopwords = stopwords_rdd.collect()
remover = StopWordsRemover(
    stopWords=stopwords,
    inputCol="tokens",
    outputCol="filtered",
)

# --- TF-IDF weighting -----------------------------------------------------
# Term counts over the filtered tokens, then IDF re-weighting of those counts.
CountVec = CountVectorizer(outputCol="rawFeatures", inputCol="filtered")
idf = IDF(outputCol="tfidf", inputCol="rawFeatures")

# --- Label encoding -------------------------------------------------------
# Map the string category to a numeric label column; rows with invalid
# category values are skipped rather than raising an error.
indexer = StringIndexer(
    handleInvalid="skip",
    inputCol="category",
    outputCol="label",
)

# --- Feature selection ----------------------------------------------------
# Keep the 2000 TF-IDF features chosen by the chi-squared selector against
# the category label.
selector = ChiSqSelector(
    featuresCol="tfidf",
    labelCol="label",
    outputCol="selectedFeatures",
    numTopFeatures=2000,
)



In [18]:
# Assemble the stages in execution order. The order is significant: a later
# cell reads the fitted CountVectorizer at position 2 and the fitted selector
# at the last position of `model.stages`.
stage_sequence = [tokenizer, remover, CountVec, idf, indexer, selector]
pipeline = Pipeline(stages=stage_sequence)

# Fit every stage on the loaded reviews; `model` holds the fitted stages.
model = pipeline.fit(df)

                                                                                

25/05/12 22:16:14 WARN DAGScheduler: Broadcasting large task binary with size 1063.1 KiB


                                                                                

25/05/12 22:16:28 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
25/05/12 22:16:29 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB


                                                                                

25/05/12 22:16:41 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB


                                                                                

In [20]:
# Fitted stages are addressed by their position in the pipeline:
# index 2 is the fitted CountVectorizer, the last entry is the fitted selector.
count_vectorizer_model = model.stages[2]
chi_sq_selector_model = model.stages[-1]

vocab = count_vectorizer_model.vocabulary
selected_indices = chi_sq_selector_model.selectedFeatures

# Translate each selected feature index back into its vocabulary term.
selected_terms = []
for feature_index in selected_indices:
    selected_terms.append(vocab[feature_index])

In [21]:
# Persist the selected terms, one term per line, to the "output_ds.txt"
# output directory.
#
# RDD.saveAsTextFile raises an error when the target path already exists,
# which makes this cell fail on every re-run of the notebook. Writing through
# the DataFrame writer with mode("overwrite") replaces any previous output
# instead, so the notebook can be re-run end to end.
sc = spark.sparkContext

# A single partition keeps the terms in their original order and yields one
# part file, matching the previous single-partition output.
terms_rdd = sc.parallelize([(term,) for term in selected_terms], 1)

# The explicit schema avoids schema inference, which would fail if the list
# of selected terms were empty.
terms_df = spark.createDataFrame(terms_rdd, schema="term string")

terms_df.write.mode("overwrite").text("output_ds.txt")

                                                                                