In [None]:
from pyspark.sql import SparkSession
import os


In [None]:
# Obtain the Spark session explicitly instead of relying on the kernel having
# injected a `spark` variable. getOrCreate() returns the already-running
# session when there is one, so this is safe in a PySpark-launched notebook
# and also works after "Restart Kernel -> Run All" in a plain Python kernel.
spark = SparkSession.builder.getOrCreate()
spark

In [None]:
from pyspark.sql.types import StructType, StructField, StringType

# Schema of the raw data: a single nullable string column named "value"
# (the same layout the text reader produces), used for the empty fallback.
schema = StructType([
    StructField("value", StringType(), True)
])

# One URI per directory entry. The file:// scheme makes Spark read from the
# local filesystem of the driver machine.
input_paths = [
    'file:///home/cloudera/data/' + file_name
    for file_name in os.listdir("/home/cloudera/data")
]

if input_paths:
    # Read every file in a single scan. The text reader accepts a list of
    # paths, which avoids chaining one union() per file into a deep plan.
    # The CSV-only "header" option was dropped: the text source ignores it.
    total_df = spark.read.text(input_paths)
else:
    # Nothing to read: keep an empty frame with the expected schema so the
    # downstream cells still find the "value" column.
    total_df = spark.createDataFrame([], schema)

In [None]:
# Total number of rows in the combined DataFrame of raw lines.
total_df.count()

In [None]:
# Preview the raw lines held in the single "value" column.
total_df.show()

In [None]:
#The dataset is messy. Let us clean it up
import re

def process_line(x):
    """Split one raw line into a ``[label, text]`` pair.

    The first whitespace-delimited token of the line carries the label. When
    that token contains ``--``, the part before the separator is the label and
    the part after it is put back in front of the remaining text.

    Args:
        x: Mapping-like row exposing the raw line under the key ``'value'``.

    Returns:
        list: ``[label, text]``. ``text`` is ``''`` when nothing follows the
        label, so empty or single-token lines no longer raise ``IndexError``.
    """
    line = x['value']
    # Raw string: "\s" is an invalid escape sequence in a normal literal.
    parts = re.split(r"\s+", line, maxsplit=1)
    head = parts[0]
    # None (as opposed to '') records that the line had no whitespace at all.
    rest = parts[1] if len(parts) > 1 else None

    # Split on the first "--" only, so later "--" inside the token are kept
    # as part of the text instead of being silently discarded.
    label, separator, inline_text = head.partition('--')

    if not separator:
        text = rest if rest is not None else ''
    elif rest is None:
        text = inline_text
    else:
        text = inline_text + ' ' + rest
    return [label, text]

In [None]:
# Section-marker lines carry no label/text pair and are dropped.
section_markers = ['### introduction ###', '### abstract ###']

# Blank (empty or whitespace-only) lines are dropped as well: they have no
# label token, and an empty line cannot be split into a label and a text part.
input_rdd = (
    total_df.rdd
    .filter(lambda row: row['value'].strip() != '')
    .filter(lambda row: row['value'] not in section_markers)
    .map(process_line)
)

In [None]:
# Build a DataFrame from the [label, text] pairs. No column names are given,
# so the columns get the default names `_1` (label) and `_2` (text).
input_df = spark.createDataFrame(input_rdd)
input_df.show()

In [None]:
# Number of rows per distinct value of `_1`, the first element returned by
# process_line (the label).
input_df.groupBy('_1').count().show()

In [None]:
# Optional: text preprocessing with gensim. The code below is disabled; uncomment it if the text needs cleaning.
#import gensim.parsing.preprocessing as gsp
#from pyspark.sql.functions import udf
#from pyspark.sql.types import StringType
#from gensim import utils


#filters = [
#           gsp.strip_tags, 
#           gsp.strip_punctuation,
#           gsp.strip_multiple_whitespaces,
#           gsp.strip_numeric,
#           gsp.remove_stopwords, 
#           gsp.strip_short, 
#           gsp.stem_text
#          ]

#def clean_text(x):
#    s = x[1]
#    s = s.lower()
#    s = utils.to_unicode(s)
#    for f in filters:
#        s = f(s)
#    return (x[0],s)

In [None]:
from pyspark.ml.feature import Word2Vec
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer

# Lower-case and whitespace-split the text column `_2` into a token list.
tokenizer = Tokenizer(inputCol="_2", outputCol="tokens")

# 300-dimensional word embeddings; minCount=0 keeps every token in the
# vocabulary. The seed is fixed so the learned vectors (and therefore every
# downstream metric) are reproducible across runs.
w2v = Word2Vec(vectorSize=300, minCount=0, seed=42, inputCol="tokens", outputCol="features")

doc2vec_pipeline = Pipeline(stages=[tokenizer, w2v])

# NOTE(review): the embedding is fitted on the full dataset, before the
# train/test split made later in the notebook, so test text influences the
# features. Fit on the training portion only if a strict hold-out is needed.
doc2vec_model = doc2vec_pipeline.fit(input_df)

# Adds `tokens` and `features`; the Word2Vec model represents each document
# by the average of its word vectors.
doc2vecs_df = doc2vec_model.transform(input_df)

In [None]:
# Preview the rows with the added `tokens` and `features` columns.
doc2vecs_df.show()

In [None]:
# 80/20 train/test split. The seed is fixed so the same rows land in each
# part on every run, keeping the reported accuracies comparable.
w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2], seed=42)

In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Map the string labels in `_1` to numeric indices in a `label` column.
# NOTE(review): the indexer is fitted on the training split only; a label
# that appears only in the test split would not be known to it - confirm
# every class is present in the training data.
si = StringIndexer(inputCol="_1", outputCol="label")

# Random forests sample rows and features at random, so fix the seed to make
# the trained model and its accuracy reproducible.
rf_classifier = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)

rf_classifier_pipeline = Pipeline(stages=[si, rf_classifier])

# Fit on the training split, then score the held-out split.
rf_predictions = rf_classifier_pipeline.fit(w2v_train_df).transform(w2v_test_df)

# Compares the `prediction` column with `label` using the accuracy metric.
rf_model_evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")

In [None]:
# Accuracy of the random-forest predictions on the held-out split.
accuracy = rf_model_evaluator.evaluate(rf_predictions)
print("Accuracy = %g" % accuracy)

In [None]:
# Preview the random-forest output rows for the test split.
rf_predictions.show()

In [None]:
from pyspark.ml.classification import LogisticRegression

# Multinomial logistic regression over the same document vectors. The column
# names are spelled out; they are the estimator's defaults.
lr_classifier = LogisticRegression(
    family="multinomial",
    labelCol="label",
    featuresCol="features",
)

# Reuse the label indexer stage `si` so labels are encoded the same way.
lr_classifier_pipeline = Pipeline(stages=[si, lr_classifier])

# Fit on the training split, then score the held-out split.
lr_pipeline_model = lr_classifier_pipeline.fit(w2v_train_df)
lr_predictions = lr_pipeline_model.transform(w2v_test_df)

# Compares the `prediction` column with `label` using the accuracy metric.
lr_model_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

In [None]:
# Accuracy of the logistic-regression predictions on the held-out split.
accuracy = lr_model_evaluator.evaluate(lr_predictions)
print("Accuracy = %g" % accuracy)

In [None]:
# Preview the logistic-regression output rows for the test split.
lr_predictions.show()