In [None]:
import os
# Find the latest version of spark 3.2  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.2.2'
spark_version = 'spark-<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CloudETLProject").getOrCreate()

In [6]:
from pyspark import SparkFiles
# Load in data into a DataFrame

url = "/content/drive/MyDrive/Colab Notebooks/2U/etl/06-Stu_Cloud_ETL_Project/Solved/drugsComTrain_raw.tsv" #enter correct address here
#spark.sparkContext.addFile(url) No need for sparkcontext if read from google drive

df = spark.read.option('header', 'true').csv(SparkFiles.get(url), inferSchema=True, sep='\t', timestampFormat="mm/dd/yy")
df.show(10)

+--------------------+--------------------+--------------------+--------------------+------+-----------------+-----------+
|                 _c0|            drugName|           condition|              review|rating|             date|usefulCount|
+--------------------+--------------------+--------------------+--------------------+------+-----------------+-----------+
|              206461|           Valsartan|Left Ventricular ...|"""It has no side...|   9.0|     May 20, 2012|         27|
|               95260|          Guanfacine|                ADHD|"""My son is half...|  null|             null|       null|
|We have tried man...|                 8.0|      April 27, 2010|                 192|  null|             null|       null|
|               92703|              Lybrel|       Birth Control|"""I used to take...|  null|             null|       null|
|The positive side...|                 5.0|   December 14, 2009|                  17|  null|             null|       null|
|              1

## Transform DataFrame to fit review_rating table

In [15]:
review_df = df.select(["review","rating", "date"])
review_df.show()

+--------------------+------+-----------------+
|              review|rating|             date|
+--------------------+------+-----------------+
|"""It has no side...|   9.0|     May 20, 2012|
|"""My son is half...|  null|             null|
|                 192|  null|             null|
|"""I used to take...|  null|             null|
|                  17|  null|             null|
|"""This is my fir...|   8.0| November 3, 2015|
|"""Suboxone has c...|   9.0|November 27, 2016|
|"""2nd day on 5mg...|   2.0|November 28, 2015|
|"""He pulled out,...|   1.0|    March 7, 2017|
|"""Abilify change...|  10.0|   March 14, 2015|
|""" I Ve had  not...|   1.0|   August 9, 2016|
|"""I had been on ...|   8.0| December 8, 2016|
|"""I have been on...|   9.0|  January 1, 2015|
|"""I have taken a...|  null|             null|
|                null|  null|             null|
|                  54|  null|             null|
|"""I had Crohn&#0...|   4.0|     July 6, 2013|
|"""Have a little ...|   4.0|September 7

In [16]:
from pyspark.sql.functions import regexp_extract, length
review_df = df.withColumnRenamed("rating", "label").select(["label", "date", "review"])
review_df = review_df.withColumn('review_length', length(review_df['review'])).dropna()
review_df.cache()
review_df.show()

+-----+------------------+--------------------+-------------+
|label|              date|              review|review_length|
+-----+------------------+--------------------+-------------+
|  9.0|      May 20, 2012|"""It has no side...|           83|
|  8.0|  November 3, 2015|"""This is my fir...|          452|
|  9.0| November 27, 2016|"""Suboxone has c...|          723|
|  2.0| November 28, 2015|"""2nd day on 5mg...|          407|
|  1.0|     March 7, 2017|"""He pulled out,...|          146|
| 10.0|    March 14, 2015|"""Abilify change...|          737|
|  1.0|    August 9, 2016|""" I Ve had  not...|          197|
|  8.0|  December 8, 2016|"""I had been on ...|          741|
|  9.0|   January 1, 2015|"""I have been on...|          734|
|  4.0|      July 6, 2013|"""I had Crohn&#0...|          407|
|  4.0| September 7, 2017|"""Have a little ...|          595|
|  9.0|  January 19, 2017|"""I have been ta...|          737|
|  9.0|September 22, 2017|"""This drug work...|          680|
|  9.0| 

## Create Data Pipeline

In [17]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
# Create all the features to the data set
tokenizer = Tokenizer(inputCol="review", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="token_text", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [18]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors (merge idf_token and review_length)
clean_up = VectorAssembler(inputCols=['idf_token', 'review_length'], outputCol='features')

In [19]:
# Create and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[tokenizer, stopremove, hashingTF, idf, clean_up])

## Transform DataFrame

In [20]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(review_df)
cleaned = cleaner.transform(review_df)

In [None]:
# Show label of ham spam and resulting features
cleaned.select(['label', 'features']).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    5|(262145,[9639,991...|
|    4|(262145,[512,1588...|
|    4|(262145,[3578,963...|
|    2|(262145,[9639,157...|
|    4|(262145,[3294,736...|
|    4|(262145,[14,8443,...|
|    4|(262145,[14,604,3...|
|    5|(262145,[14,4543,...|
|    3|(262145,[3890,392...|
|    5|(262145,[991,2437...|
|    4|(262145,[14,326,3...|
|    3|(262145,[6922,736...|
|    3|(262145,[6922,963...|
|    5|(262145,[4081,158...|
|    5|(262145,[1076,199...|
|    5|(262145,[14,329,1...|
|    5|(262145,[14,1998,...|
|    4|(262145,[14,5281,...|
|    4|(262145,[7388,963...|
|    4|(262145,[9639,158...|
+-----+--------------------+
only showing top 20 rows



## Run NaiveBayes

In [49]:
from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])

# Create a Naive Bayes model and fit training data
nb = NaiveBayes() #labelCol='label', featuresCol='features'
predictor = nb.fit(training)

In [50]:
training.show()

+-----+-------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|label|         date|              review|review_length|          token_text|         stop_tokens|          hash_token|           idf_token|            features|
+-----+-------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  1.0|April 1, 2009|"""It made me ext...|           87|["""it, made, me,...|["""it, made, ext...|(262144,[19923,30...|(262144,[19923,30...|(262145,[19923,30...|
|  1.0|April 1, 2014|"""I have bronchi...|          430|["""i, have, bron...|["""i, bronchial,...|(262144,[12098,15...|(262144,[12098,15...|(262145,[12098,15...|
|  1.0|April 1, 2015|"""I came down wi...|          349|["""i, came, down...|["""i, came, flu,...|(262144,[5381,942...|(262144,[5381,942...|(262145,[5381,942...|
|  1.0|April 1, 2015|"""I ca

In [51]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+-----+-------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|         date|              review|review_length|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+-----+-------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  1.0|April 1, 2008|"""this medicine ...|          118|["""this, medicin...|["""this, medicin...|(262144,[71929,86...|(262144,[71929,86...|(262145,[71929,86...|[-656.29562241667...|[0.99999999999999...|       0.0|
|  1.0|April 1, 2009|"""It made me ext...|           87|["""it, made, me,...|["""it, made, ext...|(262144,[19923,30...|(262144,[19923,30...|

## Predict accuracy of the model

In [52]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.067418
