In [None]:
import os
# Choose the Spark release to install. Browse http://www.apache.org/dist/spark/
# for the latest 3.2.x release and substitute it for the placeholder below.
# For example:
# spark_version = 'spark-3.2.2'
# NOTE: the placeholder must be replaced before running this cell; the value is
# interpolated into the download URL, the archive name and SPARK_HOME below.
spark_version = 'spark-3.<enter version>'

# Export the version so the `!` shell commands below can read it as $SPARK_VERSION
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
# Refresh the apt package index, then install a headless Java 8 JDK
# (-qq and the redirect to /dev/null keep the cell output quiet).
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark build packaged for Hadoop 2.7 (quietly) and unpack it
# into the current working directory.
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
# findspark is imported further down in this cell
!pip install -q findspark

# Set Environment Variables
# JAVA_HOME points at the JDK installed above.
# SPARK_HOME assumes the archive was unpacked under /content (presumably the
# notebook's working directory on Google Colab) - adjust if running elsewhere.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Initialise findspark so that pyspark can be imported in later cells
# (the SparkSession itself is created in a later cell, not here).
# Presumably findspark.init() locates Spark through the SPARK_HOME set above - TODO confirm.
import findspark
findspark.init()

In [4]:
# Create the SparkSession used by the rest of the notebook
# (getOrCreate reuses a session if one is already active).
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Hashing")
    .getOrCreate()
)

In [5]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover

In [6]:
# Fetch the tweets CSV from the S3 bucket and load it into a DataFrame
from pyspark import SparkFiles

url ="https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.1/22-big-data/day_2/unreal_restaurant_tweets.csv"

# Register the remote file with Spark, then read the copy resolved by SparkFiles.
spark.sparkContext.addFile(url)
df = spark.read.csv(
    SparkFiles.get("unreal_restaurant_tweets.csv"),
    sep=",",
    header=True,
)

# Preview the loaded rows
df.show()

+--------------------+
|   Restaurant Tweets|
+--------------------+
|@UnrealRestaurant...|
|@UnrealRestaurant...|
|@UnrealRestaurant...|
|@UnrealRestaurant...|
|"@UnrealRestauran...|
+--------------------+



In [7]:
# Split each tweet in "Restaurant Tweets" into a list of tokens stored in "words"
tokened = Tokenizer(
    inputCol="Restaurant Tweets",
    outputCol="words",
)

# Apply the tokenizer and preview the result
tokened_transformed = tokened.transform(df)
tokened_transformed.show()

+--------------------+--------------------+
|   Restaurant Tweets|               words|
+--------------------+--------------------+
|@UnrealRestaurant...|[@unrealrestauran...|
|@UnrealRestaurant...|[@unrealrestauran...|
|@UnrealRestaurant...|[@unrealrestauran...|
|@UnrealRestaurant...|[@unrealrestauran...|
|"@UnrealRestauran...|["@unrealrestaura...|
+--------------------+--------------------+



In [8]:
# Filter the tokens against a custom stop-word list
# (the restaurant's handle in two casings, plus the "$30" token).
stop_list = [
    "@UnrealRestaurant",
    "$30",
    "@unrealrestaurant",
]
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=stop_list,
)

# Write the surviving tokens to "filtered" and show the full, untruncated rows
removed_frame = remover.transform(tokened_transformed)
removed_frame.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|Restaurant Tweets                                                                                                     |words                                                                                                                                  |filtered                                                                                                                               |
+----------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------

In [9]:
# Hash the filtered tokens into a term-frequency vector with 2**4 = 16 features
hashing = HashingTF(
    inputCol="filtered",
    outputCol="hashedValues",
    numFeatures=2 ** 4,
)

# Add the "hashedValues" column and show the full, untruncated rows
hashed_df = hashing.transform(removed_frame)
hashed_df.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------+
|Restaurant Tweets                                                                                                     |words                                                                                                                                  |filtered                                                                                                                               |hashedValues                                                            |
+-------------------------------------------------------------------

In [10]:
# Configure the IDF stage: reads "hashedValues", writes "features"
idf = IDF(
    inputCol="hashedValues",
    outputCol="features",
)

# Fit the IDF model on the hashed data, then apply it to the same DataFrame
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)

In [11]:
# Show the token lists next to their rescaled feature vectors, untruncated
(
    rescaledData
    .select("words", "features")
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|words                                                                                                                                  |features                                                                                                                                                                                                       |
+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------