In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover

In [2]:
# Start a spark session
spark = SparkSession.builder.appName('stuf_idf').getOrCreate()

In [8]:
from pyspark import SparkFiles
url ="combined.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("combined.csv"), sep=",", header=True)
df.head()

Row(_c0='0', Title='Pulp Fiction', Actors='Tim Roth, Amanda Plummer, Laura Lovelace, John Travolta', Box Office=None, Country='USA', Director='Quentin Tarantino', Genre='Crime, Drama', IMDB='8.9/10', Language='English, Spanish, French', Metacritic='94/100', Production='Miramax Films', Rated='R', Released='34621.0', Rotten Tomatoes='0.94', Runtime='154 min', Type='movie', Writer='Quentin Tarantino (stories), Roger Avary (stories), Quentin Tarantino', Year='1994', Unnamed: 0='6', Rank='726', Release Year='1994.0', Worldwide Box Office='$212,928,762 ', Domestic Box Office='$107,928,762 ', International Box Office='$105,000,000 ', Awards='1.0', Nominations='7.0')

In [9]:
tokenizer = Tokenizer(inputCol="Actors", outputCol="words")
tokenized_df = tokenizer.transform(df)
tokenized_df.show(truncate=False)

+---+-------------------------------------------------+---------------------------------------------------------------------+-----------+----------------+-------------------------------+---------------------------------------------+------+------------------------------------+----------+---------------------------+-----+--------+---------------+-------+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
# Remove stop words
stop_words = [" "]
remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stop_words)
remover_df = remover.transform(tokenized_df)
remover_df.head()

Row(_c0='0', Title='Pulp Fiction', Actors='Tim Roth, Amanda Plummer, Laura Lovelace, John Travolta', Box Office=None, Country='USA', Director='Quentin Tarantino', Genre='Crime, Drama', IMDB='8.9/10', Language='English, Spanish, French', Metacritic='94/100', Production='Miramax Films', Rated='R', Released='34621.0', Rotten Tomatoes='0.94', Runtime='154 min', Type='movie', Writer='Quentin Tarantino (stories), Roger Avary (stories), Quentin Tarantino', Year='1994', Unnamed: 0='6', Rank='726', Release Year='1994.0', Worldwide Box Office='$212,928,762 ', Domestic Box Office='$107,928,762 ', International Box Office='$105,000,000 ', Awards='1.0', Nominations='7.0', words=['tim', 'roth,', 'amanda', 'plummer,', 'laura', 'lovelace,', 'john', 'travolta'], filtered=['tim', 'roth,', 'amanda', 'plummer,', 'laura', 'lovelace,', 'john', 'travolta'])

In [16]:
# Run the hashing term frequency
hashing = HashingTF(inputCol="filtered", outputCol="hashedValues", numFeatures=pow(2, 4))

# Transform in a DF
hashed_df = hashing.transform(remover_df)
hashed_df.show()

+---+--------------------+--------------------+-----------+----------------+--------------------+--------------------+------+--------------------+----------+--------------------+-----+--------+---------------+-------+-----+--------------------+--------------------+----------+-----+------------+--------------------+-------------------+------------------------+--------+-----------+--------------------+--------------------+--------------------+
|_c0|               Title|              Actors| Box Office|         Country|            Director|               Genre|  IMDB|            Language|Metacritic|          Production|Rated|Released|Rotten Tomatoes|Runtime| Type|              Writer|                Year|Unnamed: 0| Rank|Release Year|Worldwide Box Office|Domestic Box Office|International Box Office|  Awards|Nominations|               words|            filtered|        hashedValues|
+---+--------------------+--------------------+-----------+----------------+--------------------+-----------

In [17]:
# Fit the idf on the data set
idf = IDF(inputCol="hashedValues", outputCol="features")
idfModel = idf.fit(hashed_df)
rescaled_df = idfModel.transform(hashed_df)

In [13]:
# Display the dataframe
rescaled_df.select("words", "features").show(truncate=False)

+------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|words                                                                         |features                                                                                                                                                                                                |
+------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[tim, roth,, amanda, plummer,, laura, lovelace,, john, travolta]              |(16,[0,2,3,4,5,9,15],[0.9332213810557625,1.0377733406770357,0.884147622802