In [1]:
import pandas as pd
import pyspark
from pyspark.sql import *

In [3]:
sc = pyspark.SparkContext()

In [4]:
spark = SparkSession \
        .builder \
        .appName("PA3") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

In [5]:
train = spark.read.csv('train.csv', inferSchema=True, header=True, escape='"')

In [11]:
mapping = spark.read.csv('mapping.csv', inferSchema=True, header=True, escape='"')

In [13]:
train_pd = train.toPandas()

In [14]:
train_pd.head()

Unnamed: 0,movie_id,movie_name,plot,genre
0,23890098,Taxi Blues,"Shlykov, a hard-working taxi driver and Lyosha...","['World cinema', 'Drama']"
1,31186339,The Hunger Games,The nation of Panem consists of a wealthy Capi...,"['Action/Adventure', 'Action', 'Science Fictio..."
2,20663735,Narasimham,Poovalli Induchoodan is sentenced for six yea...,"['Musical', 'Action', 'Drama']"
3,2231378,The Lemon Drop Kid,"The Lemon Drop Kid , a New York City swindler,...",['Comedy']
4,595909,A Cry in the Dark,Seventh-day Adventist Church pastor Michael Ch...,"['Crime Fiction', 'World cinema', 'Drama']"


In [6]:
train.registerTempTable('movie')
mapping.registerTempTable('')

In [7]:
# to run SQL queries:

sqlContext = SQLContext(sc)
# train_sp = sqlContext.createDataFrame(train)

In [9]:
sqlContext.sql('select * from movie').show()

+--------+--------------------+--------------------+--------------------+
|movie_id|          movie_name|                plot|               genre|
+--------+--------------------+--------------------+--------------------+
|23890098|          Taxi Blues|Shlykov, a hard-w...|['World cinema', ...|
|31186339|    The Hunger Games|The nation of Pan...|['Action/Adventur...|
|20663735|          Narasimham|Poovalli Induchoo...|['Musical', 'Acti...|
| 2231378|  The Lemon Drop Kid|The Lemon Drop Ki...|          ['Comedy']|
|  595909|   A Cry in the Dark|Seventh-day Adven...|['Crime Fiction',...|
| 5272176|            End Game|The president is ...|['Action/Adventur...|
| 1952976|          Dark Water|{{plot}} The film...|['Thriller', 'Dra...|
|24225279|                Sing|The story begins ...|           ['Drama']|
| 2462689|       Meet John Doe|Infuriated at bei...|['Black-and-white...|
|20532852|Destination Meatball|A line of people ...|['Animation', 'Sh...|
|15401493|    Husband for Hire|Lola  a

In [10]:
sqlContext.sql('select max(genre) from movie').show()

+----------------+
|      max(genre)|
+----------------+
|['World cinema']|
+----------------+



In [14]:
train.show(10)

+--------+--------------------+--------------------+--------------------+
|movie_id|          movie_name|                plot|               genre|
+--------+--------------------+--------------------+--------------------+
|23890098|          Taxi Blues|Shlykov, a hard-w...|['World cinema', ...|
|31186339|    The Hunger Games|The nation of Pan...|['Action/Adventur...|
|20663735|          Narasimham|Poovalli Induchoo...|['Musical', 'Acti...|
| 2231378|  The Lemon Drop Kid|The Lemon Drop Ki...|          ['Comedy']|
|  595909|   A Cry in the Dark|Seventh-day Adven...|['Crime Fiction',...|
| 5272176|            End Game|The president is ...|['Action/Adventur...|
| 1952976|          Dark Water|{{plot}} The film...|['Thriller', 'Dra...|
|24225279|                Sing|The story begins ...|           ['Drama']|
| 2462689|       Meet John Doe|Infuriated at bei...|['Black-and-white...|
|20532852|Destination Meatball|A line of people ...|['Animation', 'Sh...|
+--------+--------------------+-------

In [16]:
mapping.show()

+---+----------------+
|_c0|               0|
+---+----------------+
|  0|           Drama|
|  1|          Comedy|
|  2|    Romance Film|
|  3|        Thriller|
|  4|          Action|
|  5|    World cinema|
|  6|   Crime Fiction|
|  7|          Horror|
|  8| Black-and-white|
|  9|           Indie|
| 10|Action/Adventure|
| 11|       Adventure|
| 12|     Family Film|
| 13|      Short Film|
| 14|  Romantic drama|
| 15|       Animation|
| 16|         Musical|
| 17| Science Fiction|
| 18|         Mystery|
| 19| Romantic comedy|
+---+----------------+



In [25]:
train.groupBy("genre") \
    .count() \
    .orderBy(col("count").desc()) \
    .show(200)

+--------------------+-----+
|               genre|count|
+--------------------+-----+
|           ['Drama']| 3677|
|          ['Comedy']| 1371|
|          ['Horror']|  986|
|      ['Short Film']|  630|
|['World cinema', ...|  628|
|['Romance Film', ...|  491|
| ['Comedy', 'Drama']|  470|
|          ['Action']|  440|
|['Black-and-white...|  378|
|['Comedy', 'Roman...|  365|
|    ['Romance Film']|  362|
|['Crime Fiction',...|  347|
|  ['Indie', 'Drama']|  338|
|        ['Thriller']|  321|
|['Animation', 'Sh...|  276|
|['Animation', 'Sh...|  272|
|['Thriller', 'Dra...|  263|
|['Thriller', 'Hor...|  255|
| ['Action', 'Drama']|  232|
|['Black-and-white...|  215|
|['Family Film', '...|  203|
|['Comedy', 'Famil...|  200|
|['Drama', 'Romanc...|  199|
|['Crime Fiction',...|  194|
| ['Black-and-white']|  189|
|['Indie', 'Comedy...|  187|
|['Animation', 'Co...|  181|
|       ['Animation']|  180|
|['Drama', 'Romanc...|  180|
|['Science Fiction...|  177|
|['Action/Adventur...|  177|
|         ['Mu

In [15]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, CountVectorizer, OneHotEncoderEstimator, StringIndexer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.classification import LogisticRegression

In [27]:
regexTokenizer = RegexTokenizer(inputCol="plot", outputCol="words", pattern="\\W")

stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")

countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

# label_stringIdx = StringIndexer(inputCol = "genre", outputCol = "label")

oneHotEncoder = (OneHotEncoderEstimator()
    .setInputCols(["genre"])
    .setOutputCols(["encoded"])
    .setDropLast(False))

In [19]:
from pyspark.ml import Pipeline

In [28]:
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, oneHotEncoder])

# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(train)
train_sp = pipelineFit.transform(train)


IllegalArgumentException: 'requirement failed: Column genre must be of type numeric but was actually of type string.'

In [24]:
train_sp.select("movie_name","genre","encoded").show()

+--------------------+--------------------+------+-------------------+
|          movie_name|               genre| label|            encoded|
+--------------------+--------------------+------+-------------------+
|          Taxi Blues|['World cinema', ...|   4.0|   (2426,[4],[1.0])|
|    The Hunger Games|['Action/Adventur...|1605.0|(2426,[1605],[1.0])|
|          Narasimham|['Musical', 'Acti...| 316.0| (2426,[316],[1.0])|
|  The Lemon Drop Kid|          ['Comedy']|   1.0|   (2426,[1],[1.0])|
|   A Cry in the Dark|['Crime Fiction',...| 113.0| (2426,[113],[1.0])|
|            End Game|['Action/Adventur...|  95.0|  (2426,[95],[1.0])|
|          Dark Water|['Thriller', 'Dra...| 103.0| (2426,[103],[1.0])|
|                Sing|           ['Drama']|   0.0|   (2426,[0],[1.0])|
|       Meet John Doe|['Black-and-white...| 290.0| (2426,[290],[1.0])|
|Destination Meatball|['Animation', 'Sh...|  15.0|  (2426,[15],[1.0])|
|    Husband for Hire|          ['Comedy']|   1.0|   (2426,[1],[1.0])|
|     

In [15]:
train_pd = train_sp.toPandas()

In [16]:
train_pd[["genre","label", "encoded"]]

Unnamed: 0,genre,label,encoded
0,"['World cinema', 'Drama']",4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"['Action/Adventure', 'Action', 'Science Fictio...",1603.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"['Musical', 'Action', 'Drama']",316.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,['Comedy'],1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"['Crime Fiction', 'World cinema', 'Drama']",113.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
31103,['Horror'],2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
31104,"['Romantic drama', 'Romance Film', 'World cine...",34.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
31105,['Comedy'],1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
31106,['Drama'],0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [30]:
train_pd.head()

Unnamed: 0,movie_id,movie_name,plot,genre,words,filtered,features,label,encoded
0,23890098,Taxi Blues,"Shlykov, a hard-working taxi driver and Lyosha...","['World cinema', 'Drama']","[shlykov, a, hard, working, taxi, driver, and,...","[shlykov, hard, working, taxi, driver, lyosha,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
1,31186339,The Hunger Games,The nation of Panem consists of a wealthy Capi...,"['Action/Adventure', 'Action', 'Science Fictio...","[the, nation, of, panem, consists, of, a, weal...","[nation, panem, consists, wealthy, capitol, tw...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 0.0, ...",1603.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,20663735,Narasimham,Poovalli Induchoodan is sentenced for six yea...,"['Musical', 'Action', 'Drama']","[poovalli, induchoodan, is, sentenced, for, si...","[poovalli, induchoodan, sentenced, six, years,...","(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 4.0, ...",316.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,2231378,The Lemon Drop Kid,"The Lemon Drop Kid , a New York City swindler,...",['Comedy'],"[the, lemon, drop, kid, a, new, york, city, sw...","[lemon, drop, kid, new, york, city, swindler, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, ...",1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,595909,A Cry in the Dark,Seventh-day Adventist Church pastor Michael Ch...,"['Crime Fiction', 'World cinema', 'Drama']","[seventh, day, adventist, church, pastor, mich...","[seventh, day, adventist, church, pastor, mich...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",113.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [36]:
train_pd.iloc[(0,8)]

SparseVector(2423, {4: 1.0})