In [1]:
import sys
import os

sys.path.insert(0, '/usr/hdp/current/spark2-client/python')
sys.path.insert(0, '/usr/hdp/current/spark2-client/python/lib/py4j-0.10.7-src.zip')

os.environ['SPARK_HOME'] = '/usr/hdp/current/spark2-client/'
os.environ['SPARK_CONF_DIR'] = '/etc/spark2/conf'
os.environ['PYSPARK_PYTHON'] = '/opt/anaconda3/bin/python'

import pyspark
conf = pyspark.SparkConf()
conf.setMaster("yarn")
conf.set("spark.driver.memory","2g")
conf.set("spark.executor.instances", "5")
conf.set("spark.executor.memory","5g")
conf.set("spark.executor.cores","5")

sc = pyspark.SparkContext(conf=conf)

In [2]:
sc

### Movie Ratings

An independent movie company is looking to invest in a new movie project. With limited finances, the company wants to 
analyze the reactions of audiences, particularly toward various movie genres, in order to identify a 
movie project to focus on which will help the business earn more profit. The company relies on data collected from a publicly available recommendation service by [MovieLens](http://dl.acm.org/citation.cfm?id=2827872). This [dataset](http://files.grouplens.org/datasets/movielens/ml-10m-README.html) contains **24,404,096** ratings and **668,953** tags applied across **40,110** movies. This data was created by **247,753** users between January 09, 1995 and January 29, 2016. This dataset was generated on October 17, 2016. 

From this dataset, several analyses are possible, include the following:
1.   Find movies which have the highest average ratings over the years and identify the corresponding genre.
2.   Find genres which have the highest average ratings over the years.
3.   Find users who rate movies most frequently in order to contact them for an in-depth marketing analysis.

These types of analyses, which are somewhat ambiguous, demand the ability to quickly process large amounts of data in 
a relatively short amount of time for justifying business decisions. In these situations, the size of the data typically makes analysis done on a single machine impossible and analysis done using a remote storage system impractical. For the remainder of the lessons, we will learn how HDFS provides the basis to store a massive amount of data and to enable the programming approach to analyze this data.

In [3]:
!hdfs dfs -put ../data/ml-latest-small ./
!hdfs dfs -ls
!hdfs dfs -ls ./ml-latest-small

put: `ml-latest-small/links.csv': File exists
put: `ml-latest-small/tags.csv': File exists
put: `ml-latest-small/ratings.csv': File exists
put: `ml-latest-small/README.txt': File exists
put: `ml-latest-small/movies.csv': File exists
Found 3 items
drwxr-xr-x   - lngo hadoop          0 2020-01-06 15:32 .sparkStaging
drwxr-xr-x   - lngo hadoop          0 2020-01-06 13:50 ml-latest-small
drwxr-xr-x   - lngo hadoop          0 2020-01-02 13:27 text
Found 5 items
-rw-r--r--   2 lngo hadoop       8342 2020-01-06 13:50 ml-latest-small/README.txt
-rw-r--r--   2 lngo hadoop     188236 2020-01-06 13:50 ml-latest-small/links.csv
-rw-r--r--   2 lngo hadoop     484688 2020-01-06 13:50 ml-latest-small/movies.csv
-rw-r--r--   2 lngo hadoop    2382886 2020-01-06 13:50 ml-latest-small/ratings.csv
-rw-r--r--   2 lngo hadoop     114976 2020-01-06 13:50 ml-latest-small/tags.csv


In [4]:
!!hdfs dfs -cat ml-latest-small/movies.csv \
    2>/dev/null | head -n 100

['movieId,title,genres',
 '1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy',
 '2,Jumanji (1995),Adventure|Children|Fantasy',
 '3,Grumpier Old Men (1995),Comedy|Romance',
 '4,Waiting to Exhale (1995),Comedy|Drama|Romance',
 '5,Father of the Bride Part II (1995),Comedy',
 '6,Heat (1995),Action|Crime|Thriller',
 '7,Sabrina (1995),Comedy|Romance',
 '8,Tom and Huck (1995),Adventure|Children',
 '9,Sudden Death (1995),Action',
 '10,GoldenEye (1995),Action|Adventure|Thriller',
 '11,"American President, The (1995)",Comedy|Drama|Romance',
 '12,Dracula: Dead and Loving It (1995),Comedy|Horror',
 '13,Balto (1995),Adventure|Animation|Children',
 '14,Nixon (1995),Drama',
 '15,Cutthroat Island (1995),Action|Adventure|Romance',
 '16,Casino (1995),Crime|Drama',
 '17,Sense and Sensibility (1995),Drama|Romance',
 '18,Four Rooms (1995),Comedy',
 '19,Ace Ventura: When Nature Calls (1995),Comedy',
 '20,Money Train (1995),Action|Comedy|Crime|Drama|Thriller',
 '21,Get Shorty (1995),Comedy|Crime

In [5]:
!!hdfs dfs -cat ml-latest-small/ratings.csv \
    2>/dev/null | head -n 100

['userId,movieId,rating,timestamp',
 '1,1,4.0,964982703',
 '1,3,4.0,964981247',
 '1,6,4.0,964982224',
 '1,47,5.0,964983815',
 '1,50,5.0,964982931',
 '1,70,3.0,964982400',
 '1,101,5.0,964980868',
 '1,110,4.0,964982176',
 '1,151,5.0,964984041',
 '1,157,5.0,964984100',
 '1,163,5.0,964983650',
 '1,216,5.0,964981208',
 '1,223,3.0,964980985',
 '1,231,5.0,964981179',
 '1,235,4.0,964980908',
 '1,260,5.0,964981680',
 '1,296,3.0,964982967',
 '1,316,3.0,964982310',
 '1,333,5.0,964981179',
 '1,349,4.0,964982563',
 '1,356,4.0,964980962',
 '1,362,5.0,964982588',
 '1,367,4.0,964981710',
 '1,423,3.0,964982363',
 '1,441,4.0,964980868',
 '1,457,5.0,964981909',
 '1,480,4.0,964982346',
 '1,500,3.0,964981208',
 '1,527,5.0,964984002',
 '1,543,4.0,964981179',
 '1,552,4.0,964982653',
 '1,553,5.0,964984153',
 '1,590,4.0,964982546',
 '1,592,4.0,964982271',
 '1,593,4.0,964983793',
 '1,596,5.0,964982838',
 '1,608,5.0,964982931',
 '1,648,3.0,964982563',
 '1,661,5.0,964982838',
 '1,673,3.0,964981775',
 '1,733,4.0,9

In [6]:
sqlContext = pyspark.SQLContext(sc)
sqlContext

<pyspark.sql.context.SQLContext at 0x7f9bdc1191d0>

In [7]:
#ratings = sc.textFile("./ml-latest-small/ratings.csv")

ratings = sqlContext.read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .load("./ml-latest-small/ratings.csv")\
    .cache()

In [8]:
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [9]:
%%time
ratings.count()

CPU times: user 1.11 ms, sys: 185 µs, total: 1.3 ms
Wall time: 1.07 s


100836

In [10]:
%%time
ratings.count()

CPU times: user 818 µs, sys: 985 µs, total: 1.8 ms
Wall time: 145 ms


100836

### 4.1 Find movies which have the highest average ratings over the years and identify the corresponding genre.

- Find the average ratings of all movies over the years
- Identify the corresponding genres for each movie

In [11]:
ratings.take(5)

[Row(userId=1, movieId=1, rating=4.0, timestamp=964982703),
 Row(userId=1, movieId=3, rating=4.0, timestamp=964981247),
 Row(userId=1, movieId=6, rating=4.0, timestamp=964982224),
 Row(userId=1, movieId=47, rating=5.0, timestamp=964983815),
 Row(userId=1, movieId=50, rating=5.0, timestamp=964982931)]

In [12]:
ratings.registerTempTable("ratings")

In [13]:
%%time
avgRatings = sqlContext.sql("SELECT movieId, AVG(rating) AS AvgRating \
                                    FROM ratings \
                                    GROUP BY movieId \
                                    ORDER BY AvgRating")
avgRatings.show()

+-------+---------+
|movieId|AvgRating|
+-------+---------+
| 135216|      0.5|
|  54768|      0.5|
| 122627|      0.5|
| 107013|      0.5|
|   4775|      0.5|
|  67799|      0.5|
|  85334|      0.5|
|  57326|      0.5|
|   7742|      0.5|
|  31422|      0.5|
|  92681|      0.5|
|  60363|      0.5|
|  89386|      0.5|
|   5771|      0.5|
| 122888|      0.5|
| 134246|      0.5|
|  72424|      0.5|
|   5105|      0.5|
| 109897|      0.5|
| 138186|      0.5|
+-------+---------+
only showing top 20 rows

CPU times: user 2 ms, sys: 3.38 ms, total: 5.38 ms
Wall time: 1.37 s


How do we augment movie ratings data with title information?

In [14]:
#movies = sc.textFile("./ml-latest-small/movies.csv")
movies = sqlContext.read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .load("./ml-latest-small/movies.csv")\
    .cache()
movies.registerTempTable("movies")

In [15]:
movies.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [19]:
%%time
avgRatings = sqlContext.sql("SELECT m.title, r.movieId, AVG(r.rating) AS AvgRatings \
                                    FROM ratings AS r \
                                    INNER JOIN movies AS m \
                                    ON m.movieId = r.movieId \
                                    GROUP BY r.movieId, m.title \
                                    ORDER BY AvgRatings DESC")
avgRatings.show()

+--------------------+-------+----------+
|               title|movieId|AvgRatings|
+--------------------+-------+----------+
|    Lady Jane (1986)|   6201|       5.0|
|Runaway Brain (19...|  96608|       5.0|
|Awfully Big Adven...|    148|       5.0|
|  Tokyo Tribe (2014)| 138632|       5.0|
|Karlson Returns (...| 172585|       5.0|
|Martin Lawrence L...|   5513|       5.0|
|    All Yours (2016)| 158882|       5.0|
|Entertaining Ange...|   1140|       5.0|
|Gena the Crocodil...| 175293|       5.0|
|Slumber Party Mas...|   3940|       5.0|
|Dr. Goldfoot and ...|   4402|       5.0|
|20 Million Miles ...|   5468|       5.0|
|   The Editor (2015)| 142444|       5.0|
|Trinity and Sarta...| 128087|       5.0|
|Crippled Avengers...| 115727|       5.0|
|Enter the Void (2...|  78836|       5.0|
|Mickey's Once Upo...|  72692|       5.0|
|      My Love (2006)| 134095|       5.0|
|Go for Zucker! (A...|  44851|       5.0|
| On the Ropes (1999)|   2824|       5.0|
+--------------------+-------+----

### Challenge:

Make appropriate changes so that only movies with average ratings higher than 3.75 and number of ratings totalling at least 1000 are collected.

In [21]:
%%time
ratingData = sqlContext.sql("SELECT FIRST(m.title), FIRST(r.movieId), AVG(r.rating) AS AvgRatings, COUNT(r.movieId) as Count \
                                    FROM ratings AS r \
                                    INNER JOIN movies AS m \
                                    ON m.movieId = r.movieId \
                                    GROUP BY r.movieId \
                                    ORDER BY AvgRatings DESC")
ratingData.show()

+--------------------+---------------------+----------+-----+
| first(title, false)|first(movieId, false)|AvgRatings|Count|
+--------------------+---------------------+----------+-----+
|     Lamerica (1994)|                   53|       5.0|    2|
|Adventures Of She...|               147300|       5.0|    1|
|Supercop 2 (Proje...|                  876|       5.0|    1|
|  Palindromes (2004)|                33138|       5.0|    1|
|Passenger, The (P...|                26350|       5.0|    1|
|Strictly Sexual (...|                67618|       5.0|    1|
|          61* (2001)|                27373|       5.0|    1|
| The Love Bug (1997)|               150554|       5.0|    1|
|Tales of Manhatta...|                25887|       5.0|    1|
|Denise Calls Up (...|                  633|       5.0|    1|
|Zeitgeist: Moving...|                84273|       5.0|    1|
|Awfully Big Adven...|                  148|       5.0|    1|
|   The Editor (2015)|               142444|       5.0|    1|
|Raise Y

In [23]:
ratingData.registerTempTable("ratingData")
selectedRating = sqlContext.sql("SELECT * FROM ratingData \
                                    WHERE AvgRatings > 3.5 and COUNT > 50 \
                                    ORDER BY AvgRatings DESC")
selectedRating.show()

+--------------------+---------------------+------------------+-----+
| first(title, false)|first(movieId, false)|        AvgRatings|Count|
+--------------------+---------------------+------------------+-----+
|Shawshank Redempt...|                  318| 4.429022082018927|  317|
|Godfather, The (1...|                  858|         4.2890625|  192|
|   Fight Club (1999)|                 2959| 4.272935779816514|  218|
|Cool Hand Luke (1...|                 1276| 4.271929824561403|   57|
|Dr. Strangelove o...|                  750| 4.268041237113402|   97|
|  Rear Window (1954)|                  904| 4.261904761904762|   84|
|Godfather: Part I...|                 1221|  4.25968992248062|  129|
|Departed, The (2006)|                48516| 4.252336448598131|  107|
|   Goodfellas (1990)|                 1213|              4.25|  126|
|   Casablanca (1942)|                  912|              4.24|  100|
|Dark Knight, The ...|                58559| 4.238255033557047|  149|
|Usual Suspects, T..

### 4.2 Find genres which have the highest average ratings over the years

- Identify the genres associated with a movie and its rating
- Each movie can have multiple genres. How to flip the Key/Value pair?

In [29]:
%%time
allData = sqlContext.sql("SELECT FIRST(r.movieId) as movieId, FIRST(m.genres) as genres, SUM(r.rating) AS SumRatings, COUNT(r.movieId) as Count \
                                    FROM ratings AS r \
                                    INNER JOIN movies AS m \
                                    ON m.movieId = r.movieId \
                                    GROUP BY r.movieId")
allData.show()

+-------+--------------------+----------+-----+
|movieId|              genres|SumRatings|Count|
+-------+--------------------+----------+-----+
|    148|               Drama|       5.0|    1|
|    471|              Comedy|     142.0|   40|
|    496|Comedy|Drama|Roma...|       5.0|    1|
|    833|              Comedy|      12.0|    6|
|   1088|Drama|Musical|Rom...|     141.5|   42|
|   1238|              Comedy|      36.5|    9|
|   1342|     Horror|Thriller|      27.5|   11|
|   1580|Action|Comedy|Sci-Fi|     575.5|  165|
|   1591|Action|Adventure|...|      68.5|   26|
|   1645|Drama|Mystery|Thr...|     174.0|   51|
|   1829|       Drama|Romance|       6.5|    2|
|   1959|       Drama|Romance|      55.0|   15|
|   2122|     Horror|Thriller|      27.0|   11|
|   2142|Adventure|Animati...|      27.0|   10|
|   2366|Action|Adventure|...|      91.0|   25|
|   2659|  Comedy|Documentary|       2.0|    1|
|   2866|               Drama|      18.0|    5|
|   3175|Adventure|Comedy|...|     268.5

In [68]:
def split_tab(s):
    return s.split("|")
sqlContext.udf.register("split_tab", split_tab)

<function __main__.split_tab(s)>

In [69]:
split_tab("Action|Adventure|Comedy")

['Action', 'Adventure', 'Comedy']

In [60]:
allData.explode(genres)

AttributeError: 'DataFrame' object has no attribute 'explode'

In [67]:
#spark.range(1, 20).registerTempTable("test")
allData.registerTempTable("allData")
genreData = sqlContext.sql('SELECT split_tab(genres), SumRatings, Count FROM allData')
genreData.show()

Py4JJavaError: An error occurred while calling o503.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 101.0 failed 4 times, most recent failure: Lost task 0.3 in stage 101.0 (TID 3991, clnode229.clemson.cloudlab.us, executor 4): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/hadoop/yarn/local/usercache/lngo/appcache/application_1578081381073_0140/container_e01_1578081381073_0140_01_000005/pyspark.zip/pyspark/worker.py", line 235, in main
    process()
  File "/hadoop/yarn/local/usercache/lngo/appcache/application_1578081381073_0140/container_e01_1578081381073_0140_01_000005/pyspark.zip/pyspark/worker.py", line 230, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/hadoop/yarn/local/usercache/lngo/appcache/application_1578081381073_0140/container_e01_1578081381073_0140_01_000005/pyspark.zip/pyspark/serializers.py", line 331, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/hadoop/yarn/local/usercache/lngo/appcache/application_1578081381073_0140/container_e01_1578081381073_0140_01_000005/pyspark.zip/pyspark/serializers.py", line 140, in dump_stream
    for obj in iterator:
  File "/hadoop/yarn/local/usercache/lngo/appcache/application_1578081381073_0140/container_e01_1578081381073_0140_01_000005/pyspark.zip/pyspark/serializers.py", line 320, in _batched
    for item in iterator:
  File "<string>", line 1, in <lambda>
  File "/hadoop/yarn/local/usercache/lngo/appcache/application_1578081381073_0140/container_e01_1578081381073_0140_01_000005/pyspark.zip/pyspark/worker.py", line 76, in <lambda>
    return lambda *a: f(*a)
  File "/hadoop/yarn/local/usercache/lngo/appcache/application_1578081381073_0140/container_e01_1578081381073_0140_01_000005/pyspark.zip/pyspark/util.py", line 55, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-66-7d75bcdc7ac0>", line 2, in split_tab
NameError: name 'array' is not defined

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:83)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:66)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1609)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1597)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1596)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1596)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1830)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1779)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1768)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3278)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3259)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3258)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2703)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:254)
	at sun.reflect.GeneratedMethodAccessor78.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/hadoop/yarn/local/usercache/lngo/appcache/application_1578081381073_0140/container_e01_1578081381073_0140_01_000005/pyspark.zip/pyspark/worker.py", line 235, in main
    process()
  File "/hadoop/yarn/local/usercache/lngo/appcache/application_1578081381073_0140/container_e01_1578081381073_0140_01_000005/pyspark.zip/pyspark/worker.py", line 230, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/hadoop/yarn/local/usercache/lngo/appcache/application_1578081381073_0140/container_e01_1578081381073_0140_01_000005/pyspark.zip/pyspark/serializers.py", line 331, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/hadoop/yarn/local/usercache/lngo/appcache/application_1578081381073_0140/container_e01_1578081381073_0140_01_000005/pyspark.zip/pyspark/serializers.py", line 140, in dump_stream
    for obj in iterator:
  File "/hadoop/yarn/local/usercache/lngo/appcache/application_1578081381073_0140/container_e01_1578081381073_0140_01_000005/pyspark.zip/pyspark/serializers.py", line 320, in _batched
    for item in iterator:
  File "<string>", line 1, in <lambda>
  File "/hadoop/yarn/local/usercache/lngo/appcache/application_1578081381073_0140/container_e01_1578081381073_0140_01_000005/pyspark.zip/pyspark/worker.py", line 76, in <lambda>
    return lambda *a: f(*a)
  File "/hadoop/yarn/local/usercache/lngo/appcache/application_1578081381073_0140/container_e01_1578081381073_0140_01_000005/pyspark.zip/pyspark/util.py", line 55, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-66-7d75bcdc7ac0>", line 2, in split_tab
NameError: name 'array' is not defined

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:83)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:66)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [65]:
genreData = sqlContext.sql('SELECT explode(array(split_tab(genres))), SumRatings, Count FROM allData')
genreData.show()

+--------------------+----------+-----+
|                 col|SumRatings|Count|
+--------------------+----------+-----+
|             [Drama]|       5.0|    1|
|            [Comedy]|     142.0|   40|
|[Comedy, Drama, R...|       5.0|    1|
|            [Comedy]|      12.0|    6|
|[Drama, Musical, ...|     141.5|   42|
|            [Comedy]|      36.5|    9|
|  [Horror, Thriller]|      27.5|   11|
|[Action, Comedy, ...|     575.5|  165|
|[Action, Adventur...|      68.5|   26|
|[Drama, Mystery, ...|     174.0|   51|
|    [Drama, Romance]|       6.5|    2|
|    [Drama, Romance]|      55.0|   15|
|  [Horror, Thriller]|      27.0|   11|
|[Adventure, Anima...|      27.0|   10|
|[Action, Adventur...|      91.0|   25|
|[Comedy, Document...|       2.0|    1|
|             [Drama]|      18.0|    5|
|[Adventure, Comed...|     268.5|   75|
|     [Comedy, Drama]|       5.0|    2|
|            [Horror]|      29.5|    9|
+--------------------+----------+-----+
only showing top 20 rows



In [35]:
augmentedInfo.take(5)

[('50', (5.0, ('"Usual Suspects, The (1995)"', 'Crime|Mystery|Thriller'))),
 ('50', (4.0, ('"Usual Suspects, The (1995)"', 'Crime|Mystery|Thriller'))),
 ('50', (1.0, ('"Usual Suspects, The (1995)"', 'Crime|Mystery|Thriller'))),
 ('50', (4.5, ('"Usual Suspects, The (1995)"', 'Crime|Mystery|Thriller'))),
 ('50', (5.0, ('"Usual Suspects, The (1995)"', 'Crime|Mystery|Thriller')))]

In [36]:
def extractGenreRating (t):
    final_tuples = []
    genreList = t[1][1][1].split("|")
    for genre in genreList:
        final_tuples.append((genre,t[1][0]))
    return final_tuples
print(extractGenreRating((u'1', (3.0, (u'Toy Story (1995)', u'Adventure|Animation|Children|Comedy|Fantasy')))))

[('Adventure', 3.0), ('Animation', 3.0), ('Children', 3.0), ('Comedy', 3.0), ('Fantasy', 3.0)]


In [37]:
genreRatings = augmentedInfo.flatMap(extractGenreRating)

In [38]:
countsByKey = genreRatings.countByKey()
countsByKey

defaultdict(int,
            {'Adventure': 24161,
             'Fantasy': 11834,
             'IMAX': 4145,
             'Comedy': 39053,
             'Action': 30635,
             'Sci-Fi': 17243,
             'Romance': 18124,
             'Animation': 6988,
             'Children': 9208,
             'Drama': 41928,
             'Thriller': 26452,
             'Crime': 16681,
             'War': 4859,
             'Mystery': 7674,
             'Western': 1930,
             'Musical': 4138,
             'Horror': 7291,
             'Film-Noir': 870,
             'Documentary': 1219,
             '(no genres listed)': 47})

### Challenge:

Complete the remainder of the steps to find the average rating of each genre. 

In [39]:
sc.stop()