In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as fc
from pyspark.sql.functions import column as col,udf
import pyspark.sql.types as tp
from datetime import datetime as dt
import kaggle



In [2]:
# Create (or reuse) the Spark session, requesting the Spark NLP package through spark.jars.packages.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.4")
    .getOrCreate()
)
spark

# Netflix Catalog

### Load catalog data and get rid of malformed rows

In [3]:
# Location of the raw Netflix catalog CSV data on the local machine.
netflix_catalog_path = '/Users/brayanjules/Projects/personal/data engineer/datasets/raw_netflix_catalog'

# Read with a header row and schema inference; DROPMALFORMED mode discards rows that cannot be parsed.
catalog = spark.read.csv(
    netflix_catalog_path,
    header=True,
    inferSchema=True,
    mode="DROPMALFORMED",
)

In [4]:
# Ask the Kaggle API client to download the files of the "shivamb/netflix-shows" dataset.
# NOTE(review): the download destination is whatever the kaggle client defaults to — confirm it matches netflix_catalog_path.
files=kaggle.api.dataset_download_files(dataset="shivamb/netflix-shows")

In [14]:
# Show what the Kaggle download call returned.
files

In [4]:
# Inspect the column names and types Spark assigned to the catalog.
catalog.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [4]:
# Sort the catalog by title in descending order and preview the first three rows.
catalog_ordered = catalog.orderBy(col('title').desc())
catalog_ordered.limit(3).toPandas()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81011096,Movie,최강전사 미니특공대 : 영웅의 탄생,Young Jun Lee,"Um Sang-hyun, Yang Jeong-hwa, Jeon Tae-yeol, S...",,"September 1, 2018",2018,TV-Y7-FV,68 min,Children & Family Movies,"Miniforce, a special task force of elite range..."
1,80226357,Movie,반드시 잡는다,Hong-seon Kim,Baek Yoon-sik,South Korea,"February 28, 2018",2017,TV-MA,110 min,"Dramas, International Movies, Thrillers",After people in his town start turning up dead...
2,80226338,TV Show,마녀사냥,,"Si-kyung Sung, Se-yoon Yoo, Dong-yup Shin, Ji-...",South Korea,"February 19, 2018",2015,TV-MA,1 Season,"International TV Shows, Korean TV Shows, Stand...",Four Korean celebrity men and guest stars of b...


## Netflix Catalog Data Cleaning

### Deduplication of the data

In [5]:
# Count rows per (title, director) pair; a count above 1 marks a duplicated entry.
duplicatedContent = (
    catalog
    .groupBy('title', 'director')
    .count()
    .orderBy(col('count').desc())
)
duplicatedContent.limit(5).toPandas()

Unnamed: 0,title,director,count
0,Life,,2
1,The Silence,Gajendra Ahire,2
2,The Birth Reborn,Eduardo Chauvet,2
3,Top Boy,,2
4,Frank and Cindy,G.J. Echternkamp,2


In [6]:
# Spot check on one title: de-duplicating on (title, director) keeps one row per director.
(
    catalog
    .filter(col('title') == 'The Silence')
    .dropDuplicates(['title', 'director'])
    .toPandas()
)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81021447,Movie,The Silence,John R. Leonetti,"Stanley Tucci, Kiernan Shipka, Miranda Otto, K...",Germany,"April 10, 2019",2019,TV-14,91 min,"Horror Movies, Thrillers",With the world under attack by deadly creature...
1,80244078,Movie,The Silence,Gajendra Ahire,"Raghuvir Yadav, Nagraj Manjule, Anjali Patil, ...",India,"March 1, 2018",2017,TV-MA,90 min,"Dramas, International Movies","On a train in Mumbai, 20-something Chini witne..."


In [7]:
# Sanity check: after de-duplication, count the rows left for each (title, director) pair.
(
    catalog
    .dropDuplicates(['title', 'director'])
    .groupBy('title', 'director')
    .count()
    .orderBy(col('count').desc())
    .toPandas()
)

Unnamed: 0,title,director,count
0,I'll See You in My Dreams,Brett Haley,1
1,Raging Bull,Martin Scorsese,1
2,Barbie: Spy Squad,Conrad Helten,1
3,Drive,Tarun Mansukhani,1
4,Yaara O Dildaara,Ksshitij Chaudhary,1
...,...,...,...
6216,Pacific Heat,,1
6217,Lunatics,,1
6218,Real Rob,,1
6219,Explained,,1


In [7]:
# Rows that carry a title but have no description.
no_valid_content = catalog.filter(
    catalog.title.isNotNull() & catalog.description.isNull()
)
no_valid_content.limit(5).toPandas()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description


In [8]:
# Keep a single row per (title, director) pair, sorted by title in descending order.
non_duplicated_content = (
    catalog
    .dropDuplicates(['title', 'director'])
    .orderBy(col('title').desc())
)
non_duplicated_content.limit(3).toPandas()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81011096,Movie,최강전사 미니특공대 : 영웅의 탄생,Young Jun Lee,"Um Sang-hyun, Yang Jeong-hwa, Jeon Tae-yeol, S...",,"September 1, 2018",2018,TV-Y7-FV,68 min,Children & Family Movies,"Miniforce, a special task force of elite range..."
1,80226357,Movie,반드시 잡는다,Hong-seon Kim,Baek Yoon-sik,South Korea,"February 28, 2018",2017,TV-MA,110 min,"Dramas, International Movies, Thrillers",After people in his town start turning up dead...
2,80226338,TV Show,마녀사냥,,"Si-kyung Sung, Se-yoon Yoo, Dong-yup Shin, Ji-...",South Korea,"February 19, 2018",2015,TV-MA,1 Season,"International TV Shows, Korean TV Shows, Stand...",Four Korean celebrity men and guest stars of b...


### Drop rows without title.

In [9]:
# Discard rows whose title is null, sort by title ascending, then strip double quotes from the titles.
content_with_title = (
    non_duplicated_content
    .dropna(how='any', subset=['title'])
    .orderBy(col('title').asc())
    .withColumn('title', fc.translate('title', '"', ''))
)
content_with_title.limit(3).toPandas()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,80132127,Movie,Behind The Cove: The Quiet Japanese Speak Out,Keiko Yagi,,"Japan, United States","August 25, 2017",2015,TV-14,105 min,"Documentaries, International Movies",After a documentary about the Japanese whaling...
1,81168345,Movie,Escape from the Liberty Cinema,Wojciech Marczewski,"Janusz Gajos, Zbigniew Zamachowski, Teresa Mar...",Poland,"October 1, 2019",1990,TV-MA,88 min,"Comedies, Dramas, Independent Movies",Artistic rebellion ignites at the movies when ...
2,81087095,Movie,#Roxy,Michael Kennedy,"Jake Short, Sarah Fisher, Booboo Stewart, Dann...",Canada,"April 10, 2019",2018,TV-14,105 min,"Comedies, Romantic Movies",A teenage hacker with a huge nose helps a cool...


### Fix of column types

In [10]:
# Cast the string columns read from the CSV to their proper types.
content_with_title = (
    content_with_title
    .withColumn('show_id', col('show_id').cast(tp.LongType()))
    .withColumn('release_year', col('release_year').cast(tp.IntegerType()))
    # Dates look like "September 1, 2018": 'MMMM' is the full month name and a single 'd'
    # accepts one- or two-digit days. The previous 'MMMMM dd' pattern only worked with the
    # legacy parser; Spark 3 rejects five 'M' letters and 'dd' does not match single-digit days.
    # trim() guards against stray surrounding whitespace in the raw strings.
    .withColumn('date_added', fc.to_date(fc.trim(col('date_added')), 'MMMM d, yyyy'))
)

In [11]:
# Rank directors by the number of movies they have in the cleaned catalog.
topDirector = (
    content_with_title
    .dropna(how='any', subset=['director'])
    # Filter on this DataFrame's own column instead of reaching back to `catalog.type`,
    # which only resolved because both frames happen to share lineage.
    .where(col('type') == 'Movie')
    .groupBy('director')
    .count()
    .orderBy(fc.desc('count'))
)
topDirector.limit(3).toPandas()

Unnamed: 0,director,count
0,"Raúl Campos, Jan Suter",18
1,Marcus Raboy,14
2,Jay Karas,13


In [12]:
# Confirm that show_id, release_year and date_added now carry the cast types.
content_with_title.printSchema()

root
 |-- show_id: long (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: date (nullable = true)
 |-- release_year: integer (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [16]:
# Number of rows left after de-duplication and dropping rows without a title.
content_with_title.count()

6219

In [459]:
# Persist the cleaned catalog as JSON, replacing any previous export at this path.
(
    content_with_title.write
    .mode('overwrite')
    .json('/Users/brayanjules/Projects/personal/data engineer/datasets/netflix_catalog')
)

# Reddit Netflix comments

In [13]:
import praw
import os
import pandas as pd
from pyspark.sql import Row

In [453]:
# Show the kernel's current working directory.
os.getcwd()

'/Users/brayanjules/Projects/personal/data engineer/nanodegre/capstone-project'

In [14]:
# Read the Reddit API credentials from the environment instead of embedding them in the notebook.
# A KeyError here means REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET were not exported for the kernel.
# The key pair that used to be hardcoded in this cell should be treated as compromised and revoked.
# Alternative: keep the credentials in a praw.ini file.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='academic_comments_understanding:v1 by /u/zekeja',
)

### Subreddit search by content (netflix, NetflixBestOf, bestofnetflix)

In [15]:
# Schema for one Reddit submission matched to a catalog show; every field is nullable.
# The `comments` field is an array of structs, one struct per comment of the submission.
redditSchema = (
    tp.StructType()
    .add('show_id', tp.LongType(), True)
    .add('submission_id', tp.StringType(), True)
    .add('source', tp.StringType(), True)
    .add('title', tp.StringType(), True)
    .add('description', tp.StringType(), True)
    .add('created_utc', tp.TimestampType(), True)
    .add('author', tp.StringType(), True)
    .add('score', tp.IntegerType(), True)
    .add('spoiler', tp.BooleanType(), True)
    .add('is_original_content', tp.BooleanType(), True)
    .add('distinguished', tp.StringType(), True)
    .add('link', tp.StringType(), True)
    .add(
        'comments',
        tp.ArrayType(
            tp.StructType()
            .add('comment_id', tp.StringType(), True)
            .add('body', tp.StringType(), True)
            .add('created_utc', tp.TimestampType(), True)
            .add('score', tp.IntegerType(), True)
            .add('parent_id', tp.StringType(), True)
            .add('submission_id', tp.StringType(), True)
        ),
        True,
    )
)


In [644]:
# Collect, for the first five catalog rows, every r/netflix submission matching the show title,
# together with all of its comments, as tuples laid out in redditSchema field order.
content_rows = []
# The subreddit handle does not depend on the show, so create it once.
subreddit = reddit.subreddit('netflix')
for content in content_with_title.limit(5).collect():
    # Search only on the part of the title before the first colon, as an exact phrase.
    content_title = content.title.split(":", 1)[0]
    for sm in subreddit.search('"' + content_title + '"', sort='new'):
        # limit=None: keep resolving "more comments" placeholders until none remain.
        sm.comments.replace_more(limit=None)
        row_comments = []
        for comment in sm.comments.list():
            row_comments.append((comment.id, comment.body, dt.fromtimestamp(float(comment.created_utc)),
                                 comment.score, comment.parent_id, comment.link_id))
        # sm.author is None when the submitting account was deleted; store a null author
        # instead of failing with AttributeError on `.name`.
        author_name = sm.author.name if sm.author is not None else None
        current_sm = (content.show_id, sm.id, subreddit.display_name, sm.title, sm.selftext,
                      dt.fromtimestamp(float(sm.created_utc)), author_name,
                      sm.score, sm.spoiler, sm.is_original_content, sm.distinguished, sm.permalink,
                      row_comments)
        content_rows.append(current_sm)

In [645]:
# Build a DataFrame from the collected tuples using the explicit schema, then print the resulting structure.
rdt_netflix_content=spark.createDataFrame(content_rows,redditSchema)
rdt_netflix_content.printSchema()

root
 |-- show_id: long (nullable = true)
 |-- submission_id: string (nullable = true)
 |-- source: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- author: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- spoiler: boolean (nullable = true)
 |-- is_original_content: boolean (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- link: string (nullable = true)
 |-- comments: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- comment_id: string (nullable = true)
 |    |    |-- body: string (nullable = true)
 |    |    |-- created_utc: timestamp (nullable = true)
 |    |    |-- score: integer (nullable = true)
 |    |    |-- parent_id: string (nullable = true)
 |    |    |-- submission_id: string (nullable = true)



In [647]:
# Preview the three most recently created submissions.
(
    rdt_netflix_content
    .orderBy(col('created_utc').desc())
    .limit(3)
    .toPandas()
)

Unnamed: 0,show_id,submission_id,source,title,description,created_utc,author,score,spoiler,is_original_content,distinguished,link,comments
0,81092768,e8v216,netflix,Weird shirtless gym selfie on Netflix Twitter ...,[https://twitter.com/netflix/status/120445343...,2019-12-10 19:07:43,OrganicCorndawg,1,False,False,,/r/netflix/comments/e8v216/weird_shirtless_gym...,"[(faeoqgb, This response seems to explain it:\..."
1,81092768,99vbkh,netflix,Selfie From Hell Movie: Ending Explained + Wha...,,2018-08-24 07:14:30,PaulTweddle,6,False,False,,/r/netflix/comments/99vbkh/selfie_from_hell_mo...,[]
2,80132127,98y3ld,netflix,"[UK] Netflix no longer has The Cove, the Oscar...",I appreciate having both sides of the argument...,2018-08-20 22:22:31,lewis_pritchard,143,False,False,,/r/netflix/comments/98y3ld/uk_netflix_no_longe...,"[(e4jp9zf, It's highly unlikely they know the ..."


In [642]:
# Preview up to five submission rows in whatever order Spark returns them.
rdt_netflix_content.limit(5).toPandas()

Unnamed: 0,show_id,submission_id,source,title,description,created_utc,author,score,spoiler,is_original_content,distinguished,link,comments
0,80132127,98y3ld,netflix,"[UK] Netflix no longer has The Cove, the Oscar...",I appreciate having both sides of the argument...,2018-08-20 22:22:31,lewis_pritchard,140,False,False,,/r/netflix/comments/98y3ld/uk_netflix_no_longe...,"[(e4jp9zf, It's highly unlikely they know the ..."
1,81092768,e8v216,netflix,Weird shirtless gym selfie on Netflix Twitter ...,[https://twitter.com/netflix/status/120445343...,2019-12-10 19:07:43,OrganicCorndawg,1,False,False,,/r/netflix/comments/e8v216/weird_shirtless_gym...,"[(faeoqgb, This response seems to explain it:\..."
2,81092768,99vbkh,netflix,Selfie From Hell Movie: Ending Explained + Wha...,,2018-08-24 07:14:30,PaulTweddle,6,False,False,,/r/netflix/comments/99vbkh/selfie_from_hell_mo...,[]


In [451]:
# Count rows inside Spark; converting the whole DataFrame (including nested comments)
# to pandas just to take its length pulls every row onto the driver.
rdt_netflix_content.count()

1684

In [448]:
# Persist the Reddit submissions as JSON, replacing any previous export at this path.
(
    rdt_netflix_content.write
    .mode('overwrite')
    .json("/Users/brayanjules/Projects/personal/data engineer/datasets/reddit_netflix")
)

In [65]:
# Run the Reddit lookup through an RDD flatMap so each of the five catalog rows expands into its submission rows,
# then turn the resulting tuples into a DataFrame with the explicit schema.
# NOTE(review): getRedditComments is defined further down in this notebook, so this cell fails on a fresh
# top-to-bottom run; the lambda also captures the `reddit` client created on the driver — confirm it can be shipped to executors.
result_test=content_with_title.limit(5).rdd.flatMap(lambda x:getRedditComments(reddit,x,'netflix',redditSchema))
final_result=spark.createDataFrame(result_test,redditSchema)

In [21]:
def test_x(x):
    return x

In [66]:
# Materialise the flatMap-based result on the driver to inspect it.
final_result.toPandas()

Unnamed: 0,show_id,submission_id,source,title,description,created_utc,author,score,spoiler,is_original_content,distinguished,link,comments
0,80132127,98y3ld,netflix,"[UK] Netflix no longer has The Cove, the Oscar...",I appreciate having both sides of the argument...,2018-08-20 19:22:31,lewis_pritchard,141,False,False,,/r/netflix/comments/98y3ld/uk_netflix_no_longe...,"[(e4jp9zf, It's highly unlikely they know the ..."
1,81087095,g5uizp,netflix,Reciving Error M7111-5059 but I don't use any ...,Just today I have been unable to view any con...,2020-04-22 00:20:45,Atromix_,2,False,False,,/r/netflix/comments/g5uizp/reciving_error_m711...,"[(fo5muf4, Your ISP is probably using a shared..."
2,81092768,99vbkh,netflix,Selfie From Hell Movie: Ending Explained + Wha...,,2018-08-24 04:14:30,PaulTweddle,4,False,False,,/r/netflix/comments/99vbkh/selfie_from_hell_mo...,[]
3,81092768,e8v216,netflix,Weird shirtless gym selfie on Netflix Twitter ...,[https://twitter.com/netflix/status/120445343...,2019-12-10 16:07:43,OrganicCorndawg,1,False,False,,/r/netflix/comments/e8v216/weird_shirtless_gym...,"[(faeoqgb, This response seems to explain it:\..."


In [17]:
from pyspark.accumulators import AccumulatorParam


class _ListAccumulatorParam(AccumulatorParam):
    """Accumulator rule for Python lists: start empty, merge by concatenation."""

    def zero(self, value):
        # Each task starts from its own empty list.
        return []

    def addInPlace(self, value1, value2):
        # Merge a partial list into the running list.
        value1.extend(value2)
        return value1


# accumulator() is an instance method: it must be called on a live SparkContext, and its
# second argument must be an AccumulatorParam. The previous call passed 0 as `self` and a
# plain list as the param, leaving the accumulator with no valid merge rule.
stored_rows = SparkContext.getOrCreate().accumulator([], _ListAccumulatorParam())


In [18]:
def add_new_acc_value(rows):
    """Append ``rows`` as a single element to the value held by ``stored_rows``."""
    # The module-level name is only read, never rebound, so no `global` statement is required.
    accumulated = stored_rows.value
    accumulated.append(rows)

In [19]:
# Inspect the accumulator's current value on the driver.
stored_rows.value


[]

In [None]:
#@udf(redditSchema)
def getRedditComments(reddit, show, sub_reddit, reddit_schema):
    """Fetch the submissions of one subreddit that mention a show, with their comments.

    The search phrase is the part of ``show.title`` before the first colon, wrapped in
    double quotes, and results are requested with ``sort='top'``.

    NOTE(review): an identical definition of this function appears again later in this
    notebook; the last one executed wins. Keep a single definition.

    Args:
        reddit: client object exposing ``subreddit(name)`` (a ``praw.Reddit`` in this notebook).
        show: catalog row exposing ``title`` and ``show_id``.
        sub_reddit: name of the subreddit to search.
        reddit_schema: not used by the function body; kept so existing callers keep working.

    Returns:
        A list with one tuple per submission: (show_id, submission id, subreddit display
        name, title, selftext, created datetime, author name or None, score, spoiler,
        is_original_content, distinguished, permalink, comments). ``comments`` is a list of
        (id, body, created datetime, score, parent_id, link_id) tuples.
    """
    search_title = show.title.split(":", 1)[0]
    subreddit = reddit.subreddit(sub_reddit)
    content_rows = []
    for sm in subreddit.search('"' + search_title + '"', sort='top'):
        # limit=None: keep resolving "more comments" placeholders until none remain.
        sm.comments.replace_more(limit=None)
        row_comments = [
            (comment.id, comment.body, dt.fromtimestamp(float(comment.created_utc)), comment.score,
             comment.parent_id, comment.link_id)
            for comment in sm.comments.list()
        ]
        # sm.author is None when the submitting account was deleted; emit a null author
        # instead of failing with AttributeError on `.name`.
        author_name = sm.author.name if sm.author is not None else None
        content_rows.append(
            (show.show_id, sm.id, subreddit.display_name, sm.title, sm.selftext,
             dt.fromtimestamp(float(sm.created_utc)), author_name,
             sm.score, sm.spoiler, sm.is_original_content, sm.distinguished, sm.permalink,
             row_comments))
    return content_rows


In [None]:
# str.split() without a separator splits on runs of whitespace, so the double space yields no empty token.
a = "netflix  NetflixBestOf"
a.split()

In [None]:
#@udf(redditSchema)
def getRedditComments(reddit, show, sub_reddit, reddit_schema):
    """Fetch the submissions of one subreddit that mention a show, with their comments.

    The search phrase is the part of ``show.title`` before the first colon, wrapped in
    double quotes, and results are requested with ``sort='top'``.

    NOTE(review): identical definitions of this function appear both earlier and later in
    this notebook; the last one executed wins. Keep a single definition.

    Args:
        reddit: client object exposing ``subreddit(name)`` (a ``praw.Reddit`` in this notebook).
        show: catalog row exposing ``title`` and ``show_id``.
        sub_reddit: name of the subreddit to search.
        reddit_schema: not used by the function body; kept so existing callers keep working.

    Returns:
        A list with one tuple per submission: (show_id, submission id, subreddit display
        name, title, selftext, created datetime, author name or None, score, spoiler,
        is_original_content, distinguished, permalink, comments). ``comments`` is a list of
        (id, body, created datetime, score, parent_id, link_id) tuples.
    """
    search_title = show.title.split(":", 1)[0]
    subreddit = reddit.subreddit(sub_reddit)
    content_rows = []
    for sm in subreddit.search('"' + search_title + '"', sort='top'):
        # limit=None: keep resolving "more comments" placeholders until none remain.
        sm.comments.replace_more(limit=None)
        row_comments = [
            (comment.id, comment.body, dt.fromtimestamp(float(comment.created_utc)), comment.score,
             comment.parent_id, comment.link_id)
            for comment in sm.comments.list()
        ]
        # sm.author is None when the submitting account was deleted; emit a null author
        # instead of failing with AttributeError on `.name`.
        author_name = sm.author.name if sm.author is not None else None
        content_rows.append(
            (show.show_id, sm.id, subreddit.display_name, sm.title, sm.selftext,
             dt.fromtimestamp(float(sm.created_utc)), author_name,
             sm.score, sm.spoiler, sm.is_original_content, sm.distinguished, sm.permalink,
             row_comments))
    return content_rows


In [74]:
# str.split() without a separator splits on runs of whitespace, so the double space yields no empty token.
a = "netflix  NetflixBestOf"
a.split()

['netflix', 'NetflixBestOf']

In [55]:
#@udf(redditSchema)
def getRedditComments(reddit, show, sub_reddit, reddit_schema):
    """Fetch the submissions of one subreddit that mention a show, with their comments.

    The search phrase is the part of ``show.title`` before the first colon, wrapped in
    double quotes, and results are requested with ``sort='top'``.

    NOTE(review): identical definitions of this function also appear in earlier cells of
    this notebook. Keep a single definition.

    Args:
        reddit: client object exposing ``subreddit(name)`` (a ``praw.Reddit`` in this notebook).
        show: catalog row exposing ``title`` and ``show_id``.
        sub_reddit: name of the subreddit to search.
        reddit_schema: not used by the function body; kept so existing callers keep working.

    Returns:
        A list with one tuple per submission: (show_id, submission id, subreddit display
        name, title, selftext, created datetime, author name or None, score, spoiler,
        is_original_content, distinguished, permalink, comments). ``comments`` is a list of
        (id, body, created datetime, score, parent_id, link_id) tuples.
    """
    search_title = show.title.split(":", 1)[0]
    subreddit = reddit.subreddit(sub_reddit)
    content_rows = []
    for sm in subreddit.search('"' + search_title + '"', sort='top'):
        # limit=None: keep resolving "more comments" placeholders until none remain.
        sm.comments.replace_more(limit=None)
        row_comments = [
            (comment.id, comment.body, dt.fromtimestamp(float(comment.created_utc)), comment.score,
             comment.parent_id, comment.link_id)
            for comment in sm.comments.list()
        ]
        # sm.author is None when the submitting account was deleted; emit a null author
        # instead of failing with AttributeError on `.name`.
        author_name = sm.author.name if sm.author is not None else None
        content_rows.append(
            (show.show_id, sm.id, subreddit.display_name, sm.title, sm.selftext,
             dt.fromtimestamp(float(sm.created_utc)), author_name,
             sm.score, sm.spoiler, sm.is_original_content, sm.distinguished, sm.permalink,
             row_comments))
    return content_rows
