In [None]:
import os

import google_auth_oauthlib.flow
from hdfs import InsecureClient
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as F
import pyspark.sql.types as t

import youtube_utils
from secrets import YOUTUBE_CLIENT_SECRET_FILENAME, HADOOP_USER_NAME, SPARK_URI, HADOOP_NAMENODE

In [None]:
# Export the configured Hadoop user name as an environment variable for this process.
os.environ['HADOOP_USER_NAME'] = HADOOP_USER_NAME

In [None]:
# HDFS client that talks HTTP to the namenode host on port 50070, acting as HADOOP_USER_NAME.
# NOTE(review): presumably this is the namenode's WebHDFS endpoint — confirm the port for this cluster.
client_hdfs = InsecureClient(f'http://{HADOOP_NAMENODE}:50070', user=HADOOP_USER_NAME)

In [None]:
# get preprocessed opusdata filename
# `hdfs_path` is a directory (it is listed below); the data lives in a file inside it.
hdfs_path = "/processed/opusdata_omdb.csv"

# Take the first entry ending in ".csv" and fail with an explicit message
# (instead of a bare IndexError) when the directory contains none.
filename = next(
    (entry for entry in client_hdfs.list(hdfs_path) if entry.endswith(".csv")),
    None,
)
if filename is None:
    raise FileNotFoundError(f"No .csv file found under HDFS path {hdfs_path!r}")

In [None]:
# Create (or reuse) the Spark session through the builder so this cell can be
# re-run safely: constructing SparkContext directly raises when a context is
# already active. Passing the master and the spark.hadoop.* option to the
# builder also makes them part of the configuration the context is created with.
sparkSession = (
    SparkSession.builder.master(SPARK_URI)
    .appName("example-pyspark-read-and-write")
    .config("spark.hadoop.dfs.client.use.datanode.hostname", "true")
    .getOrCreate()
)

# Keep the `sc` name available for code that works with the context directly.
sc = sparkSession.sparkContext

In [None]:
# Load the preprocessed CSV from HDFS: the first line is treated as the header
# and column types are inferred from the data.
opusdata_omdb = (
    sparkSession.read
    .options(header=True, inferSchema=True)
    .csv(f"hdfs://{HADOOP_NAMENODE}:8020{hdfs_path}/{filename}")
)

In [None]:
# Print a preview of the loaded DataFrame to sanity-check the read.
opusdata_omdb.show()

In [None]:
# OAuth scope requested for the YouTube API: read-only access.
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]

In [None]:
# Disable OAuthlib's HTTPS verification when running locally.
# *DO NOT* leave this option enabled in production.
os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

# Service name and version handed to the client factory below.
api_service_name = "youtube"
api_version = "v3"

# Get credentials and create an API client
# NOTE(review): `flow` is not referenced again in the cells visible here; the
# client below is built by youtube_utils instead — confirm whether this flow
# is still needed.
flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
    YOUTUBE_CLIENT_SECRET_FILENAME, scopes
)

# API client used by the UDFs defined in the following cells.
youtube = youtube_utils.get_authenticated_service(api_service_name, api_version, scopes)

In [None]:
@F.udf(returnType=t.StringType())
def id_from_title(movie_name):
    """Return the YouTube video id of the first video hit for a movie's trailer.

    Searches YouTube for ``"<movie_name> official trailer"`` and returns the
    ``videoId`` of the first result whose kind is ``youtube#video``.

    Args:
        movie_name: Movie title from the DataFrame column; may be null.

    Returns:
        The video id as a string, or ``None`` when the title is missing/empty
        or the search yields no usable video result.
    """
    # A null title would otherwise be searched as the literal "None official trailer".
    if not movie_name:
        return None

    request = youtube.search().list(part="snippet", q=f"{movie_name} official trailer")
    response = request.execute()

    for video in response.get("items", []):
        resource_id = video.get("id", {})
        # Skip results that are not videos, and skip (rather than abort on)
        # entries that lack the expected fields, so a later valid hit is still found.
        if resource_id.get("kind") == "youtube#video" and "videoId" in resource_id:
            return resource_id["videoId"]

    return None

In [None]:
# Struct type returned by the statistics UDF: three nullable string fields.
schema = t.StructType(
    [
        t.StructField(field_name, t.StringType(), True)
        for field_name in (
            "youtube_view_count",
            "youtube_engagement_score",
            "youtube_positive_engagement_score",
        )
    ]
)

In [None]:
@F.udf(returnType=schema)
def stats_from_id(video_id):
    """Fetch the view count and engagement ratios for a YouTube video.

    Args:
        video_id: YouTube video id; may be null or empty.

    Returns:
        A row ``(youtube_view_count, youtube_engagement_score,
        youtube_positive_engagement_score)``:

        * ``youtube_view_count`` -- the video's ``viewCount``.
        * ``youtube_engagement_score`` -- ``(likes + dislikes) / views``.
        * ``youtube_positive_engagement_score`` -- ``likes / dislikes``.

        Each field is ``None`` when the statistic it needs is absent or not a
        number, or when its denominator is zero. All three are ``None`` when
        ``video_id`` is empty or the API returns no statistics for it.
    """
    if not video_id:
        return None, None, None
    request = youtube.videos().list(part="statistics", id=video_id)
    response = request.execute()

    try:
        stats = response["items"][0]["statistics"]
    except (KeyError, IndexError):
        # An empty "items" list (no video returned for this id) or a missing
        # "statistics" part means there is nothing to report.
        return None, None, None

    def _count(key):
        # Statistics may be individually absent from the response; treat a
        # missing or non-numeric value as unknown instead of discarding the rest.
        try:
            return int(stats[key])
        except (KeyError, TypeError, ValueError):
            return None

    view_count = _count("viewCount")
    like_count = _count("likeCount")
    dislike_count = _count("dislikeCount")

    engagement_score = None
    positive_engagement_score = None
    if like_count is not None and dislike_count is not None:
        # Guard both divisions: a video with zero views or zero dislikes
        # must not raise ZeroDivisionError inside the UDF.
        if view_count:
            engagement_score = (like_count + dislike_count) / view_count
        if dislike_count:
            positive_engagement_score = like_count / dislike_count

    # NOTE(review): `schema` declares all three fields as StringType while
    # numeric values are returned here — confirm Spark's conversion yields
    # the intended column contents.
    return t.Row(
        "youtube_view_count",
        "youtube_engagement_score",
        "youtube_positive_engagement_score",
    )(view_count, engagement_score, positive_engagement_score)

In [None]:
# Add the trailer's YouTube video id, looked up from each row's movie title.
opusdata_youtube = opusdata_omdb.withColumn(
    colName="youtube_video_id",
    col=id_from_title(opusdata_omdb["movie_name"]),
)

In [None]:
# Print a preview that includes the new youtube_video_id column.
opusdata_youtube.show()