# Exploration using Pandas

In [None]:
import pandas as pd
import os
import util

In [None]:
# Display the absolute path that '' resolves to, i.e. the current working directory.
os.path.abspath('')

In [None]:
# Base URL and archive names that are handed to util.download_files_to_local.
base_url = 'https://datasets.imdbws.com'

files_list = [
    stem + ".tsv.gz"
    for stem in (
        "name.basics",
        "title.akas",
        "title.basics",
        "title.crew",
        "title.episode",
        "title.principals",
        "title.ratings",
    )
]

# Download target: <parent of the working directory>/data/tmp, as an absolute path.
notebook_dir = os.path.abspath('')
temp_filepath = os.path.abspath(os.path.join(notebook_dir, os.pardir, 'data', 'tmp'))

In [None]:
# Fetch the archives named in files_list from base_url into temp_filepath.
# NOTE(review): behaviour inferred from the helper's name and arguments; `util` is
# project-local and not visible here — confirm (e.g. whether existing files are re-downloaded).
util.download_files_to_local(base_url, files_list, temp_filepath)

In [None]:
# Load a 100-row sample of every downloaded archive into a dict keyed by file name.
# on_bad_lines='skip' replaces error_bad_lines=False, which was deprecated in
# pandas 1.3 and removed in 2.0; malformed rows are still dropped rather than raising.
data = {
    file_name: pd.read_csv(
        os.path.join(temp_filepath, file_name),
        sep='\t',
        nrows=100,
        compression='gzip',
        on_bad_lines='skip',
    )
    for file_name in files_list
}

In [None]:
# Blank out the '\N' placeholder in deathYear of the name.basics sample, then preview it.
names_sample = data['name.basics.tsv.gz']
names_sample['deathYear'] = names_sample['deathYear'].replace('\\N', '')
names_sample.head()

In [None]:
# Substitute 'XX' for the '\N' placeholder in region of the title.akas sample, then preview it.
akas_sample = data['title.akas.tsv.gz']
akas_sample['region'] = akas_sample['region'].replace('\\N', 'XX')
akas_sample.head(20)

In [None]:
# Preview the first rows of the title.basics sample.
data["title.basics.tsv.gz"].head()

In [None]:
# Preview the first rows of the title.crew sample.
data["title.crew.tsv.gz"].head()

In [None]:
# Preview the first rows of the title.episode sample.
data["title.episode.tsv.gz"].head()

In [None]:
# Preview the first 20 rows of the title.principals sample.
data["title.principals.tsv.gz"].head(20)

In [None]:
# Preview the first rows of the title.ratings sample.
data["title.ratings.tsv.gz"].head()

In [None]:
# Summary statistics for the title.episode sample.
data["title.episode.tsv.gz"].describe()

# Exploration using PySpark

In [1]:
# Set JAVA_HOME for this kernel to a JDK 8 install — presumably so PySpark can launch its JVM (TODO confirm).
# NOTE(review): machine-specific absolute path; adjust it, or export JAVA_HOME outside the notebook, on other machines.
%set_env JAVA_HOME=/Users/akshayiyer/Library/Java/JavaVirtualMachines/jdk8u222-b10/Contents/Home

env: JAVA_HOME=/Users/akshayiyer/Library/Java/JavaVirtualMachines/jdk8u222-b10/Contents/Home


In [2]:
import configparser
import datetime
import os
import util
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [3]:
def create_spark_session(master, endpoint=None):
    """Build (or reuse) the SparkSession used by this notebook.

    Args:
        master: Spark master URL, e.g. "spark://127.0.0.1:7077".
        endpoint: Optional value for "fs.s3a.endpoint". When omitted, the
            option is not set at all, so the connector's own default applies.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    builder = (
        SparkSession.builder
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
        .appName("udacity-dend-data-lake-proj")
        .master(master)
    )

    # Only set the endpoint when the caller supplied one: passing the default
    # None through to .config() would register a bogus value (or fail,
    # depending on the PySpark version) instead of leaving the option unset.
    if endpoint is not None:
        builder = builder.config("fs.s3a.endpoint", endpoint)

    return builder.getOrCreate()

In [4]:
# Connect to the Spark master at spark://127.0.0.1:7077 and use the us-west-2 S3 endpoint.
spark = create_spark_session(
    master="spark://127.0.0.1:7077",
    endpoint="s3.us-west-2.amazonaws.com",
)
spark

# Download files to local directory

In [5]:
# Base URL and archive names that are handed to util.download_files_to_local.
base_url = 'https://datasets.imdbws.com'

files_list = [
    stem + ".tsv.gz"
    for stem in (
        "name.basics",
        "title.akas",
        "title.basics",
        "title.crew",
        "title.episode",
        "title.principals",
        "title.ratings",
    )
]

# Download target: <parent of the working directory>/data/tmp, as an absolute path.
notebook_dir = os.path.abspath('')
download_directory = os.path.abspath(os.path.join(notebook_dir, os.pardir, 'data', 'tmp'))

In [11]:
# Fetch the archives named in files_list from base_url into download_directory.
# NOTE(review): behaviour inferred from the helper's name and arguments; `util` is
# project-local and not visible here — confirm.
util.download_files_to_local(base_url, files_list, download_directory)

## Process name_basics file

In [6]:
# An explicit StructType schema (nconst, primaryName, birthYear, deathYear,
# primaryProfession, knownForTitles and an extra 'broken' string column) was
# drafted for this read but left disabled; the schema is inferred instead.

# Read from the directory the archives were downloaded into instead of a
# machine-specific absolute path. The file:// prefix keeps the path on the
# local filesystem, as the previous hard-coded URI did.
temp_filepath = 'file://' + download_directory
file = 'name.basics.tsv.gz'

names_df = spark.read.load(
    temp_filepath + '/' + file,
    format="csv",
    sep="\t",
    inferSchema="true",
    header="true",
    ignoreLeadingWhiteSpace=True,
    ignoreTrailingWhiteSpace=True
)

In [7]:
names_df.printSchema()

root
 |-- nconst: string (nullable = true)
 |-- primaryName: string (nullable = true)
 |-- birthYear: string (nullable = true)
 |-- deathYear: string (nullable = true)
 |-- primaryProfession: string (nullable = true)
 |-- knownForTitles: string (nullable = true)



In [9]:
def _clean_year_expr(column, max_year, min_year=1000):
    """Build a Column expression yielding `column` as an integer year, or null.

    The result is null when the raw value is the '\\N' placeholder, is not
    numeric, is below `min_year`, or is above `max_year`.
    """
    raw = F.col(column)
    # Cast explicitly rather than relying on implicit string/number comparison;
    # this also turns non-numeric junk into null instead of passing it through.
    year = raw.cast("int")
    return (
        F.when(raw == '\\N', None)
         .when(year < min_year, None)
         .when(year > max_year, None)
         .otherwise(year)
    )


def _replace_column(df, column, expr):
    """Return `df` with `column` replaced by `expr`; the column moves to the end."""
    fixed_name = column + "_fixed"
    return (
        df.withColumn(fixed_name, expr)
          .drop(column)
          .withColumnRenamed(fixed_name, column)
    )


# Get today's date; years later than the current one are treated as invalid.
now = datetime.datetime.now()

fix_birthYear_func = _clean_year_expr('birthYear', now.year)
names_df2 = _replace_column(names_df, "birthYear", fix_birthYear_func)

fix_deathYear_func = _clean_year_expr('deathYear', now.year)
names_df3 = _replace_column(names_df2, "deathYear", fix_deathYear_func)

names_df3.show(10, False)

+---------+---------------+------------------------------+---------------------------------------+---------+---------+
|nconst   |primaryName    |primaryProfession             |knownForTitles                         |birthYear|deathYear|
+---------+---------------+------------------------------+---------------------------------------+---------+---------+
|nm0000001|Fred Astaire   |soundtrack,actor,miscellaneous|tt0043044,tt0072308,tt0053137,tt0050419|1899     |1987     |
|nm0000002|Lauren Bacall  |actress,soundtrack            |tt0071877,tt0117057,tt0038355,tt0037382|1924     |2014     |
|nm0000003|Brigitte Bardot|actress,soundtrack,producer   |tt0057345,tt0049189,tt0059956,tt0054452|1934     |null     |
|nm0000004|John Belushi   |actor,writer,soundtrack       |tt0078723,tt0080455,tt0077975,tt0072562|1949     |1982     |
|nm0000005|Ingmar Bergman |writer,director,actor         |tt0050976,tt0083922,tt0069467,tt0050986|1918     |2007     |
|nm0000006|Ingrid Bergman |actress,soundtrack,pr

In [11]:
def _explode_csv(column_name):
    """Build an expression that emits one row per comma-separated item of `column_name`."""
    return F.explode(F.split(F.col(column_name), ","))


# Distinct (nconst, primaryName, birthYear, deathYear) rows.
artists_df = names_df3.select("nconst", "primaryName", "birthYear", "deathYear").dropDuplicates()

# (nconst, item) pairs; the exploded column keeps Spark's default name `col`.
artists_prmry_prfsn_df = names_df3.select("nconst", _explode_csv("primaryProfession"))
artists_knwn_fr_ttls_df = names_df3.select("nconst", _explode_csv("knownForTitles"))

In [13]:
# Print the first five rows of each derived table.
for preview_df in (artists_df, artists_prmry_prfsn_df, artists_knwn_fr_ttls_df):
    preview_df.show(5)

+---------+--------------+---------+---------+
|   nconst|   primaryName|birthYear|deathYear|
+---------+--------------+---------+---------+
|nm0000080|  Orson Welles|     1915|     1985|
|nm0000092|   John Cleese|     1939|     null|
|nm0000238| Shannon Tweed|     1957|     null|
|nm0000282|Scott Bairstow|     1970|     null|
|nm0000373|  Michael Dorn|     1952|     null|
+---------+--------------+---------+---------+
only showing top 5 rows

+---------+-------------+
|   nconst|          col|
+---------+-------------+
|nm0000001|   soundtrack|
|nm0000001|        actor|
|nm0000001|miscellaneous|
|nm0000002|      actress|
|nm0000002|   soundtrack|
+---------+-------------+
only showing top 5 rows

+---------+---------+
|   nconst|      col|
+---------+---------+
|nm0000001|tt0043044|
|nm0000001|tt0072308|
|nm0000001|tt0053137|
|nm0000001|tt0050419|
|nm0000002|tt0071877|
+---------+---------+
only showing top 5 rows



In [None]:
# Write into the parent of the download directory (the data/ folder) instead of
# a machine-specific absolute path. The trailing '/' is required because the
# file names below are appended by plain string concatenation.
save_file_path = 'file://' + os.path.dirname(download_directory) + '/'

# mode('overwrite') replaces any previous output at the same path.
artists_df.write.mode('overwrite').parquet(save_file_path+"artists.parquet")
artists_prmry_prfsn_df.write.mode('overwrite').parquet(save_file_path+"artists_prmry_profession.parquet")
artists_knwn_fr_ttls_df.write.mode('overwrite').parquet(save_file_path+"artists_knwnfor_titles.parquet")