In [1]:
import os
import glob
import psycopg2
import pandas as pd
import numpy as np
from Create_Table_queries import *
import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

def process_youtubedata_file(cur, conn, filepath): 
    """
        This function reads one JSON file and read information of videos and youtuber data and saves into video_data and youtuber_data
        Arguments:
        cur: Database Cursor
        filepath: location of JSON files
        Return: None
    """
    # open JSON file
    df = pd.read_json(filepath)
    df.fillna('')
    
    # ---------insert youtuber record----------
    # write your code here that reads youtuber data from JSON file and insert it into Youtubers_dim table 
    # write your code here
    
    youtuber_data = df[['youtuber_id', 'youtuber_name', 'youtuber_location', 'youtuber_latitude', 'youtuber_longitude']]
    youtuber_data = youtuber_data.drop_duplicates()
    youtuber_data = youtuber_data.values.tolist()

    try: 
        cur.executemany(Youtubers_table_insert, youtuber_data)
    except psycopg2.Error as e:
        print(e)
    

    
    # ---------insert video record--------------
    # write your code here that reads youtube videos data from JSON file and insert it into Videos_dim table 
    # write your code here
    
    video_data = df[['video_id', 'title', 'youtuber_id', 'year', 'duration']]
    video_data = video_data.drop_duplicates(keep='first')
    video_data = video_data.values.tolist()
    
    try: 
        cur.executemany(Videos_table_insert, video_data)
    except psycopg2.Error as e:
        print(e)

def process_log_file(cur, conn, filepath):
    """
        This function reads Log files and reads information of time, user and videoplay data and saves into time, user, videoplay
        Arguments:
        cur: Database Cursor
        filepath: location of Log files
        Return: None
    """

    # open log file
    df = pd.read_json(filepath, lines=True)

    # filter by NextVideo action
    df = df[(df['page'] == 'NextVideo')]

    # convert timestamp column to datetime with 'pd.to_datetime(df["ts"], unit="ms")' 
    df['ts'] = pd.to_datetime(df["ts"], unit="ms")
    
    # insert time data records to Time_dim table
    # write your code here
    df['year'] =  pd.to_datetime(df['ts']).dt.year
    df['month'] =  pd.to_datetime(df['ts']).dt.month
    df['day'] =  pd.to_datetime(df['ts']).dt.day
    df['hour'] =  pd.to_datetime(df['ts']).dt.hour
    df['week'] =  pd.to_datetime(df['ts']).dt.week
    df['weekday'] =  pd.to_datetime(df['ts']).dt.weekday
    df['start_time'] =  pd.to_datetime(df['ts'])
    
    time_data = df[['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday']]
    time_data = time_data.drop_duplicates(keep='first')
    time_data = time_data.values
    try: 
        cur.executemany(Time_table_insert, time_data)
    except psycopg2.Error as e:
        print(e)

    
    # load user table
    # insert user records into Users_dim table
    # write your code here
    user_data = df[['userId', 'firstName', 'lastName', 'gender', 'level']]
    user_data = user_data.drop_duplicates(keep='first')
    user_data = user_data.values
    try: 
        cur.executemany(Users_table_insert, user_data)
    except psycopg2.Error as e:
        print(e)

    
    # insert Videoplay records in Videoplay_fact table
    # write your code here
    
    # reading videos data
    query = 'SELECT * from videos_dim;'
    videos = pd.read_sql_query(query,conn)
    
    # merging tables to get information of video_id and youtuber_id
    df1 = pd.merge(df, videos, how='inner', left_on = 'video', right_on = 'title')
    
    videoplay_data = df1[['start_time', 'userId', 'level','video_id', 'youtuber_id', 'sessionId','location','userAgent']]
    videoplay_data = videoplay_data.drop_duplicates(keep='first')
    videoplay_data = videoplay_data.values
    try: 
        cur.executemany(Videoplay_table_insert, videoplay_data)
    except psycopg2.Error as e:
        print(e)
    
def getListOfFiles(dirName):
    # create a list of file and sub directories 
    # names in the given directory 
    listOfFile = os.listdir(dirName)
    allFiles = list()
    # Iterate over all the entries
    for entry in listOfFile:
        # Create full path
        fullPath = os.path.join(dirName, entry)
        # If entry is a directory then get the list of files in this directory 
        if os.path.isdir(fullPath):
            allFiles = allFiles + getListOfFiles(fullPath)
        else:
            allFiles.append(fullPath)
                
    return allFiles 

def process_data(cur, conn, filepath, func):
    """
        This function get all JSON files in given directory by exploring all sub directories, and process all files that were found using the given function.
        Example: if I give it the path to youtube_data directory which resides in data folder of this assignment,
        and func given is process_youtubedata_file it should get all JSON files in this directories and process each file using process_youtubedata_file function. 
        Arguments:
        cur: Database Cursor
        conn: Database
        filepath: location of JSON files
        func: function to process all files in the directory
        Return: None
    """
    file_list = getListOfFiles(filepath)
    for file in file_list:
        func(cur,conn,file)



def main():
    conn = psycopg2.connect("host=127.0.0.1 dbname=youtubedb user=postgres password=ali123ali")
    cur = conn.cursor()
    conn.set_session(autocommit=True)

    process_data(cur, conn, filepath='data/youtube_data', func=process_youtubedata_file)
    process_data(cur, conn, filepath='data/log_data', func=process_log_file)

    conn.close()


if __name__ == "__main__":
    main()

duplicate key value violates unique constraint "youtubers_dim_pkey"
DETAIL:  Key (youtuber_id)=(ARD7TVE1187B99BFB1) already exists.

duplicate key value violates unique constraint "videos_dim_pkey"
DETAIL:  Key (video_id)=(SOMZWCG12A8C13C480) already exists.

duplicate key value violates unique constraint "youtubers_dim_pkey"
DETAIL:  Key (youtuber_id)=(ARMJAGH1187FB546F3) already exists.

duplicate key value violates unique constraint "videos_dim_pkey"
DETAIL:  Key (video_id)=(SOCIWDW12A8C13D406) already exists.

duplicate key value violates unique constraint "youtubers_dim_pkey"
DETAIL:  Key (youtuber_id)=(ARKRRTF1187B9984DA) already exists.

duplicate key value violates unique constraint "videos_dim_pkey"
DETAIL:  Key (video_id)=(SOXVLOJ12AB0189215) already exists.

duplicate key value violates unique constraint "youtubers_dim_pkey"
DETAIL:  Key (youtuber_id)=(AR7G5I41187FB4CE6C) already exists.

duplicate key value violates unique constraint "videos_dim_pkey"
DETAIL:  Key (video_id

duplicate key value violates unique constraint "videos_dim_pkey"
DETAIL:  Key (video_id)=(SOAOIBZ12AB01815BE) already exists.

duplicate key value violates unique constraint "youtubers_dim_pkey"
DETAIL:  Key (youtuber_id)=(AR9AWNF1187B9AB0B4) already exists.

duplicate key value violates unique constraint "videos_dim_pkey"
DETAIL:  Key (video_id)=(SOZHPGD12A8C1394FE) already exists.

duplicate key value violates unique constraint "youtubers_dim_pkey"
DETAIL:  Key (youtuber_id)=(AROGWRA122988FEE45) already exists.

duplicate key value violates unique constraint "videos_dim_pkey"
DETAIL:  Key (video_id)=(SOSLAVG12A8C13397F) already exists.

duplicate key value violates unique constraint "youtubers_dim_pkey"
DETAIL:  Key (youtuber_id)=(ARMBR4Y1187B9990EB) already exists.

duplicate key value violates unique constraint "videos_dim_pkey"
DETAIL:  Key (video_id)=(SOTTDKS12AB018D69B) already exists.

duplicate key value violates unique constraint "youtubers_dim_pkey"
DETAIL:  Key (youtuber_id

duplicate key value violates unique constraint "users_dim_pkey"
DETAIL:  Key (user_id)=(8) already exists.

duplicate key value violates unique constraint "time_dim_pkey"
DETAIL:  Key (start_time)=(2018-11-02 01:25:34.796) already exists.

duplicate key value violates unique constraint "users_dim_pkey"
DETAIL:  Key (user_id)=(101) already exists.

duplicate key value violates unique constraint "time_dim_pkey"
DETAIL:  Key (start_time)=(2018-11-03 01:05:50.796) already exists.

duplicate key value violates unique constraint "users_dim_pkey"
DETAIL:  Key (user_id)=(53) already exists.

duplicate key value violates unique constraint "time_dim_pkey"
DETAIL:  Key (start_time)=(2018-11-04 00:50:03.796) already exists.

duplicate key value violates unique constraint "users_dim_pkey"
DETAIL:  Key (user_id)=(99) already exists.

duplicate key value violates unique constraint "time_dim_pkey"
DETAIL:  Key (start_time)=(2018-11-05 00:33:12.796) already exists.

duplicate key value violates unique 