
# Practicum II - Movie Database Applicaton
### Collaborators: Abasamia Akpan, Rebecca Brent and Alessia Pizzoccheri

This Notebook provides context, documentation and sample code for a Database Application using IMBd datasets; for clarity, this document has been divided into the following sections:

### 1. Introduction
### 2. Assumptions
### 3. Logical Models
### 4. Database and Relations Definition¶
### 1. Introduction
#### 1.1. Looking at the data
In this section, we analyze and deconstruct the various data sets found on the IMBd page; data is organized in seven TSV files, specifically:

Title AKAs
Title Basics
Title Episodes
Title Ratings
Title Crew
Name Basics
Title Principles Our first step is to establish relationships between each data set and identify a primary key (PK) on each table. Since no information was provided regarding the role of columns, we devised a solution to find unique identifiers within each data set; that is, for each table we compare the total number of rows against the total number of unique values for a given column.
1.2 Establishing PKs
Following the system aforementioned, we identified the following primary keys

In [56]:

import pandas as pd
import numpy as np
import os
import sys
import pip
pip.main(['install','prettytable'])
pip.main(['install','mysql-connector-python'])
pip.main(['install','tqdm'])
pip.main(['install','pymysql'])

from tqdm import tqdm

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.




Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.




Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.




Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.




In [57]:
import mysql.connector
from prettytable import PrettyTable 
from prettytable import from_csv
from mysql.connector import Error

# constant variables
BATCH_SIZE = 100000

# global variables
mydb = None
cursor = None

def create_connection_nodb(host_name, user_name, user_password):
    """ create a connection to mysql host
    :param host_name: string host name (ie. "localhost")
    :param user_name: string username (ie. "root")
    :param user_password: string the password to connect to mysql
    :return: connection object
    """
    connection = None
    try:
        connection = mysql.connector.connect(
            host=host_name,
            user=user_name,
            passwd=user_password
        )
        print("Connection to MySQL DB successful")
    except Error as e:
        print(f"The error '{e}' occurred")

    return connection

def create_database(connection, db_name):
    """ create a database 
    :param connection: Connection object
    :param db_name: name of database in form of string
    :return:
    """
    cursor = connection.cursor()
    try:
        query = "CREATE DATABASE IF NOT EXISTS " + db_name
        cursor.execute(query)
        print("Database created successfully")
    except Error as e:
        print(f"The error '{e}' occurred")
        
def create_connection(host_name, user_name, user_password, db_name):
    """ create a connection to the (possibly newly created) database
    :param host_name: string host name (ie. "localhost")
    :param user_name: string username (ie. "root")
    :param user_password: string the password to connect to mysql
    :param db_name: string name of database 
    :return: connection object
    """
    try:
        mydb = mysql.connector.connect(
            host=host_name,
            user=user_name,
            passwd=user_password,
            database=db_name
        )
        print("Connection to MySQL DB successful")
    except Error as e:
        print(f"The error '{e}' occurred")

    return mydb

def create_table(conn, create_table_sql):
    """ create a table from the create_table_sql statement
    :param conn: Connection object
    :param create_table_sql: a CREATE TABLE statement
    :return:
    """
    try:
        c = conn.cursor()
        c.execute(create_table_sql)
        table_name = create_table_sql.split(" ")[5].split("(")[0]
        print("Successfully created table: " + table_name)
    except Error as e:
        print(e)
        
def main():
    
    database = "name_basics_preprocessed_new"
    host_name = "localhost"
    user_name = "root"
    user_password = "alupizzo92"
    
    create_table_titlebasics = """CREATE TABLE IF NOT EXISTS titlebasics(
                                    tconst VARCHAR(1000) PRIMARY KEY,
                                    titleType VARCHAR(1000),
                                    primaryTitle VARCHAR(1000),
                                    originalTitle VARCHAR(1000),
                                    isAdult BOOLEAN,
                                    startYear DATE,
                                    endYear DATE,
                                    runTimeMinutes TIME)"""
    
    create_table_titleratings = """CREATE TABLE IF NOT EXISTS titleratings(
                                    tconst VARCHAR(1000) PRIMARY KEY,
                                    averageRating FLOAT,
                                    numVotes INT,
                                    FOREIGN KEY (tconst) REFERENCES titlebasics(tconst))"""


    create_table_titleepisodes = """CREATE TABLE IF NOT EXISTS titleepisodes(
                                    tconst VARCHAR(1000) PRIMARY KEY,
                                    seasonNumber INT,
                                    episodeNumber INT,
                                    parentTconst VARCHAR(1000),
                                    FOREIGN KEY (parentTconst) REFERENCES titlebasics(tconst))"""

    create_table_titleAKAs = """CREATE TABLE IF NOT EXISTS titleAKAs(
                                AKAsID INT AUTO_INCREMENT PRIMARY KEY,
                                titleID VARCHAR(1000),
                                ordering INT,
                                title VARCHAR(1000),
                                region VARCHAR(1000),
                                language VARCHAR(1000),
                                isoriginaltitle BOOLEAN,
                                FOREIGN KEY (titleID) REFERENCES titlebasics(tconst))"""

    create_table_namebasics = """CREATE TABLE IF NOT EXISTS namebasics(
                                nconst VARCHAR(1000) PRIMARY KEY,
                                primaryname VARCHAR(1000),
                                birthyear DATE,
                                deathyear DATE,
                                noofmovies INT,
                                age INT,
                                currentdate DATE)"""
    
    create_table_principals = """CREATE TABLE IF NOT EXISTS principals(
                                principalsID INT AUTO_INCREMENT PRIMARY KEY,
                                tconst VARCHAR(1000),
                                ordering INT,
                                category VARCHAR(1000),
                                job VARCHAR(1000),
                                nconst VARCHAR(1000),
                                FOREIGN KEY (tconst) REFERENCES titlebasics(tconst),
                                FOREIGN KEY (nconst) REFERENCES namebasics(nconst))"""
    
    create_table_titlecrew = """CREATE TABLE IF NOT EXISTS titlecrew(
                                crewid INT AUTO_INCREMENT PRIMARY KEY,
                                tconst VARCHAR(1000) UNIQUE,
                                FOREIGN KEY (tconst) REFERENCES titlebasics(tconst))"""
    
    create_table_characters = """CREATE TABLE IF NOT EXISTS characters(
                                characterID INT AUTO_INCREMENT PRIMARY KEY,
                                name VARCHAR(1000) UNIQUE)"""
    
    create_table_writers = """CREATE TABLE IF NOT EXISTS writers(
                                writerID INT AUTO_INCREMENT PRIMARY KEY,
                                name VARCHAR(1000) UNIQUE)"""
    
    create_table_directors = """CREATE TABLE IF NOT EXISTS directors(
                                directorID INT AUTO_INCREMENT PRIMARY KEY,
                                name VARCHAR(1000) UNIQUE)"""
    
    create_table_genres = """CREATE TABLE IF NOT EXISTS genres(
                                genreID INT AUTO_INCREMENT  PRIMARY KEY,
                                genre VARCHAR(1000) UNIQUE)"""
    
    create_table_knownfortitles = """CREATE TABLE IF NOT EXISTS knownfortitles(
                                knownfortitlesID INT AUTO_INCREMENT PRIMARY KEY,
                                titles VARCHAR(1000) UNIQUE)"""
    
    create_table_primaryprofession = """CREATE TABLE IF NOT EXISTS primaryprofession(
                                primaryprofessionID INT AUTO_INCREMENT PRIMARY KEY,
                                profession VARCHAR(1000) UNIQUE)"""
    
    create_table_types = """CREATE TABLE IF NOT EXISTS types(
                                typeID INT AUTO_INCREMENT  PRIMARY KEY,
                                type VARCHAR(1000) UNIQUE)"""
    
    create_table_attributes = """CREATE TABLE IF NOT EXISTS attributes(
                                attributeID INT AUTO_INCREMENT PRIMARY KEY,
                                attribute VARCHAR(1000) UNIQUE)"""
    
    create_table_genretitlebasics = """CREATE TABLE IF NOT EXISTS genretitlebasics(
                                titlebasicsID VARCHAR(1000),
                                genreID INT,
                                PRIMARY KEY (titlebasicsID, genreID),
                                FOREIGN KEY (titlebasicsID) REFERENCES titlebasics(tconst),
                                FOREIGN KEY (genreID) REFERENCES genres(genreID))"""
    
    create_table_crewdirectors = """CREATE TABLE IF NOT EXISTS crewdirectors(
                                crewid INT,
                                directorid INT,
                                PRIMARY KEY (crewid ,directorid),
                                FOREIGN KEY (crewid) REFERENCES titlecrew(crewid),
                                FOREIGN KEY (directorid) REFERENCES directors(directorID))"""
    
    create_table_crewwriters = """CREATE TABLE IF NOT EXISTS crewwriters(
                                crewid INT,
                                writerid INT,
                                PRIMARY KEY (crewid, writerid),
                                FOREIGN KEY (crewid) REFERENCES titlecrew(crewid),
                                FOREIGN KEY (writerid) REFERENCES writers(writerid))"""
    
    create_table_principalscharacters = """CREATE TABLE IF NOT EXISTS principalscharacters(
                                principalsid INT,
                                characterid INT,
                                PRIMARY KEY (principalsid, characterid),
                                FOREIGN KEY (principalsid) REFERENCES principals(principalsID),
                                FOREIGN KEY (characterid) REFERENCES characters(characterid))"""
    
    create_table_AKAattributes = """CREATE TABLE IF NOT EXISTS AKAattributes(
                                akasid INT,
                                attributeid INT,
                                PRIMARY KEY (akasid, attributeid),
                                FOREIGN KEY (akasid) REFERENCES titleAKAs(AKAsID),
                                FOREIGN KEY (attributeid) REFERENCES attributes(attributeid))"""
    
    create_table_AKAtype = """CREATE TABLE IF NOT EXISTS AKAtype(
                                akasid INT,
                                typeid INT,
                                PRIMARY KEY (akasid, typeid),
                                FOREIGN KEY (akasid) REFERENCES titleAKAs(AKAsID),
                                FOREIGN KEY (typeid) REFERENCES type(typeid))"""
    
    create_table_primaryprofessiontitlenames = """CREATE TABLE IF NOT EXISTS primaryprofessiontitlenames(
                                primaryprofessionid INT,
                                namebasicsid VARCHAR(255),
                                PRIMARY KEY (primaryprofessionid, namebasicsid),
                                FOREIGN KEY (primaryprofessionid) REFERENCES primaryprofession(primaryprofessionid),
                                FOREIGN KEY (namebasicsid) REFERENCES namebasics(nconst))"""
    
    create_table_knownfortitlenames = """CREATE TABLE IF NOT EXISTS knownfortitlenames(
                                namebasicsid VARCHAR(255),
                                knownfortitlesid INT,
                                PRIMARY KEY (namebasicsid, knownfortitlesid),
                                FOREIGN KEY (namebasicsid) REFERENCES namebasics(nconst),
                                FOREIGN KEY (knownfortitlesid) REFERENCES knownfortitles(knownfortitlesid))"""
    
    conn = create_connection_nodb(host_name, user_name, user_password)
    
    # create database object if not already created and then connect the cursor to it
    if conn is not None:
        create_database(conn, database)
        create_connection(host_name, user_name, user_password, database)

        # create tables
        if conn is not None:
            # create Title Basics table
            create_table(conn, create_table_titlebasics)

            # create Title Ratings table
            create_table(conn, create_table_titleratings)
            
            # create title episodes table
            create_table(conn, create_table_titleepisodes)
            
            # create title AKAs table
            create_table(conn, create_table_titleAKAs)
            
            # create name basics table
            create_table(conn, create_table_namebasics)
            
            # create principals table
            create_table(conn, create_table_principals)
            
            # create title crew table
            create_table(conn, create_table_titlecrew)
            
            # create characters table
            create_table(conn,create_table_characters)
            
            # create writers table
            create_table(conn,create_table_writers)
            
            # create directors table
            create_table(conn,create_table_directors)
            
            # create genres table
            create_table(conn,create_table_genres)
            
            # create known for titles table
            create_table(conn,create_table_knownfortitles)
            
            # create primary profession table
            create_table(conn,create_table_primaryprofession)
            
            # create type table
            create_table(conn,create_table_types)
            
            # create attributes table
            create_table(conn,create_table_attributes)
            
            # create genre title basics table
            create_table(conn, create_table_genretitlebasics)
            
            # create crew directors table
            create_table(conn,create_table_crewdirectors)
            
            # create crew writers table
            create_table(conn,create_table_crewwriters)
            
            # create principals characters table
            create_table(conn, create_table_principalscharacters)
            
            # create AKA Attributes table
            create_table(conn, create_table_AKAattributes)
            
            # create AKA Type table
            create_table(conn, create_table_AKAtype)
            
            # create Primary Profession Title Names table
            create_table(conn, create_table_primaryprofessiontitlenames)
            
            # create known for title names table
            create_table(conn, create_table_knownfortitlenames)
                
        else:
            print("Error! cannot create the database connection.")
    
    else:
        print("Error! Cannot create the MySQL connection.")
    
    


if __name__ == '__main__':
    main()


Connection to MySQL DB successful
Database created successfully
Connection to MySQL DB successful
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected
1046 (3D000): No database selected


## SQL Communication Helper

In [58]:
import sqlalchemy
database_username = 'root'
database_password = 'alupizzo92'
database_ip       = 'localhost'
database_name     = 'imbd_application'
database_connection = sqlalchemy.create_engine('mysql+mysqlconnector://{0}:{1}@{2}/{3}'.
                                               format(database_username, database_password, 
                                                      database_ip, database_name))

In [59]:
## Helper Functions

In [60]:
# define load_dataframe function
def load_dataframe(data_frame,table_name):
    data_frame_new = data_frame
    data_frame_new.to_sql(table_name,con=database_connection, if_exists='append',index=False)

# define batch function
def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

## Title Basics

In [61]:
# read title basic file
title_basics = pd.read_csv('title.basics.tsv', delimiter='\t',encoding='utf-8')
# UNCOMMENT TO DISPLAY ALL DATA
# title_basics

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [62]:
# preprocess the title basics data
def preprocess_dataframe_title_basics(data_frame):
    title_basics_new_series = data_frame

    b = pd.to_datetime(title_basics_new_series['runtimeMinutes'], errors='coerce', format='%M')

    b.to_frame().runtimeMinutes
    df_1 = pd.DataFrame(dict(time_stamps = b)) 
    
    # convert time stamp to minutes
    df_1['runtimeMinutes'] = df_1['time_stamps'].dt.minute
    title_basics_new_series = title_basics_new_series.drop(['runtimeMinutes'],axis=1)
    title_basics_new_series['runtimeMinutes'] = df_1['runtimeMinutes']
    title_basics_new_frame_transpose = title_basics_new_series.astype({'isAdult': 'int32'})
    title_basics_new_frame_transpose["isAdult"]=title_basics_new_frame_transpose.isAdult.mask(title_basics_new_frame_transpose.isAdult > 1,1)

    #needs to be fixed
    title_basics_new_frame_transpose['startYear'] = pd.to_datetime(title_basics.startYear, errors='coerce', format='%Y')
    title_basics_new_frame_transpose['endYear']= pd.to_datetime(title_basics.endYear, errors='coerce', format='%Y')

    title_basics_new_frame_transpose=title_basics_new_frame_transpose.drop(['genres'],axis=1)

    # constuct new row
    new_row = {
        'tconst':None, 
        'titleType':None, 
        'primaryTitle':None,
        'originalTitle':None,
        'isAdult':None, 
        'startYear':None,
        'endYear':None,
        'runtimeMinutes':None         
    } 
  
    title_basics_new_frame_transpose = title_basics_new_frame_transpose.append(new_row, ignore_index=True)
    return title_basics_new_frame_transpose

title_basics_preprocessed = preprocess_dataframe_title_basics(title_basics)

In [63]:
# print total count of data rows
print('Total size of data: ', len(title_basics_preprocessed))

Total size of data:  7300502


In [64]:
length = len(title_basics_preprocessed) // BATCH_SIZE + 1

for chunk in tqdm(batch(title_basics_preprocessed, BATCH_SIZE), total = length):
    # call helper function
    load_dataframe(chunk, 'titlebasics')

100%|██████████| 74/74 [03:11<00:00,  2.59s/it]


## Title ratings ##

In [65]:
# read data from title ratings file
title_ratings = pd.read_csv('title.ratings.tsv', delimiter='\t',encoding='utf-8')

In [66]:
# preprocess the title ratings data
def preprocess_dataframe_title_ratings(data_frame):
    title_ratings_new_series = data_frame
    
    title_ratings_new_frame_avg_rating = title_ratings_new_series.astype({'averageRating': 'float'})
    title_ratings_new_frame_avg_rating = title_ratings_new_series.astype({'numVotes': 'int32'})
  
    return title_ratings_new_frame_avg_rating

title_ratings_preprocessed = preprocess_dataframe_title_ratings(title_ratings)

In [67]:
# print total count of data rows
print('Total size of data: ', len(title_ratings_preprocessed))

Total size of data:  1086514


In [68]:
length = len(title_ratings_preprocessed) // BATCH_SIZE + 1

for chunk in tqdm(batch(title_ratings_preprocessed, BATCH_SIZE), total = length):
    load_dataframe(chunk, 'titleratings')

100%|██████████| 11/11 [00:14<00:00,  1.29s/it]


## Title Episodes ##

In [69]:
# read title episodes data
title_episodes = pd.read_csv('title.episode.tsv', delimiter='\t',encoding='utf-8')

In [70]:
# preprocess the title episodes data
def preprocess_dataframe_title_episodes(data_frame):
    title_episodes_new_series=data_frame
    title_episodes_coerce_seasonNum = pd.to_numeric(title_episodes_new_series['seasonNumber'], errors='coerce')
    title_episodes_coerce_episodeNum = pd.to_numeric(title_episodes_new_series['episodeNumber'], errors='coerce')

    df_1 = pd.DataFrame(dict(test_column_1 = title_episodes_coerce_seasonNum)) 
    df_2 = pd.DataFrame(dict(test_column_2 = title_episodes_coerce_episodeNum)) 

    df_1['seasonNumber'] = df_1['test_column_1']
    df_2['episodeNumber'] = df_2['test_column_2']
    
    title_episodes_new_frame = title_episodes_new_series.drop(['seasonNumber'],axis=1)
    title_episodes_new_frame = title_episodes_new_series.drop(['episodeNumber'],axis=1)
    
    title_episodes_new_frame['seasonNumber'] = df_1['seasonNumber']
    title_episodes_new_frame['episodeNumber'] = df_2['episodeNumber']
       
    return title_episodes_new_frame

title_episodes_preprocessed = preprocess_dataframe_title_episodes(title_episodes)
title_episodes_preprocessed.tconst = title_episodes_preprocessed.tconst.mask(~title_episodes_preprocessed.tconst.isin(title_basics_preprocessed.tconst), None)

In [71]:
# print total count of data rows
print('Total size of data: ', len(title_episodes_preprocessed))

Total size of data:  5261727


In [72]:
length = len(title_episodes_preprocessed) // BATCH_SIZE +1

for chunk in tqdm(batch(title_episodes_preprocessed, BATCH_SIZE), total = length):
    # call helper function
    load_dataframe(chunk, 'titleepisodes')

100%|██████████| 53/53 [01:21<00:00,  1.53s/it]


## Title Akas ##

In [73]:
title_akas = pd.read_csv('title.akas.tsv', delimiter='\t',encoding='utf-8')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [74]:
# preprocess title AKAs data
def preprocess_dataframe_title_akas(data_frame):
    title_akas_new_series=data_frame
    title_akas_new_series=title_akas_new_series.drop(['types', 'attributes'],axis=1)
    
    title_akas_coerce_ordering = pd.to_numeric(title_akas_new_series['ordering'], errors='coerce')
    title_akas_coerce_isOriginalTitle = pd.to_numeric(title_akas_new_series['isOriginalTitle'], errors='coerce')

    df_1 = pd.DataFrame(dict(test_column_1 = title_akas_coerce_ordering)) 
    df_2 = pd.DataFrame(dict(test_column_2 = title_akas_coerce_isOriginalTitle)) 

    df_1['ordering'] = df_1['test_column_1']
    df_2['isOriginalTitle'] = df_2['test_column_2']
    
    title_akas_new_frame=title_akas_new_series.drop(['ordering'],axis=1)
    title_akas_new_frame=title_akas_new_series.drop(['isOriginalTitle'],axis=1)
    
    title_akas_new_frame['ordering']=df_1['ordering']
    title_akas_new_frame['isOriginalTitle']=df_2['isOriginalTitle']
    return title_akas_new_frame

title_akas_preprocessed = preprocess_dataframe_title_akas(title_akas)
title_akas_preprocessed[~title_akas_preprocessed.titleId.isin(title_basics_preprocessed.tconst)]

Unnamed: 0,titleId,ordering,title,region,language,isOriginalTitle
75588,tt0021006,1,"Ja, der Himmel über Wien",AT,\N,
78146,tt0021453,1,Tapping Toes,US,\N,
87274,tt0023019,1,Hollywood on Parade,US,\N,
97677,tt0024677,1,Tom's in Town,US,\N,
174270,tt0036165,1,Missing Men,US,\N,
...,...,...,...,...,...,...
20596836,tt7978886,1,State of Defacto,AM,\N,0.0
20981546,tt8206494,1,Blood for Dust,US,\N,0.0
21420340,tt8466868,1,Butaca,ES,\N,0.0
22287099,tt8982514,1,To The Bitter End,US,\N,0.0


In [75]:
title_akas_preprocessed.titleId=title_akas_preprocessed.titleId.mask(~title_akas_preprocessed.titleId.isin(title_basics_preprocessed.tconst),None)

In [None]:
length = len(title_akas_preprocessed) // BATCH_SIZE + 1

for chunk in tqdm(batch(title_akas_preprocessed, BATCH_SIZE), total = length):
    # call helper function
    load_dataframe(chunk, 'titleAKAs')

 13%|█▎        | 32/240 [01:03<06:47,  1.96s/it]

## Name Basics

In [None]:
name_basics = pd.read_csv('name.basics.tsv', delimiter='\t',encoding='utf-8')

In [None]:
def preprocess_dataframe_name_basics(data_frame):
    name_basics_new_series=data_frame
    name_basics_new_series=name_basics_new_series.drop(['primaryProfession', 'knownForTitles'],axis=1)
    
    name_basics_new_series['birthYear']= pd.to_datetime(name_basics_new_series.birthYear, errors='coerce', format='%Y')

    #title_basics_new_frame_transpose['startYear']=[time.date() for time in title_basics_new_frame_transpose['startYear']]
    
    name_basics_new_series['deathYear']= pd.to_datetime(name_basics_new_series.deathYear, errors='coerce', format='%Y')
    
    return name_basics_new_series

name_basics_preprocessed = preprocess_dataframe_name_basics(name_basics)

name_basics_preprocessed["noofmovies"] = ""
name_basics_preprocessed["age"] = ""
name_basics_preprocessed["currentdate"] = ""

In [None]:
name_basics_preprocessed_new = pd.to_numeric(name_basics_preprocessed['noofmovies'], errors = 'coerce')
name_basics_preprocessed_new = pd.to_numeric(name_basics_preprocessed['age'], errors = 'coerce')
name_basics_preprocessed['currentdate'] = pd.to_datetime(name_basics_preprocessed.currentdate, errors = 'coerce', format = '%Y')

In [None]:
df_1 = pd.DataFrame(dict(test_column_1 = name_basics_preprocessed_new)) 
df_2 = pd.DataFrame(dict(test_column_2 = name_basics_preprocessed_new)) 

df_1['noofmovies'] = df_1['test_column_1']
df_2['age'] = df_2['test_column_2']

name_basics_preprocessed['noofmovies'] = df_1['noofmovies']
name_basics_preprocessed['age'] = df_2['age']
name_basics_preprocessed['currentdate'] = pd.to_datetime(name_basics_preprocessed['currentdate'])

## Approach 1- With sql alchemy ##

In [None]:
length = len(name_basics_preprocessed) // BATCH_SIZE +1

for chunk in tqdm(batch(name_basics_preprocessed,BATCH_SIZE), total = length):
    load_dataframe(chunk, 'namebasics')

In [None]:
# UNCOMMENT TO DISPLAY TABLE ROWS
# mydb = mysql.connector.connect(
#     host="localhost",
#     user="root",
#     passwd="alupizzo92",
#     database="imbd_application"
# )
# mycursor = mydb.cursor()

# query = "SELECT * FROM namebasics LIMIT 5;"
# mycursor.execute(query)

## Principals-will insert when nconst is in the right format  ##

In [None]:
title_principals = pd.read_csv('title.principals.tsv', delimiter = '\t',encoding = 'utf-8')

## Title crew ##

In [None]:
title_crew = pd.read_csv('title.crew.tsv', delimiter = '\t',encoding = 'utf-8')

In [None]:
title_crew_processed = title_crew.drop(['directors'], axis = 1)
title_crew_processed = title_crew.drop(['writers'], axis = 1)

In [None]:
length = len(title_crew_processed) // BATCH_SIZE + 1

for chunk in tqdm(batch(title_crew_processed, BATCH_SIZE), total = length):
    load_dataframe(chunk,'titlecrew')

## Writers ##

In [None]:
def preprocess_dataframe_parsing(data_frame, column_name):
    
    data_frame = data_frame.to_frame()
    data_frame = data_frame.assign(name = data_frame[column_name].str.split(","))
    #type(x.iloc[0,1])
    data_frame = data_frame.explode('name')
    
    return data_frame
writers_preprocessed = preprocess_dataframe_parsing(title_crew.writers, "writers")

In [None]:
length = len(writers_preprocessed) // BATCH_SIZE + 1

for chunk in tqdm(batch(writers_preprocessed, BATCH_SIZE), total = length):
    load_dataframe(chunk, 'writers')

## Directors ##

In [None]:
directors_preprocessed = preprocess_dataframe_parsing(title_crew.directors, "directors")

In [None]:
length = len(directors_preprocessed) // BATCH_SIZE + 1

for chunk in tqdm(batch(directors_preprocessed, BATCH_SIZE), total = length):
    # call helper function
    load_dataframe(chunk, 'directors')

## Genres ##

In [None]:
genres_preprocessed = preprocess_dataframe_parsing(title_basics.genres, 'genres')

In [None]:
length = len(genres_preprocessed) // BATCH_SIZE + 1

for chunk in tqdm(batch(genres_preprocessed, BATCH_SIZE), total = length):
    # call helper function
    load_dataframe(chunk, 'genres')

## Known for titles ##

In [None]:
knownForTitles_preprocessed = preprocess_dataframe_parsing(name_basics.knownForTitles, 'knownForTitles')

In [None]:
length = len(knownForTitles_preprocessed) // BATCH_SIZE + 1

for chunk in tqdm(batch(knownForTitles_preprocessed, BATCH_SIZE), total = length):
    # call helper function
    load_dataframe(chunk, 'knownForTitles')

## primary profession ##

In [None]:
pprofession_preprocessed = preprocess_dataframe_parsing(name_basics.primaryProfession, 'primaryProfession')

In [None]:
length = len(pprofession_preprocessed) // BATCH_SIZE + 1

for chunk in tqdm(batch(pprofession_preprocessed, BATCH_SIZE), total = length):
    # call helper function
    load_dataframe(chunk, 'primaryProfession')

## types ## 

In [None]:
types_preprocessed = preprocess_dataframe_parsing(title_akas.types, 'types')

In [None]:
length = len(types_preprocessed) // BATCH_SIZE + 1

for chunk in tqdm(batch(types_preprocessed, BATCH_SIZE), total = length):
    # call helper function
    load_dataframe(chunk, 'types')

## attributes ##

In [None]:
attributes_preprocessed = preprocess_dataframe_parsing(title_akas.attributes, 'attributes')

In [None]:
length = len(attributes_preprocessed) // BATCH_SIZE + 1

for chunk in tqdm(batch(attributes_preprocessed, BATCH_SIZE), total = length):
    load_dataframe(chunk, 'attributes')

batch_size=100000
length=len(types_preprocessed)//batch_size+1
for chunk in tqdm(batch(types_preprocessed,batch_size),total=length):
    load_dataframe(chunk,'attributes')

## Characters ##

In [None]:
def preprocess_dataframe_parsing(data_frame, column_name):
    
    data_frame=data_frame.to_frame()
    data_frame=data_frame.assign(=data_frame[column_name].str.replace('[\[\]"]',"").str.split(","))
    #type(x.iloc[0,1])
    data_frame=data_frame.explode('name')
    
    return data_frame
characters_preprocessed=preprocess_dataframe_parsing(title_principals.characters, "characters")

In [None]:
length = len(characters_preprocessed) // BATCH_SIZE + 1

for chunk in tqdm(batch(directors_preprocessed, BATCH_SIZE), total = length):
    load_dataframe(chunk, 'characters')