This notebook is loading all the csv files into a database (sqlite).

In [160]:
import pandas as pd
import numpy as np
import re
import json
import sqlite3

# Database Schema
provided under a txt file

In [161]:
f1 = open("data/schema.txt","r")
tables_schema = {}
flag = 0
tmp = []

# Isolate the schema of each table into a dict
for line in f1.readlines():
    if 'csv' in line:
        if flag > 0:
            tables_schema[key] = tmp
            tmp = []
        key = line.split('.csv')[0]
        flag += 1
    if flag > 0: 
        tmp.append(line)

In [162]:
# These are the different tables of the database
tables_schema.keys()

dict_keys(['circuits', 'constructor_results', 'constructor_standings', 'constructors', 'driver_standings', 'drivers', 'lap_times', 'pit_stops', 'qualifying', 'races', 'results', 'seasons'])

In [163]:
tables_schema

{'circuits': ['circuits.csv\n',
  '+------------+--------------+------+-----+---------+----------------+\n',
  '| Field      | Type         | Null | Key | Default | Extra          |\n',
  '+------------+--------------+------+-----+---------+----------------+\n',
  '| circuitId  | int(11)      | NO   | PRI | NULL    | auto_increment |\n',
  '| circuitRef | varchar(255) | NO   |     |         |                |\n',
  '| name       | varchar(255) | NO   |     |         |                |\n',
  '| location   | varchar(255) | YES  |     | NULL    |                |\n',
  '| country    | varchar(255) | YES  |     | NULL    |                |\n',
  '| lat        | float        | YES  |     | NULL    |                |\n',
  '| lng        | float        | YES  |     | NULL    |                |\n',
  '| alt        | int(11)      | YES  |     | NULL    |                |\n',
  '| url        | varchar(255) | NO   | UNI |         |                |\n',
  '+------------+--------------+------+-----

Each schema is represented following a table format. In order to transform it into a usable query, I need to parse the table.

# Loading into the db

## Loading the schema of each table

In [164]:
# First, let's open a connection to the database, which I called "Formula1"
conn = sqlite3.connect('formula1.sqlite')
cur = conn.cursor()

In [167]:
# Iteration over each table (key)
for key in tables_schema.keys():
    query = "DROP TABLE IF EXISTS {};\nCREATE TABLE {} ( \n".format(key, key)
    # We don't need the first 4 lines of each string (headers) != columns of the tables I will put into the db
    tmp = tables_schema[key][4:]
    # Parsing each line into a variable of the table
    for line in tmp:
        tmp2 = re.findall(r"[\w(\d)]+", line.replace('int(11)', 'INTEGER').replace('0000-00-00','').replace('0','DEFAULT 0').replace('UNI','UNIQUE').replace('NULL','').replace('NO','NOT NULL').replace('YES','').replace('PRI','PRIMARY KEY').replace('auto_increment','AUTOINCREMENT UNIQUE'))
        if len(tmp2) > 0:
            query += ' '.join(tmp2) + ',\n'
    query = query[:-2] + ')'
    # Applying some refinements for the primary keys
    if len(re.findall('PRIMARY KEY', query)) > 1:
        test = query.split('\n')
        test2 = []
        pri_key = []
        for x in test:
            if 'PRIMARY KEY' in x:
                var = x.split()[0]
                pri_key.append(var)
                y = x.replace('PRIMARY KEY', '')
                test2.append(y)
            else:
                test2.append(x)
        test2[-1] = test2[-1][:-1] + ','
        test2.append('PRIMARY KEY ({})\n)'.format(','.join(pri_key)))
        
        # This is the final query
        query = '\n'.join(test2)
    print(query)
    cur.executescript(query)

DROP TABLE IF EXISTS circuits;
CREATE TABLE circuits ( 
circuitId INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
circuitRef varchar(255) NOT NULL,
name varchar(255) NOT NULL,
location varchar(255),
country varchar(255),
lat float,
lng float,
alt INTEGER,
url varchar(255) NOT NULL UNIQUE)
DROP TABLE IF EXISTS constructor_results;
CREATE TABLE constructor_results ( 
constructorResultsId INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
raceId INTEGER NOT NULL DEFAULT 0,
constructorId INTEGER NOT NULL DEFAULT 0,
points float,
status varchar(255))
DROP TABLE IF EXISTS constructor_standings;
CREATE TABLE constructor_standings ( 
constructorStandingsId INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
raceId INTEGER NOT NULL DEFAULT 0,
constructorId INTEGER NOT NULL DEFAULT 0,
points float NOT NULL DEFAULT 0,
position INTEGER,
positionText varchar(255),
wins INTEGER NOT NULL DEFAULT 0)
DROP TABLE IF EXISTS constructors;
CREATE TABLE constructors ( 
constructorId INTEGER NOT NULL PRIMAR

## Loading the data into each table

In [169]:

for key in tables_schema.keys():
    df = pd.read_csv('data/f1/{}.csv'.format(key))
    df = df.replace('\\N', np.nan)
    cols = list(df.columns)
    values = ['?'] * len(cols)
    cur.executemany("INSERT OR IGNORE INTO {} ({}) VALUES ({});".format(key,', '.join(cols), ', '.join(['?'] * len(cols))), df.values)
    
conn.commit()
conn.close()