Data downloaded from:
[http://data.dft.gov.uk/anonymised-mot-test/test_data/dft_test_result_2019.zip](https://data.gov.uk/dataset/e3939ef8-30c7-4ca8-9c7c-ad9475cc9b2f/anonymised-mot-tests-and-results)

The zip file was extracted to local disk, and the 4 CSV files it contains were placed in the `mot-data/data` folder.

In [1]:
# Import and create a new database file
import sqlite3
from sqlite3 import Error
import os

def create_connection(path):
    """Open (or create) the SQLite database file at ``path``.

    A confirmation is printed on success. If ``sqlite3.connect`` raises a
    ``sqlite3.Error``, the error is printed instead of being propagated.

    Returns:
        The open ``sqlite3.Connection``, or ``None`` when connecting failed.
    """
    try:
        db = sqlite3.connect(path)
    except Error as e:
        print(f"The error '{e}' occurred")
        return None

    print("Connection to SQLite DB successful")
    return db

# Notebook-wide database handle used by the cells below; the path is relative to the
# current working directory. create_connection returns None if the file cannot be opened.
connection = create_connection(os.path.join('data','mot.sqlite'))


Connection to SQLite DB successful


In [2]:
# Create function to simply execute query
def execute_query(connection, query, firstonly=False, verbose=True, params=()):
    """Run one SQL statement, commit, and return the rows it produced.

    Args:
        connection: open ``sqlite3`` connection.
        query: SQL text; may contain ``?`` placeholders when ``params`` is given.
        firstonly: if true, return only the first row (``fetchone``) instead
            of the full list (``fetchall``).
        verbose: if true, print a confirmation after a successful execution.
        params: optional sequence of values bound to the placeholders in
            ``query``. Defaults to no parameters, so existing calls are unaffected.

    Returns:
        A list of row tuples (empty for statements that yield no rows), or,
        with ``firstonly``, a single row tuple or ``None``. When SQLite reports
        an error it is printed and the empty result (``[]`` / ``None``) is
        returned.
    """
    cursor = connection.cursor()
    try:
        cursor.execute(query, params)
        connection.commit()
    except Error as e:
        print(f"The error '{e}' occurred")
        # Return the empty result explicitly instead of fetching from a
        # cursor whose statement failed.
        return None if firstonly else []

    if verbose:
        print("Query executed successfully")
    return cursor.fetchone() if firstonly else cursor.fetchall()

def insert_query(connection, query, t, verbose=True):
    """Execute a parameterised statement with the values in ``t`` and commit.

    Args:
        connection: open ``sqlite3`` connection.
        query: SQL text containing ``?`` placeholders.
        t: sequence of values bound to the placeholders.
        verbose: if true, print a confirmation after a successful commit.

    Returns:
        None. A ``sqlite3.Error`` is printed rather than raised.
    """
    cur = connection.cursor()
    try:
        cur.execute(query, t)
        connection.commit()
    except Error as e:
        print(f"The error '{e}' occurred")
    else:
        if verbose:
            print("Query executed successfully")

In [3]:
import pandas as pd
import os

# Column names applied to every read of the MOT results CSV, in file order.
names = [
    'test_id', 'vehicle_id', 'test_date', 'test_class_id', 'test_type',
    'test_result', 'test_mileage', 'postcode_area', 'make', 'model',
    'colour', 'fuel_type', 'cylinder_capacity', 'first_use_date',
]
# The three id columns are parsed as integers; every other column is read as 'object'.
dtypes = {
    column: ('int' if column in ('test_id', 'vehicle_id', 'test_class_id') else 'object')
    for column in names
}
# Per-column missing-value marker: -1 for test_mileage.
# NOTE(review): presumably meant for read_csv(na_values=...); no visible cell passes it.
na_values = {'test_mileage': -1}

In [4]:
# Create mot1 table
# Drop any existing copy first so that re-running this cell starts from an empty table.
execute_query(connection,'''
DROP TABLE IF EXISTS mot1;
''')

# Columns are declared in the same order as the `names` list used when reading the CSV.
# Both date columns are declared TEXT; test_mileage and cylinder_capacity are declared INTEGER.
execute_query(connection, '''
CREATE TABLE mot1 (
  test_id INTEGER,
  vehicle_id INTEGER,
  test_date TEXT,
  test_class_id INTEGER,
  test_type TEXT,
  test_result TEXT,
  test_mileage INTEGER,
  postcode_area TEXT,
  make TEXT,
  model TEXT,
  colour TEXT,
  fuel_type TEXT,
  cylinder_capacity INTEGER,
  first_use_date TEXT
);
''')

KeyboardInterrupt: 

In [None]:
# Stackoverflow
from datetime import datetime

def _to_sql_timestamp(value):
    """Turn a 'YYYY-MM-DD' string into 'YYYY-MM-DD HH:MM:SS'; return None if it cannot be parsed."""
    try:
        return datetime.strptime(value, '%Y-%m-%d').strftime('%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError):
        # Non-string (e.g. NaN) or malformed date: store NULL instead of aborting the load.
        return None


def process_chunk(chunk, max_rows=7):
    """Insert rows of one CSV chunk into the ``mot1`` table, one INSERT per row.

    Uses the notebook-level ``connection`` and ``insert_query``.

    Args:
        chunk: iterable of row sequences whose 14 positions follow the column
            order of the INSERT statement below (the caller passes
            ``chunk.values`` from the pandas reader).
        max_rows: debugging cap on the number of rows inserted per chunk. The
            default of 7 matches the limit that used to be hard-coded here;
            pass ``None`` to insert every row.

    Both date columns are written as 'YYYY-MM-DD HH:MM:SS' strings, and a date
    that cannot be parsed is stored as NULL.
    """
    insert_sql = '''
        INSERT INTO mot1 (test_id, vehicle_id, test_date, test_class_id, 
        test_type, test_result, test_mileage, postcode_area,
        make, model, colour, fuel_type, cylinder_capacity, first_use_date)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
        '''

    for i, row in enumerate(chunk):
        if max_rows is not None and i >= max_rows:
            break

        params = (
            row[0],                       # test_id
            row[1],                       # vehicle_id
            _to_sql_timestamp(row[2]),    # test_date
            row[3],                       # test_class_id
            row[4],                       # test_type
            row[5],                       # test_result
            int(row[6]),                  # test_mileage
            row[7],                       # postcode_area
            row[8],                       # make
            row[9],                       # model
            row[10],                      # colour
            row[11],                      # fuel_type
            row[12],                      # cylinder_capacity
            _to_sql_timestamp(row[13]),   # first_use_date
        )
        insert_query(connection, insert_sql, params, verbose=False)
    
# Stream the CSV in chunks and hand each chunk's raw values to process_chunk.
# NOTE(review): error_bad_lines / warn_bad_lines were superseded by on_bad_lines in
# pandas 1.3 and removed in pandas 2.0; switch to on_bad_lines='warn' once the
# notebook's pandas version is confirmed.
for chunk in pd.read_csv(os.path.join('data','dft_test_result-from-2019-01-01_00-00-01-to-2019-04-01_00-00-01.csv'), 
                 sep=',', delim_whitespace=False,
                 names=names, dtype=dtypes, engine='python', chunksize=20000, header=0,
                 error_bad_lines=False, warn_bad_lines=True, doublequote=False):
    # Adjust chunksize to your needs and RAM
    # Assign the filled column back: an in-place fillna on a column selection is
    # chained assignment and is not guaranteed to modify `chunk` itself.
    chunk['test_mileage'] = chunk['test_mileage'].fillna(-1)

    process_chunk(chunk.values)


In [12]:
# Print up to 20 rows of mot1 as tuples to eyeball what was actually stored.
print(*execute_query(connection,'SELECT * FROM mot1 limit 20;', verbose=False))

(1930167913, 1168220651, '2019-01-01 00:00:00', 4, 'NT', 'P', '47108', 'LL', 'LAND ROVER', 'DISCOVERY', 'WHITE', 'DI', '2993', '2014-07-29 00:00:00') (1887519427, 608494756, '2019-01-01 00:00:00', 4, 'NT', 'P', '74254', 'RM', 'VAUXHALL', 'COMBO', 'BLUE', 'DI', '1686', '2000-10-16 00:00:00') (1844870941, 345838224, '2019-01-01 00:00:00', 4, 'NT', 'P', '52596', 'RM', 'SMART (MCC)', 'FORTWO COUPE', 'BLACK', 'PE', '999', '2010-06-30 00:00:00') (1802222455, 712515370, '2019-01-01 00:00:00', 4, 'NT', 'F', '97925', 'S', 'KIA', 'CEED', 'BLUE', 'DI', '1582', '2007-10-31 00:00:00') (1631628511, 929718858, '2019-01-01 00:00:00', 4, 'RT', 'P', '91055', 'BB', 'TOYOTA', 'YARIS', 'RED', 'PE', '998', '2002-11-11 00:00:00') (1588980025, 228077478, '2019-01-01 00:00:00', 4, 'RT', 'P', '69520', 'BN', 'HYUNDAI', 'COUPE', 'SILVER', 'PE', '1975', '2006-06-27 00:00:00') (1205143651, 614637102, '2019-01-01 00:00:00', 4, 'RT', 'P', '62554', 'CA', 'NISSAN', 'JUKE', 'RED', 'PE', '1598', '2011-03-09 00:00:00') (1

In [9]:
from datetime import datetime
# Alternative, simpler route: hand each chunk straight to DataFrame.to_sql
# instead of inserting row by row.
list_nats = []  # one array of test_ids per chunk that had an unparseable first_use_date
for i, chunk in enumerate(pd.read_csv(
        os.path.join('data', 'dft_test_result-from-2019-01-01_00-00-01-to-2019-04-01_00-00-01.csv'),
        sep=',', delim_whitespace=False,
        names=names, dtype=dtypes, engine='python',
        chunksize=500000, iterator=True, header=0,
        error_bad_lines=False, warn_bad_lines=True, doublequote=False)):
    # DEBUGGING: stop after the first chunk
    if i > 0:
        break

    # errors='coerce' turns unparseable dates into NaT instead of raising.
    for date_col in ('test_date', 'first_use_date'):
        chunk[date_col] = pd.to_datetime(chunk[date_col], format='%Y-%m-%d', errors='coerce')

    chunk.to_sql('mot1', connection, if_exists='append', index=False)

    # Report (and remember) the rows whose first_use_date was coerced to NaT.
    is_nat = chunk['first_use_date'].isna()
    if not is_nat.any():
        print(f'chunknum {i} completed.')
        continue
    nat_ids = chunk.loc[is_nat, 'test_id'].values
    print(f'chunknum {i} completed. NaT detected at test_id={nat_ids}')
    list_nats.append(nat_ids)


chunknum 0 completed.
chunknum 1 completed. NaT detected at test_id=[1508254279]
chunknum 2 completed.
chunknum 3 completed.
chunknum 4 completed. NaT detected at test_id=[1487206527]
chunknum 5 completed.
chunknum 6 completed. NaT detected at test_id=[1197994457 1371175063]
chunknum 7 completed. NaT detected at test_id=[1969863607  713113865]
chunknum 8 completed. NaT detected at test_id=[1926654095]
chunknum 9 completed. NaT detected at test_id=[758782857]
chunknum 10 completed.
chunknum 11 completed.
chunknum 12 completed.
chunknum 13 completed. NaT detected at test_id=[20432289]
chunknum 14 completed.
chunknum 15 completed.
chunknum 16 completed.
chunknum 17 completed. NaT detected at test_id=[1951363753]
chunknum 18 completed. NaT detected at test_id=[267869835]
chunknum 19 completed.
chunknum 20 completed.


In [19]:
# Sanity check on the load: count the mot1 rows that have a non-NULL test_id.
df = pd.read_sql_query('SELECT COUNT(test_id) FROM mot1;', connection)
df.head()

Unnamed: 0,COUNT(test_id)
0,14710086


In [20]:
# Show the test_ids collected by the to_sql load loop for rows whose first_use_date parsed to NaT.
list_nats

[556155    1508254279
 Name: test_id, dtype: int64,
 2489321    1487206527
 Name: test_id, dtype: int64,
 3250572    1197994457
 3258117    1371175063
 Name: test_id, dtype: int64,
 3575287    1969863607
 3949423     713113865
 Name: test_id, dtype: int64,
 4194487    1926654095
 Name: test_id, dtype: int64,
 4592951    758782857
 Name: test_id, dtype: int64,
 6935550    20432289
 Name: test_id, dtype: int64,
 8842791    1951363753
 Name: test_id, dtype: int64,
 9317592    267869835
 Name: test_id, dtype: int64]

In [None]:
# Open the CSV lazily in 20,000-row chunks. Because chunksize is set, `df` is an
# iterator of DataFrames rather than a single DataFrame.
# header=0 and doublequote=False match the other reads of this file: without header=0
# the header line would be parsed as data and fail the integer dtypes.
df = pd.read_csv(os.path.join('data','dft_test_result-from-2019-01-01_00-00-01-to-2019-04-01_00-00-01.csv'), 
                 sep=',', delim_whitespace=False,
                 names=names, dtype=dtypes, engine='python', chunksize=20000, header=0,
                 error_bad_lines=False, warn_bad_lines=True, doublequote=False)

In [None]:
# Zero-based line indices of the raw CSV to inspect.
lines = list(range(7795035, 7795036))
last_wanted = max(lines)

with open(os.path.join('data','dft_test_result-from-2019-01-01_00-00-01-to-2019-04-01_00-00-01.csv')) as fp:
    for i, line in enumerate(fp):
        # Stop reading once we are past the last requested line.
        if i > last_wanted:
            break
        if i in lines:
            # repr() exposes quotes, escapes and the line terminator exactly as read.
            print(f'[{i}]{repr(line)}')

In [None]:
# `df` may be the chunk iterator returned by read_csv(..., chunksize=...), which has no
# to_sql of its own, so write chunk by chunk: the first chunk replaces table1 and the
# rest are appended. A plain DataFrame is written in one go, as before.
chunks = [df] if isinstance(df, pd.DataFrame) else df
for chunk_no, chunk in enumerate(chunks):
    chunk.to_sql('table1', connection,
                 if_exists='replace' if chunk_no == 0 else 'append',
                 index=False)

Columns in dataset are:
test_id,vehicle_id,test_date,test_class_id,test_type,test_result,test_mileage,postcode_area,make,model,colour,fuel_type,cylinder_capacity,first_use_date