In [1]:
import pandas as pd
import requests
import time
from datetime import date, datetime

from config import *

########################################
# Main function to download complaints
########################################
# Function to get complaints from NHTSA API
import requests
from requests.exceptions import Timeout, RequestException
import time

# Function to get complaints from NHTSA API
def get_complaints(make, model, model_year, retries=3, timeout=30):
    # Construct the API URL
    url = f"https://api.nhtsa.gov/complaints/complaintsByVehicle?make={make}&model={model}&modelYear={model_year}"

    attempt = 0

    while attempt < retries:
        try:
            # Make the GET request to the NHTSA API with a timeout
            response = requests.get(url, timeout=timeout)
            
            # Check if the request was successful
            if response.status_code == 200:
                # Return the JSON response
                return response.json().get('results', [])
            elif response.status_code == 400:
                return None
            else:
                # Return an error message
                return None
        except Timeout:
            # Handle timeout exception
            attempt += 1
            print(f"Attempt {attempt} timed out. Retrying...")
            time.sleep(5)  # wait before retrying
        except RequestException as e:
            attempt += 1
            # Handle other request exceptions
            print(f"Request failed: {e}")
            time.sleep(5)
        except Exception as e:
            attempt += 1
            print(f"Request failed: {e}")
            time.sleep(5)

    # If all attempts fail, return None
    print("All attempts to contact the API have failed.")
    return None


########################################
# Update Model Years
########################################
# Function to get all Model Years
# Function to get all Model Years
def get_complaints_model_years(retries=3, timeout=30):
    url = "https://api.nhtsa.gov/products/vehicle/modelYears?issueType=c"
    
    attempt = 0

    while attempt < retries:
        try:
            # Make the GET request to the NHTSA API with a timeout
            response = requests.get(url, timeout=timeout)
            
            # Check if the request was successful
            if response.status_code == 200:
                return pd.DataFrame(response.json().get('results', []))
            else:
                return f"Error: {response.status_code}"
        
        except Timeout:
            # Handle timeout exception
            attempt += 1
            print(f"Attempt {attempt} timed out. Retrying...")
            time.sleep(5)  # wait before retrying
        except RequestException as e:
            attempt += 1
            # Handle other request exceptions
            print(f"Request failed: {e}")
            time.sleep(5)
        except Exception as e:
            attempt += 1
            print(f"Request failed: {e}")
            time.sleep(5)

    # If all attempts fail, return an error message
    print("All attempts to contact the API have failed.")
    return "Error: All attempts to contact the API have failed."

# Fetch model years
complaints_model_years = get_complaints_model_years()

# Save model years to database
db = pg_connect()
complaints_model_years.to_sql('complaints_model_years',db,index=False,if_exists='replace')


########################################
# Update Makes
########################################
# Function to get all Makes for the Model Year
# Function to get makes for a specific year
def get_complaints_makes_for_years(year, retries=3, timeout=30):
    url = f"https://api.nhtsa.gov/products/vehicle/makes?modelYear={year}&issueType=c"
    
    attempt = 0

    while attempt < retries:
        try:
            # Make the GET request to the NHTSA API with a timeout
            response = requests.get(url, timeout=timeout)
            
            # Check if the request was successful
            if response.status_code == 200:
                return pd.DataFrame(response.json().get('results', []))
            else:
                return f"Error: {response.status_code}"
        
        except Timeout:
            # Handle timeout exception
            attempt += 1
            print(f"Attempt {attempt} timed out. Retrying...")
            time.sleep(5)  # wait before retrying
        except RequestException as e:
            attempt += 1
            # Handle other request exceptions
            print(f"Request failed: {e}")
            time.sleep(5)
        except Exception as e:
            attempt += 1
            print(f"Request failed: {e}")
            time.sleep(5)

    # If all attempts fail, return an error message
    print("All attempts to contact the API have failed.")
    return "Error: All attempts to contact the API have failed."


for year in complaints_model_years[complaints_model_years['modelYear'].astype(int)>=2016]['modelYear']:
    print(f'Downloading makes for year {year}')
    complaints_makes_for_years = get_complaints_makes_for_years(year)
    complaints_makes_for_years
    complaints_makes_for_years.to_sql('complaints_makes_for_years',db,index=False,if_exists='append')
db.dispose()
# Remove duplicate rows from the table
pg_clean_table('complaints_makes_for_years')

Configuration loaded successfully.


Downloading makes for year 2016
Downloading makes for year 2017
Downloading makes for year 2018
Downloading makes for year 2019
Downloading makes for year 2020
Downloading makes for year 2021
Downloading makes for year 2022
Downloading makes for year 2023
Downloading makes for year 2024
Downloading makes for year 2025
Downloading makes for year 9999
Duplicates removed from complaints_makes_for_years


In [2]:
# Create table to track model updates
if 'complaints_model_download_tracker' not in pg_tables():
    query = """
create table complaints_model_download_tracker as
select
	*,
	CURRENT_TIMESTAMP - interval '1000 years' as models_last_updated,
    0 as models_downloaded
from complaints_makes_for_years
"""
    pg_execute(query)
    print("complaints_model_download_tracker table created")

# Update model download tracker
pg_execute("""
INSERT INTO complaints_model_download_tracker
select distinct on ("modelYear",make)
	"modelYear",
	make,
	CURRENT_TIMESTAMP - interval '1000 years' as models_last_updated,
    0 as models_downloaded
from complaints_makes_for_years
where ("modelYear","make") not in (select "modelYear",make from complaints_model_download_tracker)
""")
print("complaints_model_download_tracker updated")


########################################
# Update Models
########################################
# Function to get all Models for the Make and Model Year
def get_complaints_models(make, year, retries=3, timeout=30):
    url = f"https://api.nhtsa.gov/products/vehicle/models?modelYear={year}&make={make}&issueType=c"
    
    attempt = 0

    while attempt < retries:
        try:
            response = requests.get(url, timeout=timeout)
            
            if response.status_code == 200:
                return response.json().get('results', [])
            else:
                return f"Error: {response.status_code}"
        
        except Timeout:
            # Handle timeout exception
            attempt += 1
            print(f"Attempt {attempt} timed out. Retrying...")
            time.sleep(5)  # wait before retrying
        except RequestException as e:
            attempt += 1
            # Handle other request exceptions
            print(f"Request failed: {e}")
            time.sleep(5)
        except Exception as e:
            attempt += 1
            print(f"Request failed: {e}")
            time.sleep(5)

    print("All attempts to contact the API have failed.")
    return "Error: All attempts to contact the API have failed."

complaints_makes_for_years = pg_query("""
select distinct 
	"modelYear",
	make,
    models_last_updated,
    models_downloaded
from complaints_model_download_tracker 
where "modelYear"::int >= extract(year from current_date) 
and models_last_updated < CURRENT_DATE - INTERVAL '7 days'
union all
select
	"modelYear",
	make,
    models_last_updated,
    models_downloaded
from (
	SELECT DISTINCT
		*,
		random()
	FROM complaints_model_download_tracker
	WHERE "modelYear"::int < EXTRACT(YEAR FROM CURRENT_DATE)
	  AND "modelYear"::int >= EXTRACT(YEAR FROM CURRENT_DATE) - 5
	  AND models_last_updated < CURRENT_DATE - INTERVAL '15 days'
      AND models_downloaded = 0
	order by random() desc
	limit 500
) tbl      
""")

all_models = []
if len(complaints_makes_for_years) > 0:
    db = pg_connect()
    for _,row in complaints_makes_for_years.iterrows():
        print(f"Downloading {row['make']} {row['modelYear']} models")
        # Download models
        try:
            download = get_complaints_models(row['make'],str(row['modelYear']))
        except Exception as e:
            print("Download failed. Will try again next update.")
        # add to list
        print(download)
        payload = pd.DataFrame(download)
        if len(payload) > 0:
            payload.to_sql('complaints_models',db,index=False, if_exists='append')
        # Update complaints_model_download_tracker
        with db.connect() as connection:
            query = text('''
                update complaints_model_download_tracker
                set models_last_updated = current_timestamp,
                    models_downloaded = :x
                where "modelYear" = :y and make = :z
                ''')
            connection.execute(query,{'x':len(payload),'y':str(row['modelYear']),'z':row['make']})
            connection.commit()
        # Done.    
        print(f"{row['modelYear']} {row['make']} models updated: {payload.shape[0]} new models")
        time.sleep(2)
    db.dispose()
    # Clean database
    pg_clean_table('complaints_models')
    print(f"complaints_models table cleaned")

########################################
# Update Complaints
########################################
# Create table to track complaint updates
if 'complaints_download_tracker' not in pg_tables():
    query = """
create table complaints_download_tracker as
select
	*,
	CURRENT_TIMESTAMP - interval '1000 years' as complaints_last_updated,
    0 as total_complaints
from complaints_models
"""
    pg_execute(query)
    print("complaints_download_tracker table created")

# Update complaint download tracker
pg_execute("""
INSERT INTO complaints_download_tracker
select distinct on ("modelYear","make","model")
	"modelYear",
	make,
    "model",
	CURRENT_TIMESTAMP - interval '1000 years' as complaints_last_updated,
    0 as total_complaints
from complaints_models
where ("modelYear","make","model") not in (select "modelYear","make","model" from complaints_download_tracker)
""")

print("complaints_download_tracker updated")

def update_complaints(make_model_year):
    for _,row in make_model_year.iterrows():
        make_complaints = []
        print(f'Downloading data for {row['modelYear']} {row['make']} {row['model']}')
        complaints = get_complaints(row['make'],row['model'],row['modelYear'])
        if complaints:
            for c in complaints:
                c['make'] = row.get('make',None)
                c['model'] = row.get('model',None)
                c['modelYear'] = row.get('modelYear',None)
                c['products'] = json.dumps(c.get('products',{}))
                make_complaints.append(c)
        make_df = pd.DataFrame(make_complaints)
        db = pg_connect()
        if len(make_df) > 0:
            make_df.to_sql('complaints',db,index=False,if_exists='append')
        # update complaints download tracker
        with db.connect() as connection:
            query = text('''
            update complaints_download_tracker
            set complaints_last_updated = current_timestamp, total_complaints = :w
            where "modelYear" = :x and make = :y and model = :z
            ''')
            connection.execute(query,{'w':make_df.shape[0],'x': str(row['modelYear']),'y':row['make'],'z':row['model']})
            connection.commit()
        db.dispose()
        # Done.
        print(f'Complaint data for {row['modelYear']} {row['make']} {row['model']} updated: {make_df.shape[0]} total complaints')
        time.sleep(1)

make_model_year = pg_query(f""" 
(
    select 
        * 
    from complaints_download_tracker
    where make='TESLA' and "modelYear"::int >= extract(year from current_date) - 5
    and extract('days' from current_timestamp - complaints_last_updated) > 0
)
union all
(
    select
        "modelYear",
        "make",
        "model",
        "complaints_last_updated",
        "total_complaints"
    from (
        select 
            *,
            random()
        from complaints_download_tracker
        where make !='TESLA'
        and "modelYear"::int >= extract(year from current_date) - 5
        and extract('days' from current_timestamp - complaints_last_updated) > 3
    ) tbl
    order by random() limit 400
)
""")
if len(make_model_year) > 0:
    update_complaints(make_model_year)
print(f"Stale data randomly updated.")


pg_execute("drop table if exists complaints_backup")
pg_execute("""
create table complaints_backup as
select distinct on ("odiNumber")
    *
from complaints 
""")
pg_execute("delete from complaints")
pg_execute('insert into complaints select * from complaints_backup')
pg_execute('drop table complaints_backup')

print("Complaints data table cleaned.")

##########################
# Update car_sales
##########################
car_sales = pd.read_csv('car_sales.csv')
import re
car_sales['Automaker'] = car_sales.Automaker.apply(lambda x: re.sub(',$','',x).split(','))

complaints = []
for _,row in car_sales.iterrows():
    complain = 0
    for maker in row['Automaker']:
        c = pg_query(f""" 
select count(*) from complaints where make = '{maker}'
and "modelYear" = '{row['Year']}'
""")
        complain = complain + c['count'][0] 

    complaints.append(complain)

car_sales['complaints'] = complaints
car_sales['percentage'] = car_sales['complaints']/car_sales['Sold Autos']
car_sales = car_sales[car_sales['percentage']>0]

unparsed = []
for _,row in car_sales.iterrows():
    for make in row['Automaker']:
        unparsed.append({
            'parent': row['Umbrella'],
            'make': make,
            'modelYear': row['Year'],
            'parent_autos_sold': row['Sold Autos'],
            'world_rank': row['World Rank'],
            'parent_modelYear_complaints': row['complaints'],
            'parent_modelYear_percentage': row['percentage']
        })

df = pd.DataFrame(unparsed)
# Save to database
db = pg_connect()
df.to_sql('car_sales',db,index=False,if_exists='replace')
db.dispose()

pg_execute("""
UPDATE car_sales
SET make = REPLACE(make, 'MERCEDES BENZ', 'MERCEDES-BENZ')
WHERE make = 'MERCEDES BENZ';
""")



complaints_model_download_tracker updated
Downloading SHORELAND'R 9999 models


[]
9999 SHORELAND'R models updated: 0 new models
Downloading RAYBESTOS 9999 models
[]
9999 RAYBESTOS models updated: 0 new models
Downloading LAZY DAZE 9999 models
[]
9999 LAZY DAZE models updated: 0 new models
Downloading UTILITY 9999 models
[]
9999 UTILITY models updated: 0 new models
Downloading RIVERSIDE 9999 models
[]
9999 RIVERSIDE models updated: 0 new models
Downloading CNG. 9999 models
[]
9999 CNG. models updated: 0 new models
Downloading LOTUS 9999 models
[]
9999 LOTUS models updated: 0 new models
Downloading WIDETRACK 9999 models
[]
9999 WIDETRACK models updated: 0 new models
Downloading GEM 9999 models
[]
9999 GEM models updated: 0 new models
Downloading CONDERE 9999 models
[]
9999 CONDERE models updated: 0 new models
Downloading GEO 9999 models
[]
9999 GEO models updated: 0 new models
Downloading RIKEN 9999 models
[]
9999 RIKEN models updated: 0 new models
Downloading GENERAL MOTORS 9999 models
[]
9999 GENERAL MOTORS models updated: 0 new models
Downloading NORSTAR 9999 mo

'done'