In [1]:
import pandas as pd
import requests
import time
from datetime import date, datetime

from config import *

########################################
# Download NHTSA recall reference data
########################################
# Imports for the NHTSA API request helpers
import requests
from requests.exceptions import Timeout, RequestException
import time

# Get recall years
# The query string must not carry trailing whitespace after "issueType=r".
url = 'https://api.nhtsa.gov/products/vehicle/modelYears?issueType=r'
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of failing later
# with a confusing KeyError on 'results'.
http_response = requests.get(url, timeout=30)
http_response.raise_for_status()
response = http_response.json()
recalls_years = pd.DataFrame(response['results'])
db = pg_connect()
try:
    # Replace the table wholesale with the list returned by the API.
    recalls_years.to_sql('recalls_years', db, index=False, if_exists='replace')
finally:
    # Release the database connections even when the write fails.
    db.dispose()

Configuration loaded successfully.


In [2]:
# Get makes for recall year
def get_makes_for_recalls_year(year,retries=3, timeout=30):
    """Fetch the makes that have recall data for one model year from the NHTSA API.

    Args:
        year: Model year, sent as the ``modelYear`` query parameter.
        retries: Maximum number of attempts before giving up.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        The ``results`` list of the JSON payload (``[]`` if the key is absent)
        on HTTP 200. ``None`` when the API answers with a non-retryable status
        or when every attempt failed.
    """
    url = 'https://api.nhtsa.gov/products/vehicle/makes'
    # Let requests build and escape the query string.
    params = {'modelYear': year, 'issueType': 'r'}
    retry_delay = 5  # seconds to wait between attempts
    attempt = 0
    while attempt < retries:
        try:
            response = requests.get(url, params=params, timeout=timeout)
            status = response.status_code
            if status == 200:
                return response.json().get('results', [])
            if status != 429 and status < 500:
                # Client-side errors (e.g. 400) will not succeed on a retry.
                return None
            # 429 / 5xx are usually transient, so they count as a failed attempt.
            attempt += 1
            print(f"Attempt {attempt} failed with HTTP {status}. Retrying...")
        except Timeout:
            attempt += 1
            print(f"Attempt {attempt} timed out. Retrying...")
        except Exception as e:
            # Covers requests' RequestException as well as a malformed JSON body.
            attempt += 1
            print(f"Request failed: {e}")
        # Only wait when another attempt will actually follow.
        if attempt < retries:
            time.sleep(retry_delay)

    # If all attempts fail, return None
    print("All attempts to contact the API have failed.")
    return None

# download makes for recall years
# Only model years from 2019 onward are fetched; modelYear values are passed
# through int() before the comparison.
recent_model_years = recalls_years.loc[
    recalls_years['modelYear'].apply(int) >= 2019, 'modelYear'
]

db = pg_connect()
try:
    for year in recent_model_years:
        makes_df = pd.DataFrame(get_makes_for_recalls_year(year))
        # A failed request (None) or an empty result gives an empty frame: skip it.
        if len(makes_df) > 0:
            makes_df.to_sql('recalls_makes_for_years', db, index=False, if_exists='append')
finally:
    # Release the database connections even if a request or write raises.
    db.dispose()

pg_clean_table('recalls_makes_for_years')

Duplicates removed from recalls_makes_for_years


In [3]:
# Get models for a make and recall year
def get_models_for_make_recall_years(year,make,retries=3, timeout=30):
    """Fetch the models with recall data for one model year and make from the NHTSA API.

    Args:
        year: Model year, sent as the ``modelYear`` query parameter.
        make: Vehicle make, sent as the ``make`` query parameter.
        retries: Maximum number of attempts before giving up.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        The ``results`` list of the JSON payload (``[]`` if the key is absent)
        on HTTP 200. ``None`` when the API answers with a non-retryable status
        or when every attempt failed.
    """
    url = 'https://api.nhtsa.gov/products/vehicle/models'
    # Passing the values through ``params`` escapes them, so a make containing
    # '&', '#', '+' or similar characters no longer corrupts the query string.
    params = {'modelYear': year, 'make': make, 'issueType': 'r'}
    retry_delay = 5  # seconds to wait between attempts
    attempt = 0
    while attempt < retries:
        try:
            response = requests.get(url, params=params, timeout=timeout)
            status = response.status_code
            if status == 200:
                return response.json().get('results', [])
            if status != 429 and status < 500:
                # Client-side errors (e.g. 400) will not succeed on a retry.
                return None
            # 429 / 5xx are usually transient, so they count as a failed attempt.
            attempt += 1
            print(f"Attempt {attempt} failed with HTTP {status}. Retrying...")
        except Timeout:
            attempt += 1
            print(f"Attempt {attempt} timed out. Retrying...")
        except Exception as e:
            # Covers requests' RequestException as well as a malformed JSON body.
            attempt += 1
            print(f"Request failed: {e}")
        # Only wait when another attempt will actually follow.
        if attempt < retries:
            time.sleep(retry_delay)

    # If all attempts fail, return None
    print("All attempts to contact the API have failed.")
    return None
    

# Create table to track model updates
# First run only: seed the tracker with every column of recalls_makes_for_years
# plus two bookkeeping columns. models_last_updated starts 1000 years in the
# past, so a freshly seeded row always passes an "older than N days" filter,
# and models_downloaded starts at 0.
if 'recalls_model_download_tracker' not in pg_tables():
    query = """
create table recalls_model_download_tracker as
select
	*,
	CURRENT_TIMESTAMP - interval '1000 years' as models_last_updated,
    0 as models_downloaded
from recalls_makes_for_years
"""
    pg_execute(query)
    print("recalls_model_download_tracker table created")

# Update model download tracker
# Add one tracker row for each ("modelYear","make") pair that exists in
# recalls_makes_for_years but is not tracked yet, using the same placeholder
# timestamp and a zero count.
# NOTE(review): the INSERT has no column list, so it presumably relies on the
# tracker having exactly these four columns in this order — confirm against
# the actual schema of recalls_makes_for_years.
pg_execute("""
INSERT INTO recalls_model_download_tracker
select distinct on ("modelYear","make")
	"modelYear",
	"make",
	CURRENT_TIMESTAMP - interval '1000 years' as models_last_updated,
    0 as models_downloaded
from recalls_makes_for_years
where ("modelYear","make") not in (select "modelYear","make" from recalls_model_download_tracker)
""")
print("recalls_model_download_tracker updated")

recalls_model_download_tracker updated


In [4]:
# Tracker rows due for a refresh: not updated within the last 15 days and
# belonging to the five most recent model years.
recalls_model_download_tracker = pg_query("""
select
    *
from recalls_model_download_tracker
where models_last_updated < current_date - interval '15 days'
and "modelYear"::int >= extract(year from current_date) - 5
""")

# The statement is identical for every row, so build it once outside the loop.
query = text("""
    update recalls_model_download_tracker
    set models_last_updated = current_timestamp, models_downloaded = :a
    where "modelYear"::int = :x and "make" = :y
    """)

db = pg_connect()
try:
    for _, row in recalls_model_download_tracker.iterrows():
        models = get_models_for_make_recall_years(row['modelYear'], row['make'])
        time.sleep(1)  # throttle requests to the API

        if models is None:
            # The download failed. Leave the tracker row untouched so the pair is
            # picked up again on the next run instead of being recorded as
            # "refreshed with 0 models".
            print(f"recall models for {row['modelYear']} {row['make']} could not be downloaded")
            continue

        models_df = pd.DataFrame(models)
        # Nothing to append when the API returned an empty result list.
        if len(models_df) > 0:
            models_df.to_sql('recalls_models', db, index=False, if_exists='append')

        with db.connect() as connection:
            connection.execute(
                query,
                {'a': len(models_df), 'x': row['modelYear'], 'y': row['make']},
            )
            connection.commit()
        # Double-quoted f-string so the nested single quotes also parse on
        # Python versions older than 3.12.
        print(f"recall models for {row['modelYear']} {row['make']} updated")
finally:
    # Release the database connections even if a request or write raises.
    db.dispose()

pg_clean_table('recalls_models')

Duplicates removed from recalls_models


In [5]:
# Create table to track recall downloads
# First run only: seed the tracker with every column of recalls_models plus two
# bookkeeping columns. recalls_last_updated starts 1000 years in the past, so a
# freshly seeded row always passes an "older than N days" filter, and
# total_recalls starts at 0.
if 'recalls_download_tracker' not in pg_tables():
    query = """
create table recalls_download_tracker as
select
	*,
	CURRENT_TIMESTAMP - interval '1000 years' as recalls_last_updated,
    0 as total_recalls
from recalls_models
"""
    pg_execute(query)
    print("recalls_download_tracker table created")

# Update recall download tracker
# Add one tracker row for each ("modelYear","make","model") combination that
# exists in recalls_models but is not tracked yet, using the same placeholder
# timestamp and a zero count.
# NOTE(review): the INSERT has no column list, so it presumably relies on the
# tracker having exactly these five columns in this order — confirm against
# the actual schema of recalls_models.
pg_execute("""
INSERT INTO recalls_download_tracker
select distinct on ("modelYear","make","model")
	"modelYear",
	"make",
    "model",
	CURRENT_TIMESTAMP - interval '1000 years' as recalls_last_updated,
    0 as total_recalls
from recalls_models
where ("modelYear","make","model") not in (select "modelYear","make","model" from recalls_download_tracker)
""")

recalls_download_tracker table created


'done'

In [6]:
# Get recalls for a model year, make and model
def get_recalls(year,make,model,retries=3, timeout=30):
    """Fetch the recalls for one vehicle (model year, make, model) from the NHTSA API.

    Args:
        year: Model year, sent as the ``modelYear`` query parameter.
        make: Vehicle make, sent as the ``make`` query parameter.
        model: Vehicle model, sent as the ``model`` query parameter.
        retries: Maximum number of attempts before giving up.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        The ``results`` list of the JSON payload (``[]`` if the key is absent)
        on HTTP 200. ``None`` when the API answers with a non-retryable status
        or when every attempt failed.
    """
    url = 'https://api.nhtsa.gov/recalls/recallsByVehicle'
    # Passing the values through ``params`` escapes them, so names such as
    # "TOWN & COUNTRY" no longer split the query string at the '&'.
    params = {'make': make, 'model': model, 'modelYear': year}
    retry_delay = 5  # seconds to wait between attempts
    attempt = 0
    while attempt < retries:
        try:
            response = requests.get(url, params=params, timeout=timeout)
            status = response.status_code
            if status == 200:
                return response.json().get('results', [])
            if status != 429 and status < 500:
                # Client-side errors (e.g. 400) will not succeed on a retry.
                return None
            # 429 / 5xx are usually transient, so they count as a failed attempt.
            attempt += 1
            print(f"Attempt {attempt} failed with HTTP {status}. Retrying...")
        except Timeout:
            attempt += 1
            print(f"Attempt {attempt} timed out. Retrying...")
        except Exception as e:
            # Covers requests' RequestException as well as a malformed JSON body.
            attempt += 1
            print(f"Request failed: {e}")
        # Only wait when another attempt will actually follow.
        if attempt < retries:
            time.sleep(retry_delay)

    # If all attempts fail, return None
    print("All attempts to contact the API have failed.")
    return None

# Pick up to 750 random tracker rows that have not been refreshed in the last
# 14 days and belong to the five most recent model years.
recalls_download_tracker = pg_query("""
select 
    *,
    random()
from recalls_download_tracker
where recalls_last_updated < current_date - interval '14 days'
and "modelYear"::int >= extract(year from current_date) - 5
order by random()
limit 750
""")

# Columns every appended frame must carry; any the API omits are filled with None.
expected_recall_columns = [
    'Manufacturer', 'NHTSACampaignNumber', 'parkIt', 'parkOutSide',
    'ReportReceivedDate', 'Component', 'Summary', 'Consequence', 'Remedy',
    'Notes', 'ModelYear', 'Make', 'Model', 'NHTSAActionNumber',
]

# The statement is identical for every row, so build it once outside the loop.
query = text("""
    update recalls_download_tracker
    set recalls_last_updated = current_timestamp, total_recalls = :t
    where "modelYear" = :x and "make" = :y and "model" = :z
    """)

db = pg_connect()
try:
    for _, row in recalls_download_tracker.iterrows():
        recalls = get_recalls(row['modelYear'], row['make'], row['model'])
        time.sleep(1)  # throttle requests to the API

        if recalls is None:
            # The download failed. Leave the tracker row untouched so the vehicle
            # is retried on a later run instead of being recorded as
            # "refreshed with 0 recalls".
            print(f"recalls for {row['modelYear']} {row['make']} {row['model']} could not be downloaded")
            continue

        recalls_df = pd.DataFrame(recalls)
        if len(recalls_df) > 0:
            for col in expected_recall_columns:
                if col not in recalls_df.columns:
                    recalls_df[col] = None
            recalls_df.to_sql('recalls', db, index=False, if_exists='append')

        with db.connect() as connection:
            connection.execute(
                query,
                {'t': len(recalls_df), 'x': row['modelYear'], 'y': row['make'], 'z': row['model']},
            )
            connection.commit()
finally:
    # Release the database connections even if a request or write raises;
    # the other download cells do the same.
    db.dispose()

# De-duplicate the recalls table by rebuilding it through a scratch table.
# The statements run strictly in order:
#   1. drop any scratch table left behind,
#   2. copy one row per (campaign number, model year, make, model) into the
#      scratch table — the first row in descending NHTSAActionNumber order,
#   3. empty the recalls table,
#   4. copy the de-duplicated rows back,
#   5. drop the scratch table.
dedupe_statements = (
    "drop table if exists recalls_backup",
    """
create table recalls_backup as
select distinct on ("NHTSACampaignNumber","ModelYear","Make","Model")
*
from recalls
order by "NHTSACampaignNumber","ModelYear","Make","Model", "NHTSAActionNumber" desc
""",
    "delete from recalls",
    """
insert into recalls
select * from recalls_backup
""",
    "drop table recalls_backup",
)
for statement in dedupe_statements:
    pg_execute(statement)
print("recalls updated")

Attempt 1 timed out. Retrying...
recalls updated
