In [None]:
!pip install rapidfuzz yfinance pandas numpy nltk scikit-learn




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from rapidfuzz import process, fuzz
import yfinance as yf
import pandas as pd
import numpy as np
import nltk
import re

import json

In [8]:
# LOAD SEC(https://www.sec.gov/files/company_tickers.json) TICKERS
with open("tickers.json", "r") as f:
    tickers = json.load(f)
    companies = [tickers[i]["title"] for i in tickers]
    abbrs = [tickers[i]["ticker"] for i in tickers]

    comp_tckr = {comp: abbr for comp, abbr in zip(companies, abbrs)}

# LOAD INITIAL RECALL DATA
df = pd.read_csv("recalls.csv")

## Data Cleaning

In [9]:
# delete instances where year is older than 1995
print(len(df))
for i, row in df.iterrows():
    if ( row['Report Received Date'][-4:].isnumeric() and int(row['Report Received Date'][-4:]) < 1995 ):
        df.drop(i, inplace=True)
print(len(df))

29374
22505


In [10]:
# remove column that won't feed into training
df = df.drop("Completion Rate % (Blank - Not Reported)", axis=1)

In [11]:
import re
from collections import Counter

# Common words to ignore
stop_words = set([
    "inc", "corp", "llc", "co", "ltd", "corporation",
    "company", "limited", "plc", "gmbh", "sa", "ag", "pte", "bv", "kg", "kgaa", "corp", "industries", "us", "of"
])

def tokenize(name):
    name = name.lower()
    name = re.sub(r'[^a-z0-9 ]', ' ', name) 
    words = name.split()
    return [w for w in words if w not in stop_words]

# Build a word-to-company mapping for fast lookup
word_counts = Counter()
company_word_map = {}
for c in companies:
    words = tokenize(c)
    company_word_map[c] = set(words)
    word_counts.update(words)


def match_by_unique_words(name):
    words = tokenize(name)
    if not words:
        return None
    
    # Score companies by sum of inverse word frequencies (rare words are more valuable)
    scores = {}
    for c, c_words in company_word_map.items():
        score = sum(1 / word_counts[w] for w in words if w in c_words)
        if score > 0:
            scores[c] = score
    
    if scores:
        return max(scores, key=scores.get)
    return None

df.loc[:2000, "fuzzy_company"] = df.loc[:2000, "Manufacturer"].apply(match_by_unique_words)

In [12]:
# get stock ticker
df["ticker"] = df["fuzzy_company"].map(comp_tckr)

# convert to datetime
df["Report Received Date"] = pd.to_datetime(df["Report Received Date"])

# get unique tickers
unique_tickers = df["ticker"].astype(str).unique().tolist()

# get all prices in range (start of day, end of day)
start_date = df["Report Received Date"].min()
end_date = df["Report Received Date"].max() + pd.Timedelta(days=1)
price_data = yf.download(unique_tickers, start=start_date, end=end_date, group_by="ticker", progress=False)

# helper function to grab pricing data
def get_price(ticker, date):
    try:
        day_data = price_data[ticker].loc[date.strftime("%Y-%m-%d")]
        return pd.Series([day_data["Open"], day_data["Close"]])
    except Exception:
        return pd.Series([None, None])

# get open/close price
df[["open", "close"]] = df.apply(lambda x: get_price(x["ticker"], x["Report Received Date"]), axis=1)

# grab info on ticker
info_cache = {}
for ticker in unique_tickers:
    try:
        t = yf.Ticker(ticker)
        info = t.info
        info_cache[ticker] = {
            "market_cap": info.get("marketCap"),
            "eps": info.get("trailingEps"),
            "full_name": info.get("longName"),
        }
    except Exception:
        pass

# merge info back on ticker
info_df = pd.DataFrame(info_cache).T
info_df.index.name = "ticker"
df = df.merge(info_df, on="ticker", how="left")


  price_data = yf.download(unique_tickers, start=start_date, end=end_date, group_by="ticker", progress=False)

15 Failed downloads:
['DSYWW', 'TGE-WT', 'AMBI-WT', 'FFAIW', 'AMBP-WT', 'F-PD']: YFPricesMissingError('possibly delisted; no price data found  (1d 1995-01-01 00:00:00 -> 2025-10-02 00:00:00)')
['BBAAY', 'PLSAY', 'RYPBF', 'EPDU', 'LOMWF', 'TMRD', 'LBRJ', 'TDWD', 'CLSO']: YFTzMissingError('possibly delisted; no timezone found')
HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: TMRD"}}}
HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: CLSO"}}}
HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: TDWD"}}}


In [13]:
df

Unnamed: 0,Report Received Date,NHTSA ID,Recall Link,Manufacturer,Subject,Component,Mfr Campaign Number,Recall Type,Potentially Affected,Recall Description,...,Stock Abbreviation,Opening Stock Value,Closing Stock Value,fuzzy_company,ticker,open,close,market_cap,eps,full_name
0,2025-10-01,25V656000,Go to Recall (https://www.nhtsa.gov/recalls?nh...,Toyota Motor Engineering & Manufacturing,Driveshaft May Deform and Break,POWER TRAIN,25TB11 / 25TA11,Vehicle,5960,Toyota Motor Engineering & Manufacturing (Toyo...,...,NYSE:TM,192.39,191.83,TOYOTA MOTOR CORP/,TOYOF,17.84000,19.379999,269150797824,2.14,Toyota Motor Corporation
1,2025-10-01,25V655000,Go to Recall (https://www.nhtsa.gov/recalls?nh...,"Nissan North America, Inc.",Fire Risk from Quick Charging Battery,ELECTRICAL SYSTEM,R25C8,Vehicle,19077,"Nissan North America, Inc. (Nissan) is recalli...",...,NYSE:NSANY,,,Central North Airport Group,GAERF,13.94000,13.940000,4924503552,0.73,"Grupo Aeroportuario del Centro Norte, S.A.B. d..."
2,2025-09-30,25V654000,Go to Recall (https://www.nhtsa.gov/recalls?nh...,"Volvo Car USA, LLC",Power Operated Tailgate May Drop Suddenly,STRUCTURE,R10342,Vehicle,1119,"Volvo Car USA, LLC (Volvo) is recalling certai...",...,NYSE:,,,"Mister Car Wash, Inc.",MCW,5.30000,5.330000,1646194432,0.26,"Mister Car Wash, Inc."
3,2025-09-30,25V653000,Go to Recall (https://www.nhtsa.gov/recalls?nh...,Thor Motor Coach,LP Tank May Detach and Become a Road Hazard,EQUIPMENT,RC000331,Vehicle,23,Thor Motor Coach (TMC) is recalling certain 20...,...,NYSE:,,,THOR INDUSTRIES INC,THO,104.54486,103.191399,5831233024,4.84,"THOR Industries, Inc."
4,2025-09-26,25E062000,Go to Recall (https://www.nhtsa.gov/recalls?nh...,"MEDIX SPECIALTY VEHICLES, INC.",Quick Liner Wheel Covers May Detach and Become...,WHEELS,,Equipment,396,"Medix Specialty Vehicles, Inc. (Medix) is reca...",...,NYSE:,,,"Envirotech Vehicles, Inc.",EVTV,2.02000,1.890000,5754790,-10.19,"Envirotech Vehicles, Inc."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22500,1995-01-11,95V004000,Go to Recall (https://www.nhtsa.gov/recalls?nh...,"THOMAS BUILT BUSES, INC.",INTERIOR SYSTEMS:ACTIVE RESTRAINTS:BELT RETRAC...,SEAT BELTS,NR (Not Reported),Vehicle,3076,THE METAL HOUSING OF THE SAFETY BELT RETRACTOR...,...,NYSE:,,,,,,,,,
22501,1995-01-03,95T001000,Go to Recall (https://www.nhtsa.gov/recalls?nh...,Cooper Tire & Rubber Co.,TIRES:MARKINGS,TIRES,NR (Not Reported),Tire,6603,THE MAXIMUM INFLATION PRESSURE STAMPING ON THE...,...,NYSE:,,,,,,,,,
22502,1995-01-03,95V013000,Go to Recall (https://www.nhtsa.gov/recalls?nh...,"PREVOST CAR, INC. Inactive",STRUCTURE:DOOR ASSEMBLY,STRUCTURE,NR (Not Reported),Vehicle,1090,IF A PASSENGER TRIES TO EXIT THE LAVATORY COMP...,...,NYSE:,,,,,,,,,
22503,1995-01-03,95V016000,Go to Recall (https://www.nhtsa.gov/recalls?nh...,"PREVOST CAR, INC. Inactive",EMERGENCY PARKING BRAKE:MECHANICAL,PARKING BRAKE,NR (Not Reported),Vehicle,2075,THE PARKING BRAKE BUTTON CAN BE HIT ACCIDENTLY...,...,NYSE:,,,,,,,,,


In [14]:
def ba_rm_nulls(df=None, col=None):
    """ 
    Helper function to print length of dataframe before removing nulls in a column, then print the length after the nulls are removed.

    Args:
        * df (pd.DataFrame): Dataframe with nulls.
        * col (str): Column name nulls should be removed from.

    Returns:
        * df (pd.DataFrame): Dataframe without nulls.
    """

    print(f"BEFORE NULLS FROM {col} REMOVED: ", len(df))

    df = df[df[f"{col}"].notna()]

    print(f"AFTER NULLS FROM {col} REMOVED: ", len(df))

    return df

In [15]:
df = ba_rm_nulls(df, "ticker")

BEFORE NULLS FROM ticker REMOVED:  22505
AFTER NULLS FROM ticker REMOVED:  1530


### Get Dummies

In [16]:
print(len(df.value_counts("Recall Type")))
print(len(df.value_counts("Component")))

4
32


In [17]:
df = pd.get_dummies(data=df, prefix="RECALL_TYPE_", columns=["Recall Type"])
df = pd.get_dummies(data=df, prefix="COMPONENT_TYPE_", columns=["Component"])

### Bag of Words Featuring 

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

#### Recall Description BOW

In [19]:
df["Recall Description"]

0       Toyota Motor Engineering & Manufacturing (Toyo...
1       Nissan North America, Inc. (Nissan) is recalli...
2       Volvo Car USA, LLC (Volvo) is recalling certai...
3       Thor Motor Coach (TMC) is recalling certain 20...
4       Medix Specialty Vehicles, Inc. (Medix) is reca...
                              ...                        
1897    Porsche Cars North America, Inc. (Porsche) is ...
1898    Volkswagen Group of America, Inc. (Audi) is re...
1899    Daimler Trucks North America, LLC (DTNA) is re...
1900    Honda (American Honda Motor Co.) is recalling ...
1901    Hubbell Incorporated (Delaware) is recalling c...
Name: Recall Description, Length: 1530, dtype: object

In [None]:
recall_vectorizer = CountVectorizer(
    input='content',    
    stop_words='english',
    analyzer='word',
    decode_error='ignore'
)

component_vectorizer = CountVectorizer(
    input='content',    
    stop_words='english',
    analyzer='word',
    decode_error='ignore'
)

In [None]:
X_recall = recall_vectorizer.fit_transform(df["Recall Description"])
X_component = component_vectorizer.fit_transform(df["Component"])

In [24]:
print(recall_vectorizer.vocabulary_)



#### Corrective Action BOW