In [1]:
!pip install rapidfuzz yfinance pandas numpy nltk scikit-learn




[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\Crook\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
from rapidfuzz import process, fuzz
import yfinance as yf
import pandas as pd
import numpy as np
import nltk
import re

import json

In [3]:
# LOAD SEC(https://www.sec.gov/files/company_tickers.json) TICKERS
with open("tickers.json", "r") as f:
    tickers = json.load(f)
    companies = [tickers[i]["title"] for i in tickers]
    abbrs = [tickers[i]["ticker"] for i in tickers]

    comp_tckr = {comp: abbr for comp, abbr in zip(companies, abbrs)}

# LOAD INITIAL RECALL DATA
df = pd.read_csv("recalls.csv")

## Data Cleaning

In [4]:
# delete instances where year is older than 1995
print(len(df))
for i, row in df.iterrows():
    if ( row['Report Received Date'][-4:].isnumeric() and int(row['Report Received Date'][-4:]) < 1995 ):
        df.drop(i, inplace=True)
print(len(df))

29374
22505


In [5]:
# remove column that won't feed into training
df = df.drop("Completion Rate % (Blank - Not Reported)", axis=1)

In [6]:
# from first version - not using this in this version
df = df.drop("Opening Stock Value", axis =1)
df = df.drop("Closing Stock Value", axis =1)

In [7]:
# drop other features not useful for training
df = df.drop("Recall Link", axis =1)
df = df.drop("Mfr Campaign Number", axis =1)

#NHTSA ID kept to be used as a unique identifier

In [8]:
df

Unnamed: 0,Report Received Date,NHTSA ID,Manufacturer,Subject,Component,Recall Type,Potentially Affected,Recall Description,Consequence Summary,Corrective Action,Park Outside Advisory,Do Not Drive Advisory,Stock Abbreviation
0,10/01/2025,25V656000,Toyota Motor Engineering & Manufacturing,Driveshaft May Deform and Break,POWER TRAIN,Vehicle,5960,Toyota Motor Engineering & Manufacturing (Toyo...,A broken driveshaft can impair steering. Vehic...,Dealers will inspect and replace both front dr...,No,No,NYSE:TM
1,10/01/2025,25V655000,"Nissan North America, Inc.",Fire Risk from Quick Charging Battery,ELECTRICAL SYSTEM,Vehicle,19077,"Nissan North America, Inc. (Nissan) is recalli...",A quick charging battery that overheats increa...,Owners are advised not to use Level 3 quick ch...,No,No,NYSE:NSANY
2,09/30/2025,25V654000,"Volvo Car USA, LLC",Power Operated Tailgate May Drop Suddenly,STRUCTURE,Vehicle,1119,"Volvo Car USA, LLC (Volvo) is recalling certai...",A tailgate that suddenly drops can hit a perso...,Owners are advised not to use the POT function...,No,No,NYSE:
3,09/30/2025,25V653000,Thor Motor Coach,LP Tank May Detach and Become a Road Hazard,EQUIPMENT,Vehicle,23,Thor Motor Coach (TMC) is recalling certain 20...,A detached tank can create a road hazard for o...,The remedy is currently under development. Own...,No,No,NYSE:
4,09/26/2025,25E062000,"MEDIX SPECIALTY VEHICLES, INC.",Quick Liner Wheel Covers May Detach and Become...,WHEELS,Equipment,396,"Medix Specialty Vehicles, Inc. (Medix) is reca...",A detached cover can create a road hazard for ...,"Medix will replace the covers, free of charge....",No,No,NYSE:
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23948,01/11/1995,95V004000,"THOMAS BUILT BUSES, INC.",INTERIOR SYSTEMS:ACTIVE RESTRAINTS:BELT RETRAC...,SEAT BELTS,Vehicle,3076,THE METAL HOUSING OF THE SAFETY BELT RETRACTOR...,PASSENGERS WOULD BE UNABLE TO EXTEND OR RETRAC...,DEALERS WILL INSTALL A HARDENED WASHER WHICH W...,No,No,NYSE:
23949,01/03/1995,95T001000,Cooper Tire & Rubber Co.,TIRES:MARKINGS,TIRES,Tire,6603,THE MAXIMUM INFLATION PRESSURE STAMPING ON THE...,,DEALERS WILL REPLACE THE INCORRECT TIRES WITH ...,No,No,NYSE:
23950,01/03/1995,95V013000,"PREVOST CAR, INC. Inactive",STRUCTURE:DOOR ASSEMBLY,STRUCTURE,Vehicle,1090,IF A PASSENGER TRIES TO EXIT THE LAVATORY COMP...,"IF THE WINDOW SEPARATES, THE OCCUPANT CAN FALL...",DEALERS WILL INSTALL A RAIL IN THE LAVATORY CO...,No,No,NYSE:
23951,01/03/1995,95V016000,"PREVOST CAR, INC. Inactive",EMERGENCY PARKING BRAKE:MECHANICAL,PARKING BRAKE,Vehicle,2075,THE PARKING BRAKE BUTTON CAN BE HIT ACCIDENTLY...,UNINTENDED OPERATION OF THE PARKING BRAKE BUTT...,DEALERS WILL INSTALL A PROTECTOR OVER THE PARK...,No,No,NYSE:


In [9]:
import re
from collections import Counter

# Common words to ignore
stop_words = set([
    "inc", "corp", "llc", "co", "ltd", "corporation",
    "company", "limited", "plc", "gmbh", "sa", "ag", "pte", "bv", "kg", "kgaa", "corp", "industries", "us", "of"
])

def tokenize(name):
    name = name.lower()
    name = re.sub(r'[^a-z0-9 ]', ' ', name) 
    words = name.split()
    return [w for w in words if w not in stop_words]

# Build a word-to-company mapping for fast lookup
word_counts = Counter()
company_word_map = {}
for c in companies:
    words = tokenize(c)
    company_word_map[c] = set(words)
    word_counts.update(words)


def match_by_unique_words(name):
    words = tokenize(name)
    if not words:
        return None
    
    # Score companies by sum of inverse word frequencies (rare words are more valuable)
    scores = {}
    for c, c_words in company_word_map.items():
        score = sum(1 / word_counts[w] for w in words if w in c_words)
        if score > 0:
            scores[c] = score
    
    if scores:
        return max(scores, key=scores.get)
    return None

df.loc[:22505, "fuzzy_company"] = df.loc[:22505, "Manufacturer"].apply(match_by_unique_words)

KeyboardInterrupt: 

In [None]:
# get stock ticker
df["ticker"] = df["fuzzy_company"].map(comp_tckr)

# convert to datetime
df["Report Received Date"] = pd.to_datetime(df["Report Received Date"])

# get unique tickers
unique_tickers = df["ticker"].astype(str).unique().tolist()

# get all prices in range (start of day, end of day)
start_date = df["Report Received Date"].min()
end_date = df["Report Received Date"].max() + pd.Timedelta(days=1)
price_data = yf.download(unique_tickers, start=start_date, end=end_date, group_by="ticker", progress=False)

# helper function to grab pricing data
def get_price(ticker, date):
    try:
        day_data = price_data[ticker].loc[date.strftime("%Y-%m-%d")]
        return pd.Series([day_data["Open"], day_data["Close"]])
    except Exception:
        return pd.Series([None, None])

# get open/close price
df[["open", "close"]] = df.apply(lambda x: get_price(x["ticker"], x["Report Received Date"]), axis=1)

# grab info on ticker
info_cache = {}
for ticker in unique_tickers:
    try:
        t = yf.Ticker(ticker)
        info = t.info
        info_cache[ticker] = {
            "market_cap": info.get("marketCap"),
            "eps": info.get("trailingEps"),
            "full_name": info.get("longName"),
        }
    except Exception:
        pass

# merge info back on ticker
info_df = pd.DataFrame(info_cache).T
info_df.index.name = "ticker"
df = df.merge(info_df, on="ticker", how="left")


  price_data = yf.download(unique_tickers, start=start_date, end=end_date, group_by="ticker", progress=False)

44 Failed downloads:
['SKMTF', 'RVSNW', 'F-PD', 'TGE-WT', 'AREBW', 'KII', 'OPTXW', 'SPEGR', 'WSUPW', 'GRRRW', 'QTIWW', 'BKHAR', 'DSYWW', 'ZAPWF', 'FFAIW', 'DC-WT', 'GROVW', 'XOSWW', 'FBYDW', 'GST', 'AMBI-WT', 'CSC', 'AMBP-WT', 'HUBCZ']: YFPricesMissingError('possibly delisted; no price data found  (1d 1995-01-01 00:00:00 -> 2025-10-02 00:00:00)')
['HCAC', 'NSNFY', 'LBRJ', 'LOMWF', 'MADL', 'AMPM', 'TDWD', 'RYPBF', 'TMRD', 'CLSO', 'ODRS', 'ZSICY', 'PLSAY', 'EPDU', 'BBAAY', 'SEAH', 'CFAC', 'METRY']: YFTzMissingError('possibly delisted; no timezone found')
['PXED', 'ALPS']: YFPricesMissingError('possibly delisted; no price data found  (1d 1995-01-01 00:00:00 -> 2025-10-02 00:00:00) (Yahoo error = "Data doesn\'t exist for startDate = 788936400, endDate = 1759377600")')
HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symb

In [None]:
df

Unnamed: 0,Report Received Date,NHTSA ID,Manufacturer,Subject,Component,Recall Type,Potentially Affected,Recall Description,Consequence Summary,Corrective Action,Park Outside Advisory,Do Not Drive Advisory,Stock Abbreviation,fuzzy_company,ticker,open,close,market_cap,eps,full_name
0,2025-10-01,25V656000,Toyota Motor Engineering & Manufacturing,Driveshaft May Deform and Break,POWER TRAIN,Vehicle,5960,Toyota Motor Engineering & Manufacturing (Toyo...,A broken driveshaft can impair steering. Vehic...,Dealers will inspect and replace both front dr...,No,No,NYSE:TM,TOYOTA MOTOR CORP/,TOYOF,17.84000,19.379999,265736028160,2.13,Toyota Motor Corporation
1,2025-10-01,25V655000,"Nissan North America, Inc.",Fire Risk from Quick Charging Battery,ELECTRICAL SYSTEM,Vehicle,19077,"Nissan North America, Inc. (Nissan) is recalli...",A quick charging battery that overheats increa...,Owners are advised not to use Level 3 quick ch...,No,No,NYSE:NSANY,Central North Airport Group,GAERF,13.94000,13.940000,4791702016,0.73,"Grupo Aeroportuario del Centro Norte, S.A.B. d..."
2,2025-09-30,25V654000,"Volvo Car USA, LLC",Power Operated Tailgate May Drop Suddenly,STRUCTURE,Vehicle,1119,"Volvo Car USA, LLC (Volvo) is recalling certai...",A tailgate that suddenly drops can hit a perso...,Owners are advised not to use the POT function...,No,No,NYSE:,"Mister Car Wash, Inc.",MCW,5.30000,5.330000,1831107328,0.27,"Mister Car Wash, Inc."
3,2025-09-30,25V653000,Thor Motor Coach,LP Tank May Detach and Become a Road Hazard,EQUIPMENT,Vehicle,23,Thor Motor Coach (TMC) is recalling certain 20...,A detached tank can create a road hazard for o...,The remedy is currently under development. Own...,No,No,NYSE:,THOR INDUSTRIES INC,THO,104.54486,103.191399,5492275200,4.84,"THOR Industries, Inc."
4,2025-09-26,25E062000,"MEDIX SPECIALTY VEHICLES, INC.",Quick Liner Wheel Covers May Detach and Become...,WHEELS,Equipment,396,"Medix Specialty Vehicles, Inc. (Medix) is reca...",A detached cover can create a road hazard for ...,"Medix will replace the covers, free of charge....",No,No,NYSE:,"Envirotech Vehicles, Inc.",EVTV,2.02000,1.890000,5807748,-10.19,"Envirotech Vehicles, Inc."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22500,1995-01-11,95V004000,"THOMAS BUILT BUSES, INC.",INTERIOR SYSTEMS:ACTIVE RESTRAINTS:BELT RETRAC...,SEAT BELTS,Vehicle,3076,THE METAL HOUSING OF THE SAFETY BELT RETRACTOR...,PASSENGERS WOULD BE UNABLE TO EXTEND OR RETRAC...,DEALERS WILL INSTALL A HARDENED WASHER WHICH W...,No,No,NYSE:,,,,,,,
22501,1995-01-03,95T001000,Cooper Tire & Rubber Co.,TIRES:MARKINGS,TIRES,Tire,6603,THE MAXIMUM INFLATION PRESSURE STAMPING ON THE...,,DEALERS WILL REPLACE THE INCORRECT TIRES WITH ...,No,No,NYSE:,,,,,,,
22502,1995-01-03,95V013000,"PREVOST CAR, INC. Inactive",STRUCTURE:DOOR ASSEMBLY,STRUCTURE,Vehicle,1090,IF A PASSENGER TRIES TO EXIT THE LAVATORY COMP...,"IF THE WINDOW SEPARATES, THE OCCUPANT CAN FALL...",DEALERS WILL INSTALL A RAIL IN THE LAVATORY CO...,No,No,NYSE:,,,,,,,
22503,1995-01-03,95V016000,"PREVOST CAR, INC. Inactive",EMERGENCY PARKING BRAKE:MECHANICAL,PARKING BRAKE,Vehicle,2075,THE PARKING BRAKE BUTTON CAN BE HIT ACCIDENTLY...,UNINTENDED OPERATION OF THE PARKING BRAKE BUTT...,DEALERS WILL INSTALL A PROTECTOR OVER THE PARK...,No,No,NYSE:,,,,,,,


In [None]:
def ba_rm_nulls(df=None, col=None):
    """ 
    Helper function to print length of dataframe before removing nulls in a column, then print the length after the nulls are removed.

    Args:
        * df (pd.DataFrame): Dataframe with nulls.
        * col (str): Column name nulls should be removed from.

    Returns:
        * df (pd.DataFrame): Dataframe without nulls.
    """

    print(f"BEFORE NULLS FROM {col} REMOVED: ", len(df))

    df = df[df[f"{col}"].notna()]

    print(f"AFTER NULLS FROM {col} REMOVED: ", len(df))

    return df

In [None]:
df = ba_rm_nulls(df, "ticker")

BEFORE NULLS FROM ticker REMOVED:  22505
AFTER NULLS FROM ticker REMOVED:  16511


In [None]:
df

Unnamed: 0,Report Received Date,NHTSA ID,Manufacturer,Subject,Component,Recall Type,Potentially Affected,Recall Description,Consequence Summary,Corrective Action,Park Outside Advisory,Do Not Drive Advisory,Stock Abbreviation,fuzzy_company,ticker,open,close,market_cap,eps,full_name
0,2025-10-01,25V656000,Toyota Motor Engineering & Manufacturing,Driveshaft May Deform and Break,POWER TRAIN,Vehicle,5960,Toyota Motor Engineering & Manufacturing (Toyo...,A broken driveshaft can impair steering. Vehic...,Dealers will inspect and replace both front dr...,No,No,NYSE:TM,TOYOTA MOTOR CORP/,TOYOF,17.840000,19.379999,265736028160,2.13,Toyota Motor Corporation
1,2025-10-01,25V655000,"Nissan North America, Inc.",Fire Risk from Quick Charging Battery,ELECTRICAL SYSTEM,Vehicle,19077,"Nissan North America, Inc. (Nissan) is recalli...",A quick charging battery that overheats increa...,Owners are advised not to use Level 3 quick ch...,No,No,NYSE:NSANY,Central North Airport Group,GAERF,13.940000,13.940000,4791702016,0.73,"Grupo Aeroportuario del Centro Norte, S.A.B. d..."
2,2025-09-30,25V654000,"Volvo Car USA, LLC",Power Operated Tailgate May Drop Suddenly,STRUCTURE,Vehicle,1119,"Volvo Car USA, LLC (Volvo) is recalling certai...",A tailgate that suddenly drops can hit a perso...,Owners are advised not to use the POT function...,No,No,NYSE:,"Mister Car Wash, Inc.",MCW,5.300000,5.330000,1831107328,0.27,"Mister Car Wash, Inc."
3,2025-09-30,25V653000,Thor Motor Coach,LP Tank May Detach and Become a Road Hazard,EQUIPMENT,Vehicle,23,Thor Motor Coach (TMC) is recalling certain 20...,A detached tank can create a road hazard for o...,The remedy is currently under development. Own...,No,No,NYSE:,THOR INDUSTRIES INC,THO,104.544860,103.191399,5492275200,4.84,"THOR Industries, Inc."
4,2025-09-26,25E062000,"MEDIX SPECIALTY VEHICLES, INC.",Quick Liner Wheel Covers May Detach and Become...,WHEELS,Equipment,396,"Medix Specialty Vehicles, Inc. (Medix) is reca...",A detached cover can create a road hazard for ...,"Medix will replace the covers, free of charge....",No,No,NYSE:,"Envirotech Vehicles, Inc.",EVTV,2.020000,1.890000,5807748,-10.19,"Envirotech Vehicles, Inc."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21140,1999-02-22,99E004000,Harley-Davidson Motor Company,HARLEY-DAVIDSON/BRACE SUPPORT,,Equipment,1831,EQUIPMENT DESCRIPTION: PART NOS. 66693-98 AND...,THE REAR BRAKE PEDAL CAN COME IN CONTACT WITH ...,DEALERS WILL REMOVE THE RACE BRACE SUPPORT ON ...,No,No,NYSE:HOG,"HARLEY-DAVIDSON, INC.",HOG,17.188709,17.462477,3279495680,2.05,"Harley-Davidson, Inc."
21141,1999-02-18,99V036000,Motor Coach Industries,MCI/STEERING ARM MAINTENANCE,STEERING,Vehicle,19600,VEHICLE DESCRIPTION: TRANSIT COACHES. THE ST...,THESE STEERING ARMS COULD FAIL DUE TO POOR INS...,"DUE TO THE AGE OF THESE VEHICLES, MCI IS NOT O...",No,No,NYSE:,TOYOTA MOTOR CORP/,TOYOF,,,265736028160,2.13,Toyota Motor Corporation
21142,1999-02-17,99E005000,DANA CORPORATION,DANA/SPICER-TIE ROD ASSEMBLIES,STEERING,Equipment,864,"EQUIPMENT DESCRIPTION: SPICER 6,000 OR 8,000 ...",SEPARATION OF THE TIE ROD END COULD RESULT IN ...,DEALERS WILL REMOVE THE TIE ROD ASSEMBLIES AND...,No,No,NYSE:DAN,DANA Inc,DAN,,,2474919168,-0.76,Dana Incorporated
21143,1999-02-17,99E003000,DANA CORPORATION,DANA/SPICER/BEARING CONES,SUSPENSION,Equipment,90,EQUIPMENT DESCRIPTION: FRONT STEERING AXLE AS...,IMPROPER FIT OF THE BEARING TO THE SPINDLE INC...,DEALERS WILL REMOVE THE LEFT AND RIGHT HUB/DRU...,No,No,NYSE:DAN,DANA Inc,DAN,,,2474919168,-0.76,Dana Incorporated


In [None]:
# intermediate step to save compute time during development
df.to_csv('intermediate.csv')

In [None]:
df = pd.read_csv("intermediate.csv")

### Get Dummies

In [11]:
print(len(df.value_counts("Recall Type")))
print(len(df.value_counts("Component")))

4
35


In [12]:
df['Original_Component'] = df['Component'].fillna('')

In [13]:
df = pd.get_dummies(data=df, prefix="RECALL_TYPE_", columns=["Recall Type"])
df = pd.get_dummies(data=df, prefix="COMPONENT_TYPE_", columns=["Component"])

### Bag of Words Featuring 

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

#### Recall Description BOW

In [15]:
df["Recall Description"]

0        Toyota Motor Engineering & Manufacturing (Toyo...
1        Nissan North America, Inc. (Nissan) is recalli...
2        Volvo Car USA, LLC (Volvo) is recalling certai...
3        Thor Motor Coach (TMC) is recalling certain 20...
4        Medix Specialty Vehicles, Inc. (Medix) is reca...
                               ...                        
16506    EQUIPMENT DESCRIPTION:  PART NOS. 66693-98 AND...
16507    VEHICLE DESCRIPTION:  TRANSIT COACHES.  THE ST...
16508    EQUIPMENT DESCRIPTION:  SPICER 6,000 OR 8,000 ...
16509    EQUIPMENT DESCRIPTION:  FRONT STEERING AXLE AS...
16510    SCHOOL, TRANSIT, OR HEAVY DUTY VEHICLES EQUIPP...
Name: Recall Description, Length: 16511, dtype: object

In [16]:
recall_vectorizer = CountVectorizer(
    input='content',    
    stop_words='english',
    analyzer='word',
    decode_error='ignore'
)

component_vectorizer = CountVectorizer(
    input='content',    
    stop_words='english',
    analyzer='word',
    decode_error='ignore'
)

In [17]:
print(df.columns)
len(df.columns)

Index(['Unnamed: 0', 'Report Received Date', 'NHTSA ID', 'Manufacturer',
       'Subject', 'Potentially Affected', 'Recall Description',
       'Consequence Summary', 'Corrective Action', 'Park Outside Advisory',
       'Do Not Drive Advisory', 'Stock Abbreviation', 'fuzzy_company',
       'ticker', 'open', 'close', 'market_cap', 'eps', 'full_name',
       'Original_Component', 'RECALL_TYPE__Child Seat',
       'RECALL_TYPE__Equipment', 'RECALL_TYPE__Tire', 'RECALL_TYPE__Vehicle',
       'COMPONENT_TYPE__AIR BAGS', 'COMPONENT_TYPE__BACK OVER PREVENTION',
       'COMPONENT_TYPE__CHILD SEAT', 'COMPONENT_TYPE__COMMUNICATION',
       'COMPONENT_TYPE__ELECTRICAL SYSTEM',
       'COMPONENT_TYPE__ELECTRONIC STABILITY CONTROL',
       'COMPONENT_TYPE__ELECTRONIC STABILITY CONTROL (ESC)',
       'COMPONENT_TYPE__ENGINE AND ENGINE COOLING',
       'COMPONENT_TYPE__EQUIPMENT',
       'COMPONENT_TYPE__EQUIPMENT ADAPTIVE/MOBILITY',
       'COMPONENT_TYPE__EXTERIOR LIGHTING',
       'COMPONENT_TYPE_

59

In [18]:
X_recall = recall_vectorizer.fit_transform(df["Recall Description"])
X_component = component_vectorizer.fit_transform(df["Original_Component"])

In [19]:
print(recall_vectorizer.vocabulary_)



In [20]:
print(df.columns)
len(df.columns)

Index(['Unnamed: 0', 'Report Received Date', 'NHTSA ID', 'Manufacturer',
       'Subject', 'Potentially Affected', 'Recall Description',
       'Consequence Summary', 'Corrective Action', 'Park Outside Advisory',
       'Do Not Drive Advisory', 'Stock Abbreviation', 'fuzzy_company',
       'ticker', 'open', 'close', 'market_cap', 'eps', 'full_name',
       'Original_Component', 'RECALL_TYPE__Child Seat',
       'RECALL_TYPE__Equipment', 'RECALL_TYPE__Tire', 'RECALL_TYPE__Vehicle',
       'COMPONENT_TYPE__AIR BAGS', 'COMPONENT_TYPE__BACK OVER PREVENTION',
       'COMPONENT_TYPE__CHILD SEAT', 'COMPONENT_TYPE__COMMUNICATION',
       'COMPONENT_TYPE__ELECTRICAL SYSTEM',
       'COMPONENT_TYPE__ELECTRONIC STABILITY CONTROL',
       'COMPONENT_TYPE__ELECTRONIC STABILITY CONTROL (ESC)',
       'COMPONENT_TYPE__ENGINE AND ENGINE COOLING',
       'COMPONENT_TYPE__EQUIPMENT',
       'COMPONENT_TYPE__EQUIPMENT ADAPTIVE/MOBILITY',
       'COMPONENT_TYPE__EXTERIOR LIGHTING',
       'COMPONENT_TYPE_

59

#### Corrective Action BOW

In [21]:
df["Corrective Action"]

0        Dealers will inspect and replace both front dr...
1        Owners are advised not to use Level 3 quick ch...
2        Owners are advised not to use the POT function...
3        The remedy is currently under development. Own...
4        Medix will replace the covers, free of charge....
                               ...                        
16506    DEALERS WILL REMOVE THE RACE BRACE SUPPORT ON ...
16507    DUE TO THE AGE OF THESE VEHICLES, MCI IS NOT O...
16508    DEALERS WILL REMOVE THE TIE ROD ASSEMBLIES AND...
16509    DEALERS WILL REMOVE THE LEFT AND RIGHT HUB/DRU...
16510    DEALERS WILL REPLACE THE COMPLETE TIE ROD ASSE...
Name: Corrective Action, Length: 16511, dtype: object

In [22]:
corrective_action_vectorizer = CountVectorizer(
    input='content',    
    stop_words='english',
    analyzer='word',
    decode_error='ignore'
)

In [23]:
X_corrective_action = corrective_action_vectorizer.fit_transform(df["Corrective Action"])

In [24]:
print(corrective_action_vectorizer.vocabulary_)



In [None]:
df.to_csv('intermediate2.csv')
df = pd.read_csv("intermediate2.csv")

### Linear Regression Experiment

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [26]:
df = df.dropna(subset=['open', 'close'])
y = df['open'] - df['close']
X = df[['Potentially Affected']]

In [27]:
y

0       -1.539999
1        0.000000
2       -0.030000
3        1.353461
4        0.130000
           ...   
16502   -0.214256
16503    0.526635
16504    0.526635
16505   -0.370553
16506   -0.273768
Length: 10125, dtype: float64

In [28]:
X

Unnamed: 0,Potentially Affected
0,5960
1,19077
2,1119
3,23
4,396
...,...
16502,10000
16503,216
16504,89
16505,27373


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
print("X_train_shape:", X_train.shape)
print("X_test_shape:", X_test.shape)
print("y_train_shape:", y_train.shape)
print("y_test_shape:", y_test.shape )

X_train_shape: (8100, 1)
X_test_shape: (2025, 1)
y_train_shape: (8100,)
y_test_shape: (2025,)


In [31]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)

print("Linear Regression Coefficent:", lin_reg.coef_)
print("Linear Regression Intercept:", lin_reg.intercept_)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred_lin))

Linear Regression Coefficent: [0.00374898]
Linear Regression Intercept: -22724.86754803316
Mean Squared Error: 1889518883504.255


In [32]:
print("R2 score: {:.8f}".format(r2_score(y_test, y_pred_lin)))

R2 score: -0.00010747


### Another Experiement lmao

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [34]:
df = df.dropna(subset=['open', 'close'])
X = df.drop('open', axis =1)
X = X.drop('close', axis=1)
y = df['open'] - df['close']

In [35]:
df['Report Received Date'] = pd.to_datetime(df['Report Received Date'])

In [36]:
numeric_feat = ['Potentially Affected']
cat_feat = ['Manufacturer', 'Subject', 'COMPONENT_TYPE__SERVICE BRAKES, HYDRAULIC']

In [37]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [38]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_feat),
        ('cat', categorical_transformer, cat_feat)
    ]
)

In [39]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeRegressor(random_state=42))
])

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- Fit model ---
model.fit(X_train, y_train)

# --- Predict and evaluate ---
y_pred = model.predict(X_test)
#print(classification_report(y_test, y_pred))

In [41]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(mse, r2)

5414898586115.876 -0.9301108597653756


In [42]:
df

Unnamed: 0.1,Unnamed: 0,Report Received Date,NHTSA ID,Manufacturer,Subject,Potentially Affected,Recall Description,Consequence Summary,Corrective Action,Park Outside Advisory,...,"COMPONENT_TYPE__SERVICE BRAKES, HYDRAULIC","COMPONENT_TYPE__SERVICE BRAKES, HYDRAULIC; AUTOHOLD BRAKE SYSTEM/BRAKE HOLD",COMPONENT_TYPE__STEERING,COMPONENT_TYPE__STRUCTURE,COMPONENT_TYPE__SUSPENSION,COMPONENT_TYPE__TIRES,COMPONENT_TYPE__TRAILER HITCHES,COMPONENT_TYPE__VEHICLE SPEED CONTROL,COMPONENT_TYPE__VISIBILITY,COMPONENT_TYPE__WHEELS
0,0,2025-10-01,25V656000,Toyota Motor Engineering & Manufacturing,Driveshaft May Deform and Break,5960,Toyota Motor Engineering & Manufacturing (Toyo...,A broken driveshaft can impair steering. Vehic...,Dealers will inspect and replace both front dr...,No,...,False,False,False,False,False,False,False,False,False,False
1,1,2025-10-01,25V655000,"Nissan North America, Inc.",Fire Risk from Quick Charging Battery,19077,"Nissan North America, Inc. (Nissan) is recalli...",A quick charging battery that overheats increa...,Owners are advised not to use Level 3 quick ch...,No,...,False,False,False,False,False,False,False,False,False,False
2,2,2025-09-30,25V654000,"Volvo Car USA, LLC",Power Operated Tailgate May Drop Suddenly,1119,"Volvo Car USA, LLC (Volvo) is recalling certai...",A tailgate that suddenly drops can hit a perso...,Owners are advised not to use the POT function...,No,...,False,False,False,True,False,False,False,False,False,False
3,3,2025-09-30,25V653000,Thor Motor Coach,LP Tank May Detach and Become a Road Hazard,23,Thor Motor Coach (TMC) is recalling certain 20...,A detached tank can create a road hazard for o...,The remedy is currently under development. Own...,No,...,False,False,False,False,False,False,False,False,False,False
4,4,2025-09-26,25E062000,"MEDIX SPECIALTY VEHICLES, INC.",Quick Liner Wheel Covers May Detach and Become...,396,"Medix Specialty Vehicles, Inc. (Medix) is reca...",A detached cover can create a road hazard for ...,"Medix will replace the covers, free of charge....",No,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16502,21135,1999-02-25,99E008000,"Precision Equipment MFG, LLC",PRECISION ENGINEERING/REMOTE STARTER,10000,EQUIPMENT DESCRIPTION: REMOTE CAR STARTER. T...,"AS A RESULT, THE CONSEQUENCES ARE THE PARKING ...",PRECISION ENGINEERING/DAVID LEVY COMPANY WILL ...,No,...,False,False,False,False,False,False,False,False,False,False
16503,21137,1999-02-23,99T004000,Cooper Tire & Rubber Co.,COOPER/INADEQUATE RUBBER,216,TIRE DESCRIPTION: COOPER COBRA RADIAL G/T P21...,"IF SEPARATION IS NOT DETECTED, CONTINUED USE C...","DEALERS WILL INSPECT THESE TIRES AND REPLACE, ...",No,...,False,False,False,False,False,True,False,False,False,False
16504,21138,1999-02-23,99T003000,Cooper Tire & Rubber Co.,COOPER/FMVSS 119,89,"TIRE DESCRIPTION: COOPER SUPER ROAD SERVICE, ...",THIS CONDITION COULD AFFECT THE LONG TERM SERV...,"DEALERS WILL INSPECT THESE TIRES AND REPLACE, ...",No,...,False,False,False,False,False,True,False,False,False,False
16505,21139,1999-02-23,99E006000,Brake Parts Inc.,BRAKE PARTS/MASTER CYLINDER,27373,EQUIPMENT DESCRIPTION: AFTERMARKET MASTER CYL...,"IF THIS CONDITION OCCURS, LOSS OF BRAKING COUL...",BRAKE PARTS WILL REPLACE THESE CYLINDERS.,No,...,True,False,False,False,False,False,False,False,False,False


### Multiple Linear Regression

In [43]:
df['Report Received Date'] = pd.to_datetime(df['Report Received Date'])  # ensure it's datetime type
df['Date_numeric'] = df['Report Received Date'].map(pd.Timestamp.toordinal)

In [44]:
df['Park Outside Advisory'] = df['Park Outside Advisory'].replace({'Yes': 1, 'No': 0})
df['Do Not Drive Advisory'] = df['Do Not Drive Advisory'].replace({'Yes': 1, 'No': 0})

  df['Park Outside Advisory'] = df['Park Outside Advisory'].replace({'Yes': 1, 'No': 0})
  df['Do Not Drive Advisory'] = df['Do Not Drive Advisory'].replace({'Yes': 1, 'No': 0})


In [45]:
# temp to get model working
df =df.drop(columns=['Stock Abbreviation', 'fuzzy_company','ticker', 'market_cap', 'eps', 'full_name','Original_Component'])

In [46]:
df = df.dropna()

In [47]:
X = df.drop(columns=['Report Received Date', 'NHTSA ID', 'Manufacturer', 'Subject', 'Recall Description', 'Consequence Summary', 'Corrective Action'])
y = df['open'] - df['close']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Model evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("r squared:", r2)
print("Mean Squared Error:", mse)
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

r squared: -312101.65636467916
Mean Squared Error: 9.955226866066356e+17
Coefficients: [ 4.30804393e-13 -1.95676808e-15 -5.46596783e-09  2.32419828e-09
  1.00000000e+00 -1.00000000e+00 -4.85971455e-09  1.19428280e-08
 -3.68905709e-09 -3.39405632e-09 -4.64200794e-09 -4.85456389e-09
 -1.75015390e-09 -4.32387944e-09 -4.71256930e-09 -2.25198728e-10
 -4.51146854e-09 -5.26743865e-09 -4.80831550e-09 -4.61338116e-09
 -6.15878009e-09 -4.32219546e-09 -6.58506224e-09 -5.27572391e-09
 -9.08175817e-09 -3.81854387e-09 -4.19425852e-09 -3.78084210e-09
 -6.55856774e-09 -4.06932516e-09 -4.40048717e-09 -4.30802390e-09
 -6.20784207e-09 -3.71866786e-09 -4.92461334e-09 -4.16550272e-09
  5.37667398e-24 -4.94583485e-09 -3.95837803e-09 -5.11864532e-09
 -2.73993959e-09 -5.70482756e-09 -4.84604755e-09 -4.17355892e-09
 -5.19633613e-09  1.22605168e-12]
Intercept: -8.989227353595197e-07
