In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

#### LOAD AND MERGE DATA ####
# Original data set pulled from the FDA site containing recalled items
recall_df = pd.read_csv('test.csv')
# All-inspections data set (found more recently than the recall data)
allinspections_df = pd.read_excel("fda_inspections.xlsx")

# Step 1: Create recalled_bool using FEI Number
# recalled_bool is 1 when the inspection's FEI Number appears among the recall
# file's FIRMFEINUM values, else 0.
# Series.isin performs the membership test for the whole column at once instead
# of rescanning the recall array for every inspection row. Missing FIRMFEINUM
# values are dropped first so that a missing FEI Number is never treated as a
# match (isin would otherwise pair NaN with NaN).
# NOTE(review): matching relies on both columns holding FEI numbers in
# comparable types (e.g. both numeric) — confirm against the raw files.
recalled_fei_numbers = recall_df['FIRMFEINUM'].dropna()
allinspections_df['recalled_bool'] = (
    allinspections_df['FEI Number'].isin(recalled_fei_numbers).astype(int)
)

# Step 2: (OPTIONAL) Override or combine with Classification logic
# If Classification is OAI or VAI, you may consider it risky even if not recalled
class_to_check = ['Official Action Indicated (OAI)', 'Voluntary Action Indicated (VAI)']
allinspections_df['classification_flag'] = allinspections_df['Classification'].isin(class_to_check).astype(int)

# Combine classification with recall label if needed (comment out if you don't want this)
# allinspections_df['recalled_bool'] = allinspections_df[['recalled_bool', 'classification_flag']].max(axis=1)

In [None]:
#### FEATURE ENGINEERING ####
# Parse the inspection end date, then order rows by FEI Number and date so that
# consecutive rows of the same FEI Number are in chronological order.
allinspections_df['Inspection End Date'] = pd.to_datetime(allinspections_df['Inspection End Date'])
allinspections_df = allinspections_df.sort_values(by=['FEI Number', 'Inspection End Date'])

# Calendar features derived from the inspection end date
end_date_parts = allinspections_df['Inspection End Date'].dt
allinspections_df = allinspections_df.assign(
    inspection_year=end_date_parts.year,
    inspection_month=end_date_parts.month,
    inspection_dayofweek=end_date_parts.dayofweek,
    inspection_quarter=end_date_parts.quarter,
    week_of_year=end_date_parts.isocalendar().week,  # ISO week number
)

def get_season(month):
    """Return the season name for a month number.

    Months 12, 1, 2 map to 'Winter'; 3, 4, 5 to 'Spring'; 6, 7, 8 to 'Summer'.
    Every other value — including anything outside 1-12 or a missing value —
    falls through to 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season_name, months in season_months:
        if month in months:
            return season_name
    return 'Fall'

# Season label for each inspection, derived from its month via get_season
inspection_months = allinspections_df['inspection_month']
allinspections_df['inspection_season'] = inspection_months.apply(get_season)

# Days since last inspection per FEI Number.
# diff() is taken within each FEI Number group, so it measures the gap to the
# previous row of the same FEI Number; rows with no previous date to diff
# against (e.g. the first row of a group) get the sentinel -1.
gap_since_previous = allinspections_df.groupby('FEI Number')['Inspection End Date'].diff()
allinspections_df['days_since_last_inspection'] = gap_since_previous.dt.days.fillna(-1)

In [None]:
#### PREPARE DATA FOR MODELING ####
# List of features (adjust if needed)
features = [
    'Posted Citations', 'Classification', 'Project Area', 'Product Type', 'State',
    'Fiscal Year', 'inspection_year', 'inspection_month', 'inspection_dayofweek',
    'inspection_quarter', 'week_of_year', 'inspection_season', 'days_since_last_inspection'
]

# Prepare dataframe for modeling.
# Take an explicit copy so the column assignments below operate on an
# independent frame rather than on a selection of allinspections_df.
df_model = allinspections_df[features + ['recalled_bool']].copy()

# Convert Posted Citations to binary (1 = a value is present, 0 = missing).
# This must run BEFORE dropna(): once rows with a missing Posted Citations
# value have been dropped, notnull() is True for every remaining row and the
# feature collapses to a constant 1 (and those rows are lost from the model).
# NOTE(review): if the raw column stores explicit values (e.g. Yes/No) rather
# than blanks, notnull() is still 1 everywhere — confirm against the raw data.
df_model['Posted Citations'] = df_model['Posted Citations'].notnull().astype(int)

# Drop rows still missing any other feature or the target
df_model = df_model.dropna()

# Encode categorical columns as integer codes.
# Each column gets its own freshly fitted LabelEncoder; the fitted encoders are
# not kept, so the codes cannot be mapped back to labels later.
categorical_cols = ['Classification', 'Project Area', 'Product Type', 'State', 'inspection_season']
for col in categorical_cols:
    df_model[col] = LabelEncoder().fit_transform(df_model[col])

In [None]:
# Write the feature-engineered inspections frame (allinspections_df) to CSV,
# leaving out the row index, then report where it was saved.
output_path = 'final_engineered_features.csv'
allinspections_df.to_csv(path_or_buf=output_path, index=False)

print(f"DataFrame saved to {output_path}")