In [None]:
import pandas as pd
from datetime import datetime

# Read the raw training and test CSV files into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

# Feature engineering. `preprocess_animals` is the single entry point; the
# private helpers below each parse one raw text value.
def _split_sex(value):
    """Split a "Sex upon Intake" value into a `(gender, neuter_status)` pair.

    Anything that is not exactly two whitespace-separated words (including the
    literal "Unknown") yields `("Unknown", "Unknown")`.
    """
    if value == "Unknown":
        return ("Unknown", "Unknown")
    parts = value.split()
    if len(parts) != 2:
        return ("Unknown", "Unknown")
    status, gender = parts
    if "Male" in gender:
        gender = "Male"
    elif "Female" in gender:
        gender = "Female"
    else:
        gender = "Unknown"
    # "Spayed" and "Neutered" are folded into one "Neutered" level.
    if status in ("Neutered", "Spayed"):
        neutered = "Neutered"
    elif status == "Intact":
        neutered = "Intact"
    else:
        neutered = "Unknown"
    return (gender, neutered)


def _parse_breed(raw):
    """Return `(primary_breed, is_mix)` for one raw breed value.

    A trailing " Mix" is stripped and flags the animal as a mix; a "/" also
    flags a mix, and only the part before the first "/" is kept.
    """
    text = str(raw)
    if text.endswith(" Mix"):
        text = text[:-4]
        is_mix = 1
    else:
        is_mix = 1 if "/" in text else 0
    # split("/")[0] is the whole string when there is no "/".
    primary = text.split("/")[0].strip()
    if primary == "" or primary.lower() == "unknown":
        primary = "UnknownBreed"
    return primary, is_mix


def _parse_color(raw):
    """Return `(primary_color, multi_color)` for one raw color value."""
    text = str(raw)
    if "/" in text:
        multi = 1
        primary = text.split("/")[0].strip()
    else:
        multi = 0
        # NOTE(review): a single color keeps only its first word ("Brown Tabby"
        # -> "Brown") while a slash color keeps the whole first component
        # ("Brown Tabby/White" -> "Brown Tabby") -- confirm this is intended.
        primary = text.split()[0] if text.strip() != "" else "UnknownColor"
    if primary == "" or primary.lower() == "unknown":
        primary = "UnknownColor"
    return primary, multi


def _parse_city(raw):
    """Return the city part of one raw "Found Location" value.

    The " (TX)" suffix is dropped and, when the value contains " in ", only
    the text after the last " in " is kept.
    """
    text = str(raw).replace(" (TX)", "")
    # split(" in ")[-1] is the whole string when " in " is absent.
    city = text.split(" in ")[-1].strip()
    if city == "" or city.lower() == "unknown":
        city = "UnknownCity"
    return city


def preprocess_animals(df: pd.DataFrame) -> pd.DataFrame:
    """Build the model feature frame from a raw intake DataFrame.

    Reads the columns "Intake Time", "Date of Birth", "Animal Type",
    "Sex upon Intake", "Intake Type", "Intake Condition", "Breed", "Color"
    and "Found Location".

    Returns a new DataFrame (the input is not modified) with the columns
    AgeDays, Weekday, Hour, Month, Year, AnimalType, Sex, Neutered,
    IntakeType, IntakeCondition, PrimaryBreed, IsMix, PrimaryColor,
    MultiColor and City, in that order.
    """
    out = pd.DataFrame()

    # `infer_datetime_format` is deprecated (and a no-op) since pandas 2.0, so
    # it is no longer passed; format inference is the default behaviour.
    intake_dt = pd.to_datetime(df["Intake Time"])
    dob = pd.to_datetime(df["Date of Birth"])

    # Age in whole days at intake; negative ages are clamped to 0.
    out["AgeDays"] = (intake_dt - dob).dt.days.clip(lower=0)

    # Calendar features of the intake timestamp
    out["Weekday"] = intake_dt.dt.dayofweek
    out["Hour"] = intake_dt.dt.hour
    out["Month"] = intake_dt.dt.month
    out["Year"] = intake_dt.dt.year

    out["AnimalType"] = df["Animal Type"]

    # Sex and neuter status
    sex_pairs = [_split_sex(v) for v in df["Sex upon Intake"].fillna("Unknown")]
    out["Sex"] = [gender for gender, _ in sex_pairs]
    out["Neutered"] = [status for _, status in sex_pairs]

    # Intake type and condition
    out["IntakeType"] = df["Intake Type"]
    out["IntakeCondition"] = df["Intake Condition"].fillna("Unknown")

    # Breed: primary breed + mix flag
    breed_pairs = [_parse_breed(b) for b in df["Breed"].fillna("Unknown")]
    out["PrimaryBreed"] = [primary for primary, _ in breed_pairs]
    out["IsMix"] = [flag for _, flag in breed_pairs]

    # Color: primary color + multi-color flag
    color_pairs = [_parse_color(c) for c in df["Color"].fillna("Unknown")]
    out["PrimaryColor"] = [primary for primary, _ in color_pairs]
    out["MultiColor"] = [flag for _, flag in color_pairs]

    # Found Location -> City
    out["City"] = [_parse_city(loc) for loc in df["Found Location"].fillna("Unknown")]
    return out

# Apply preprocessing to train and test
X_train = preprocess_animals(train_df)
X_test = preprocess_animals(test_df)

# Group rare IntakeCondition categories. The rare set is derived from training
# data only and then applied unchanged to the test set.
condition_counts = X_train["IntakeCondition"].value_counts()
rare_conditions = condition_counts[condition_counts <= 10].index
rare_to_other = {cond: "Other" for cond in rare_conditions}
X_train["IntakeCondition"] = X_train["IntakeCondition"].replace(rare_to_other)
X_test["IntakeCondition"] = X_test["IntakeCondition"].replace(rare_to_other)

# Group rare cities: keep the 6 most frequent training cities, bucket the rest
top_cities = X_train["City"].value_counts().nlargest(6).index
X_train["City"] = X_train["City"].where(X_train["City"].isin(top_cities), "Other")
X_test["City"] = X_test["City"].where(X_test["City"].isin(top_cities), "Other")

# Breed and Color frequency encoding (counts come from training data;
# values never seen in training get a count of 0 in the test set)
breed_counts = X_train["PrimaryBreed"].value_counts()
color_counts = X_train["PrimaryColor"].value_counts()
X_train["BreedCount"] = X_train["PrimaryBreed"].map(breed_counts)
X_train["ColorCount"] = X_train["PrimaryColor"].map(color_counts)
X_test["BreedCount"] = X_test["PrimaryBreed"].map(lambda b: breed_counts.get(b, 0))
X_test["ColorCount"] = X_test["PrimaryColor"].map(lambda c: color_counts.get(c, 0))

# Cast categorical features to a dtype shared by train and test so both frames
# use the same category levels.
categorical_cols = ["AnimalType", "Sex", "Neutered", "IntakeType", "IntakeCondition",
                    "PrimaryBreed", "PrimaryColor", "City", "Weekday"]
for col in categorical_cols:
    # AnimalType / IntakeType (and Weekday, when the intake time is missing) can
    # contain NaN. CategoricalDtype rejects null categories, so drop them here;
    # missing values then stay NaN after the cast instead of raising.
    all_cats = pd.concat([X_train[col], X_test[col]]).dropna().unique()
    shared_dtype = pd.api.types.CategoricalDtype(categories=all_cats)
    X_train[col] = X_train[col].astype(shared_dtype)
    X_test[col] = X_test[col].astype(shared_dtype)

print("Final feature columns:", list(X_train.columns))
print("Train set size:", X_train.shape, " Test set size:", X_test.shape)

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from lightgbm import LGBMClassifier

# Encode the target labels as integers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(train_df["Outcome Type"])

# Define a pipeline: oversample then LightGBM.
# The sampler sits inside the (imblearn) pipeline so that, during cross-validation,
# it is fitted on each training split only.
# subsample_freq=1 is required for the `subsample` values searched below to have
# any effect: LightGBM ignores the bagging fraction while the bagging frequency
# is 0, which is the LGBMClassifier default.
pipeline = Pipeline([
    ("oversample", RandomOverSampler(sampling_strategy="not majority", random_state=42)),
    ("lgbm", LGBMClassifier(objective="multiclass", boosting_type="dart",
                            subsample_freq=1, random_state=42)),
])

# Hyperparameter search space (keys are prefixed with the pipeline step name)
param_dist = {
    "lgbm__num_leaves": [31, 63, 127, 255],
    "lgbm__max_depth": [-1, 10, 20, 50],
    "lgbm__min_child_samples": [20, 50, 100],
    "lgbm__learning_rate": [0.1, 0.05, 0.01],
    "lgbm__n_estimators": [100, 300, 500],
    "lgbm__colsample_bytree": [0.6, 0.8, 1.0],
    "lgbm__subsample": [0.7, 0.85, 1.0],
    "lgbm__reg_alpha": [0, 1, 5],
    "lgbm__reg_lambda": [0, 1, 5],
}

# Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomized search: 20 sampled configurations scored by balanced accuracy
random_search = RandomizedSearchCV(pipeline, param_dist, n_iter=20, scoring="balanced_accuracy",
                                   cv=cv, verbose=1, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)
print(f"Best CV Balanced Accuracy: {random_search.best_score_:.4f}")
print("Best Hyperparameters:", random_search.best_params_)

In [None]:
# Final model: best hyperparameters fitted on the full training data.
# RandomizedSearchCV was created with its default refit=True, so after the search
# it has already re-fitted the best configuration on all of (X_train, y_train) and
# exposes it as `best_estimator_`. Reusing it avoids a second, identical fit.
best_params = random_search.best_params_
pipeline = random_search.best_estimator_

# Predict on test set and map the integer classes back to outcome labels
y_pred = pipeline.predict(X_test)
y_pred_labels = le.inverse_transform(y_pred)
print("Test predictions distribution:")
print(pd.Series(y_pred_labels).value_counts(normalize=True))

# Save predictions to csv
output_df = pd.DataFrame({"Id": test_df["Id"], "PredictedOutcome": y_pred_labels})
output_df.to_csv("predicted_outcomes.csv", index=False)