In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Load the raw train/test tables from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Columns removed from train only; 'Outcome Subtype' is removed as well
# whenever the file actually provides it.
train_only_columns = ['Name', 'Outcome Time']
if 'Outcome Subtype' in train.columns:
    train_only_columns = ['Name', 'Outcome Time', 'Outcome Subtype']
train = train.drop(columns=train_only_columns)

#Should we drop Date of Birth or convert it?
train = train.drop(columns='Date of Birth')
test = test.drop(columns='Date of Birth')

In [None]:
# Fix missing values
# A missing sex becomes its own 'Unknown' level. The same fill is applied to
# test so that both frames reach the later sex-parsing step without NaN.
train['Sex upon Intake'] = train['Sex upon Intake'].fillna('Unknown')
test['Sex upon Intake'] = test['Sex upon Intake'].fillna('Unknown')

missing_age_count = train['Age upon Intake'].isna().sum()
if missing_age_count > 0:
    # Drop every training row without an intake age. The explicit copy makes
    # the result an independent frame, so later column assignments do not
    # operate on a slice of the original.
    train = train[~train['Age upon Intake'].isna()].copy()
print("Remaining missing values in train:", train.isnull().sum().to_dict())

In [None]:
import re

def age_to_days(age_str):
    """Convert an age description such as ``"2 years"`` into a number of days.

    The text is lower-cased and stripped, then split on whitespace into a
    count and a unit. Supported units are day, week, month (30 days) and
    year (365 days); trailing ``s`` characters on the unit are ignored.

    Parameters
    ----------
    age_str : str or missing value
        Age text in the form ``"<integer> <unit>"``.

    Returns
    -------
    int or None
        The age in days, or ``None`` when the value is missing, is
        ``"unknown"``, does not consist of exactly two tokens, has a
        non-integer count, or uses an unrecognised unit.
    """
    if pd.isna(age_str):
        return None
    age_str = age_str.lower().strip()
    if age_str == 'unknown':
        return None
    # Split into number and unit
    parts = age_str.split()
    if len(parts) != 2:
        return None
    num, unit = parts
    try:
        num = int(num)
    except ValueError:
        # An unparseable count is "unknown", not an age of zero days.
        return None
    # Convert to days
    days_per_unit = {'day': 1, 'week': 7, 'month': 30, 'year': 365}
    factor = days_per_unit.get(unit.rstrip('s'))
    if factor is None:
        return None
    return num * factor

# Derive the numeric AgeDays feature from the free-text intake age
for frame in (train, test):
    frame['AgeDays'] = frame['Age upon Intake'].apply(age_to_days)
print(train[['Age upon Intake','AgeDays']].head(3))
# The text column has been converted, so it can go
train = train.drop(columns='Age upon Intake')
test = test.drop(columns='Age upon Intake')

In [None]:
# Convert Intake Time to datetime and extract components
import numpy as np
import pandas as pd

# Parse 'Intake Time' once per frame and derive the basic calendar features
for frame in (train, test):
    intake_ts = pd.to_datetime(frame['Intake Time'])
    frame['IntakeDatetime'] = intake_ts
    frame['IntakeYear'] = intake_ts.dt.year
    frame['IntakeMonth'] = intake_ts.dt.month
    frame['IntakeHour'] = intake_ts.dt.hour
    frame['IntakeDow'] = intake_ts.dt.dayofweek  # 0=Monday
    # Saturday (5) and Sunday (6) count as weekend
    frame['IsWeekend'] = frame['IntakeDow'].isin((5, 6)).astype(int)

# Add a 'Season' feature based on month
def get_season(month):
    """Return the season name for a month number.

    December-February map to 'Winter', March-May to 'Spring', June-August to
    'Summer'; every other value maps to 'Fall'.
    """
    season_months = (
        ((12, 1, 2), 'Winter'),
        ((3, 4, 5), 'Spring'),
        ((6, 7, 8), 'Summer'),
    )
    for months, label in season_months:
        if month in months:
            return label
    return 'Fall'

for frame in (train, test):
    frame['Season'] = frame['IntakeMonth'].apply(get_season)

    #Cover periodic nature of time!
    # Month and hour are mapped onto a circle (period 12 and 24) and stored
    # as sine/cosine pairs.
    month_angle = 2 * np.pi * frame['IntakeMonth'] / 12
    hour_angle = 2 * np.pi * frame['IntakeHour'] / 24
    frame['Month_sin'] = np.sin(month_angle)
    frame['Month_cos'] = np.cos(month_angle)
    frame['Hour_sin'] = np.sin(hour_angle)
    frame['Hour_cos'] = np.cos(hour_angle)

# Drop original datetime; they have served their purpose with honor
train = train.drop(columns=['Intake Time', 'IntakeDatetime'])
test = test.drop(columns=['Intake Time', 'IntakeDatetime'])

In [None]:
#Breed
def process_breed(breed):
    """Split a raw breed string into primary breed, secondary breed and a mix flag.

    Processing steps:
      * surrounding whitespace is removed;
      * "Black/Tan Hound" is rewritten to "Black Tan Hound" so its slash is
        not taken as a breed separator;
      * a trailing " Mix" label (any case) is removed from the name;
      * the remainder is split on "/". With more than two parts, everything
        before the last slash becomes the primary breed and the last part
        the secondary breed.

    Parameters
    ----------
    breed : str
        Raw breed text.

    Returns
    -------
    tuple of (str, str, int)
        ``(primary, secondary, is_mix)``. ``is_mix`` is 1 when the text
        carried a "Mix" label or named more than one breed, else 0.
        ``secondary`` is "Unknown" for a mix with no second breed named and
        "None" for a single, non-mix breed.
    """
    breed = breed.strip()
    # special case "Black/Tan Hound" is one breed!
    breed = breed.replace("Black/Tan Hound", "Black Tan Hound")
    # Remember whether a trailing "Mix" label was present before removing it;
    # once it is stripped the name alone no longer reveals it.
    breed, mix_suffix_count = re.subn(r'\s+Mix$', '', breed, flags=re.IGNORECASE)
    breed = breed.strip()
    parts = breed.split("/")
    # More than two parts: keep the last part as secondary, join the rest
    if len(parts) > 2:
        parts = ["/".join(parts[:-1]), parts[-1]]
    primary = parts[0].strip()
    secondary = parts[1].strip() if len(parts) > 1 else None
    # Determine if mixed
    labeled_mix = mix_suffix_count > 0 or "mix" in breed.lower()
    is_mix = 1 if (labeled_mix or len(parts) > 1) else 0
    if secondary is None:
        # Labeled mix with no second breed -> "Unknown"; pure breed -> "None"
        secondary = "Unknown" if is_mix else "None"
    return primary, secondary, is_mix

# Parse every Breed value once per frame, then fan the (primary, secondary,
# is_mix) tuples out into their own columns
breed_info_train = train['Breed'].apply(process_breed)
breed_info_test = test['Breed'].apply(process_breed)
for frame, parsed in ((train, breed_info_train), (test, breed_info_test)):
    for position, column in enumerate(('Breed_Primary', 'Breed_Secondary', 'IsMix')):
        frame[column] = parsed.apply(lambda item, position=position: item[position])


# Breeds seen at least 100 times in train keep their own category; the
# counts always come from train, also when grouping test
primary_counts = train['Breed_Primary'].value_counts()
common_primaries = set(primary_counts[primary_counts >= 100].index)
secondary_counts = train['Breed_Secondary'].value_counts()
common_secondaries = set(secondary_counts[secondary_counts >= 100].index)

for frame in (train, test):
    frame['Breed_Primary_Cat'] = frame['Breed_Primary'].apply(
        lambda name: name if name in common_primaries else "Other")
    frame['Breed_Secondary_Cat'] = frame['Breed_Secondary'].apply(
        lambda name: name if name in common_secondaries else "Other")

print("Unique primary breed categories (grouped):", train['Breed_Primary_Cat'].nunique())
print("Unique secondary breed categories (grouped):", train['Breed_Secondary_Cat'].nunique())

# Integer index per grouped category, numbered in sorted order of the
# categories present in train
primary_categories = sorted(train['Breed_Primary_Cat'].unique())
primary_mapping = dict(zip(primary_categories, range(len(primary_categories))))
secondary_categories = sorted(train['Breed_Secondary_Cat'].unique())
secondary_mapping = dict(zip(secondary_categories, range(len(secondary_categories))))

for frame in (train, test):
    frame['Breed_Primary_Emb'] = frame['Breed_Primary_Cat'].map(primary_mapping)
    frame['Breed_Secondary_Emb'] = frame['Breed_Secondary_Cat'].map(secondary_mapping)

In [None]:
#Color
# Coat-pattern keywords. process_color searches the lower-cased Color text for
# each keyword as a substring, and every keyword becomes one
# 'Pattern_<Keyword>' indicator column on train and test.
patterns = ["tabby", "brindle", "tortie", "torbie", "calico", "tricolor", "merle", "point", "tick"]

# Standardize colors: near-synonyms share one name (e.g. Cream and White are
# treated as the same). Later realized the dimensionality is low enough that
# this is not really necessary.
def standardize_color(c):
    """Return the canonical name for a color, or its title-cased form if unmapped.

    The input is title-cased and stripped before the lookup, so matching is
    case-insensitive.
    """
    # (canonical name, spellings that collapse onto it)
    synonym_groups = (
        ("Black", ("Black",)),
        ("Gray", ("Grey", "Gray")),
        ("Brown", ("Brown", "Chocolate", "Liver")),
        ("Tan", ("Tan", "Fawn")),
        ("White", ("White", "Cream", "Ivory")),
        ("Orange", ("Orange",)),
        ("Red", ("Red",)),
        ("Golden", ("Golden", "Gold")),
    )
    name = c.title().strip()
    for canonical, spellings in synonym_groups:
        if name in spellings:
            return canonical
    return name

# Process the Color field
def process_color(color):
    """Parse a raw color string into base colors, a multi-color flag and pattern flags.

    Returns a tuple ``(base_primary, base_secondary, is_multi, pattern_flags)``:
      * base_primary / base_secondary: standardized names of the first two
        '/'-separated parts with trailing pattern words removed; "Unknown"
        when the primary is empty, "None" when there is no secondary;
      * is_multi: 1 when two parts are present or the text contains calico,
        tricolor, tortie or torbie, else 0;
      * pattern_flags: dict mapping each entry of the module-level
        ``patterns`` list to 1/0 depending on whether it occurs in the text.
    """
    color = color.strip().lower()

    pattern_flags = {}
    for keyword in patterns:
        pattern_flags[keyword] = 1 if keyword in color else 0

    # Only the first two '/'-separated parts are used
    parts = [piece.strip() for piece in color.split('/')][:2]
    has_secondary = len(parts) > 1
    primary_part = parts[0]
    secondary_part = parts[1] if has_secondary else None

    def strip_pattern_suffix(name):
        """Remove trailing pattern words (checked in list order) from a part."""
        if name is None:
            return None
        for pat in ["tabby", "brindle", "merle", "point", "tick"]:
            suffix = " " + pat
            if name.endswith(suffix):
                name = name[:-len(suffix)]
        return name.strip()

    def base_color(part, fallback):
        """Standardized base color of a part, or `fallback` when nothing is left."""
        stripped = strip_pattern_suffix(part)
        if not stripped:
            return fallback
        return standardize_color(stripped.title())

    base_primary = base_color(primary_part, "Unknown")
    base_secondary = base_color(secondary_part, "None")

    # Multicolor indicator: two listed colors, or a coat name that is
    # multi-colored by itself
    inherently_multi = any(p in color for p in ["calico", "tricolor", "tortie", "torbie"])
    is_multi = 1 if (has_secondary or inherently_multi) else 0

    return base_primary, base_secondary, is_multi, pattern_flags

# Parse the Color column of each frame, then expand the parsed tuples into
# base-color, multi-color and per-pattern indicator columns
color_info_train = train['Color'].apply(process_color)
color_info_test = test['Color'].apply(process_color)

for frame, parsed in ((train, color_info_train), (test, color_info_test)):
    for position, column in enumerate(('Color_Primary', 'Color_Secondary', 'IsMultiColor')):
        frame[column] = parsed.apply(lambda item, position=position: item[position])
    # Element 3 of each tuple is the {pattern: 0/1} dict
    for p in patterns:
        frame['Pattern_' + p.capitalize()] = parsed.apply(lambda item, p=p: item[3][p])

In [None]:
#Location
def extract_city(location):
    """Extract the city name from a 'Found Location' string.

    Rules, applied in order to the stripped text:
      1. any text containing "Outside" -> "Outside";
      2. "<address> in <city> (<state>)" -> the text after the last " in ",
         up to the last " (" that follows it (or to the end of the string
         when no such marker exists);
      3. "<city> (TX)" -> the text before the last " (" (or before the
         "(TX)" suffix when it is not preceded by a space);
      4. anything else is returned unchanged.

    Parameters
    ----------
    location : str
        Raw location text.

    Returns
    -------
    str
        The extracted city, "Outside", or the stripped input.
    """
    loc = location.strip()
    if "Outside" in loc:
        return "Outside"
    in_pos = loc.rfind(" in ")
    if in_pos != -1:
        city_start = in_pos + len(" in ")
        # Only look for the state marker after the city starts. A missing
        # marker (rfind == -1) must not be used as a slice end, otherwise the
        # last character of the city is cut off.
        paren_pos = loc.rfind(" (", city_start)
        if paren_pos == -1:
            return loc[city_start:]
        return loc[city_start:paren_pos]
    if loc.endswith("(TX)"):
        paren_pos = loc.rfind(" (")
        if paren_pos == -1:
            # "(TX)" attached without a space: strip the suffix itself
            return loc[:-len("(TX)")].strip()
        return loc[:paren_pos]
    return loc

# Extract the found city
for frame in (train, test):
    frame['Found_City'] = frame['Found Location'].apply(extract_city)

# Minimum number of training rows a city needs to be treated individually
min_threshold = 100
city_counts = train['Found_City'].value_counts()
cities_to_keep = [city for city, count in city_counts.items() if count >= min_threshold]

def group_city(city, cities_to_keep=cities_to_keep):
    """Return `city` if it is in `cities_to_keep`, otherwise "Other Found City"."""
    if city in cities_to_keep:
        return city
    return "Other Found City"

for frame in (train, test):
    frame['Found_City_Grouped'] = frame['Found_City'].apply(group_city)

print(train['Found_City_Grouped'].value_counts())

# Found in Austin or not
for frame in (train, test):
    frame['Found_In_Austin'] = frame['Found_City'].eq('Austin').astype(int)

# The ungrouped city is only an intermediate column
train = train.drop(columns=['Found_City'])
test = test.drop(columns=['Found_City'])

In [None]:
#Sex upon Intake
# Split "Sex upon Intake" into Gender and Fixed status
def split_sex(sex_str):
    """Split a 'Sex upon Intake' value into gender and fixed (sterilised) status.

    Parameters
    ----------
    sex_str : str or missing value
        Text such as "Neutered Male", "Intact Female" or "Unknown".

    Returns
    -------
    tuple of (str, str)
        ``(gender, fixed)``. ``gender`` is "Female", "Male" or "Unknown";
        ``fixed`` is "Yes" (text starts with neutered/spayed), "No" (starts
        with intact) or "Unknown". Any non-string input, such as NaN or
        None, yields ``("Unknown", "Unknown")``.
    """
    # Missing values reach this function as NaN/None, which have no .lower()
    if not isinstance(sex_str, str):
        return "Unknown", "Unknown"
    s = sex_str.lower()
    if s.startswith(("neutered", "spayed")):
        fixed = "Yes"
    elif s.startswith("intact"):
        fixed = "No"
    else:
        fixed = "Unknown"
    # "female" must be tested first because it contains "male"
    if "female" in s:
        gender = "Female"
    elif "male" in s:
        gender = "Male"
    else:
        gender = "Unknown"
    return gender, fixed

# Expand the (gender, fixed) tuples into two columns, aligned on each
# frame's own index
for frame in (train, test):
    parsed_sex = frame['Sex upon Intake'].apply(split_sex).tolist()
    frame[['Gender','Fixed']] = pd.DataFrame(parsed_sex, index=frame.index)
# Drop the original Sex column
train = train.drop(columns='Sex upon Intake')
test = test.drop(columns='Sex upon Intake')
print(train[['Gender','Fixed']].head(5))

In [None]:
# Prepare target variable
y = train['Outcome Type']

# Drop all raw and unnecessary text columns
columns_to_drop = [
    'Outcome Type',   # target; never a feature
    'Breed',
    'Color',
    'Found Location',
    'Id',
    'Breed_Primary',
    'Breed_Secondary',
    'Color_Primary',
    'Color_Secondary'
]

train_features = train.drop(columns=columns_to_drop)
# The target is removed from the test features whenever test carries it, so
# the labels can never end up in the feature matrix. It is skipped when test
# has no such column; all other columns are still required to exist.
test_columns_to_drop = [col for col in columns_to_drop
                        if col != 'Outcome Type' or col in test.columns]
test_features = test.drop(columns=test_columns_to_drop)

# Now check remaining string columns
print("Remaining object columns:", train_features.select_dtypes(include=['object']).columns.tolist())

# One-hot encode train and test together so both get the same dummy columns
full_data = pd.concat([train_features, test_features], axis=0, ignore_index=True)
categorical_cols = ['Animal Type','Intake Type','Intake Condition','Found_City_Grouped',
                    'Breed_Primary_Cat','Breed_Secondary_Cat','Gender','Fixed',
                    'IntakeYear','IntakeMonth']
full_dummies = pd.get_dummies(full_data, columns=categorical_cols, drop_first=False)
print("Total features after one-hot:", full_dummies.shape[1])

# Split encoded data back into train/test (train rows come first)
X_train_enc = full_dummies.iloc[:len(train_features), :].copy()
X_test_enc = full_dummies.iloc[len(train_features):, :].copy()

# Standardize: statistics are fitted on train only and reused for test
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_enc)
X_test_scaled = scaler.transform(X_test_enc)


# Encode the test labels with the class -> index mapping derived from the
# training target. The mapping is built here so this cell does not depend on
# a variable that a later cell defines.
class_to_idx = {cls: idx for idx, cls in enumerate(sorted(y.unique()))}
if 'Outcome Type' in test.columns:
    y_test = test['Outcome Type']
    y_test_num = y_test.map(class_to_idx)
else:
    # Unlabelled test set: nothing to evaluate against
    y_test = None
    y_test_num = None

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
import lightgbm as lgb # Import for callbacks
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler # Assuming X_train_scaled is pre-scaled
import optuna
import functools


# Encode the target labels as integers, numbered in sorted label order
classes = sorted(y.unique())
class_to_idx = dict(zip(classes, range(len(classes))))
y_num = y.map(class_to_idx)
print(f"Target classes: {classes}")
print(f"Encoded target mapping: {class_to_idx}")

# Compute class weights: total_samples / (num_classes * class_count)
num_classes = len(classes)
total_samples = len(y_num)
class_counts = y_num.value_counts().to_dict()

class_weights = {}
for cls_idx, count in class_counts.items():
    class_weights[cls_idx] = total_samples / (num_classes * count)
print(f"Calculated Class weights: {class_weights}")



print("Splitting data into training and validation sets for Optuna...")
# 80/20 split, stratified on the target, with a fixed seed
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_scaled,
    y_num,
    test_size=0.2,
    stratify=y_num,
    random_state=42,
)
print(f"Training set shape: {X_tr.shape}, Validation set shape: {X_val.shape}")

def objective(trial, X_tr, y_tr, X_val, y_val, class_weights):
    """Optuna objective: fit one LightGBM classifier and score it on validation data.

    Hyper-parameters are sampled from `trial`, the model is fitted on
    (X_tr, y_tr) with early stopping monitored on (X_val, y_val), and the
    balanced accuracy of its validation predictions is returned. If fitting
    or scoring raises, the error is printed and 0.0 is returned.
    """
    # Settings shared by every trial
    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'n_estimators': 1000,
        'class_weight': class_weights,
        'random_state': 42,
        'n_jobs': -1,
    }

    # Search space for this trial
    params.update({
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 256),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        # Include 0 to disable bagging
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 7),
    })

    model = lgb.LGBMClassifier(**params)
    stop_early = lgb.early_stopping(stopping_rounds=50, verbose=False)

    try:
        # Train with early stopping on the validation log-loss
        model.fit(
            X_tr,
            y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric='multi_logloss',
            callbacks=[stop_early],
        )
        # Balanced accuracy on the validation set is the value to maximize
        return balanced_accuracy_score(y_val, model.predict(X_val))
    except Exception as err:
        print(f"Trial {trial.number} failed with error: {err}. Parameters: {trial.params}")
        return 0.0


# Bind the data splits and class weights so Optuna can call objective(trial)
objective_with_data = functools.partial(objective,
                                        X_tr=X_tr, y_tr=y_tr,
                                        X_val=X_val, y_val=y_val,
                                        class_weights=class_weights)

# Search budget: optimization stops at whichever limit is reached first
N_TRIALS = 100          # Adjust as needed
TIMEOUT_SECONDS = 600
SEARCH_SEED = 42

print("Creating Optuna study...")
# Create an Optuna study object to maximize balanced accuracy. The sampler is
# seeded so the sequence of suggested hyper-parameters is repeatable; the
# wall-clock timeout can still change how many trials complete.
study = optuna.create_study(direction="maximize",
                            study_name="lgbm_multiclass_tuning",
                            sampler=optuna.samplers.TPESampler(seed=SEARCH_SEED))

print(f"Starting Optuna optimization with {N_TRIALS} trials...")
# Run the optimization process
study.optimize(objective_with_data,
               n_trials=N_TRIALS,
               timeout=TIMEOUT_SECONDS)




print(f"Best trial number: {study.best_trial.number}")
print(f"Best value (Balanced Accuracy): {study.best_value:.5f}")
print("Best hyperparameters found:")
best_hyperparams = study.best_params
for key, value in best_hyperparams.items():
    print(f"  {key}: {value}")

# Settings that are not tuned. No validation set is held out for the final
# fit, so the number of trees is fixed instead of using early stopping.
final_fixed_params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'class_weight': class_weights,
    'random_state': 42, #HGTTG
    'n_jobs': -1,
    'n_estimators': 300
}


# Tuned values are merged on top of the fixed settings
final_params = {**final_fixed_params, **best_hyperparams}


print("\nTraining final model with best hyperparameters on the full training data")

final_model = lgb.LGBMClassifier(**final_params)

# Train the final model on the entire training dataset (X_train_scaled, y_num)
final_model.fit(X_train_scaled, y_num)

print("Final model trained successfully.")

test_preds = final_model.predict(X_test_scaled)
if y_test_num is not None:
    print("\nFinal Model Evaluation on Test Set:")
    print("Balanced Accuracy on test:", balanced_accuracy_score(y_test_num, test_preds))
    print("Test classification report:")
    # Passing every encoded class as `labels` keeps the report aligned with
    # `target_names` even when a class does not occur in the test set.
    print(classification_report(y_test_num, test_preds,
                                labels=list(range(len(classes))),
                                target_names=classes))
else:
    print("\nNo test labels available; skipping test-set evaluation.")