In [None]:
import pandas as pd

# Read the raw competition files and report what was loaded
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print("Train shape:", train.shape, " Test shape:", test.shape)
print("Train columns:", list(train.columns))
print("Test columns:", list(test.columns))

# Columns that exist only in train (not available at prediction time).
# 'Outcome Subtype' is removed only when the file actually carries it.
outcome_only_cols = ['Name', 'Outcome Time']
if 'Outcome Subtype' in train.columns:
    outcome_only_cols.append('Outcome Subtype')
train = train.drop(columns=outcome_only_cols)

# 'Date of Birth' is redundant with the intake-age column used further down
train = train.drop(columns='Date of Birth')
test = test.drop(columns='Date of Birth')

# Show what is left in each frame
print("Train columns after drop:", list(train.columns))
print("Test columns after drop:", list(test.columns))


In [None]:
# Fix missing values.
# 'Sex upon Intake' is parsed with string methods (split_sex) for BOTH frames,
# so blanks have to be filled in test as well as in train; otherwise a single
# missing value in test makes the later parsing step fail.
train['Sex upon Intake'] = train['Sex upon Intake'].fillna('Unknown')
test['Sex upon Intake'] = test['Sex upon Intake'].fillna('Unknown')

# Training rows without an intake age are discarded (every such row, however
# many there are). Test rows are kept: each of them still needs a prediction.
missing_age_count = train['Age upon Intake'].isna().sum()
if missing_age_count > 0:
    # .copy() makes the filtered frame independent, so the column assignments
    # in later cells do not write into a slice of the original frame.
    train = train[train['Age upon Intake'].notna()].copy()
print("Remaining missing values in train:", train.isnull().sum().to_dict())

In [None]:
import re

def age_to_days(age_str):
    """Convert an age description such as "2 years" into an approximate day count.

    Parameters
    ----------
    age_str : str or missing value
        Text of the form "<integer> <unit>" where the unit is day(s), week(s),
        month(s) or year(s). Case and surrounding whitespace are ignored.

    Returns
    -------
    int or None
        The age in days (a month counts as 30 days, a year as 365), or None
        when the value is missing, is not text, is "unknown", has an
        unrecognised unit, or has a count that is not an integer.
    """
    # None / NaN and any other non-text input carry no usable age.
    if not isinstance(age_str, str):
        return None

    # "unknown" (one token) and anything else that is not exactly
    # "<count> <unit>" is rejected here.
    tokens = age_str.lower().split()
    if len(tokens) != 2:
        return None
    raw_count, unit = tokens

    try:
        count = int(raw_count)
    except ValueError:
        # An unreadable count is a missing value, not an age of 0 days:
        # returning 0 would invent a newborn animal.
        return None

    days_per_unit = {'day': 1, 'week': 7, 'month': 30, 'year': 365}
    factor = days_per_unit.get(unit.rstrip('s'))  # accept singular and plural
    if factor is None:
        return None
    return count * factor

# Add the numeric age feature (in days) to both frames
train = train.assign(AgeDays=train['Age upon Intake'].apply(age_to_days))
test = test.assign(AgeDays=test['Age upon Intake'].apply(age_to_days))
print(train[['Age upon Intake', 'AgeDays']].head(3))

# The text version of the age is no longer needed
train = train.drop(columns=['Age upon Intake'])
test = test.drop(columns=['Age upon Intake'])

In [None]:
# Parse 'Intake Time' and derive calendar features; the same steps are applied
# to train and test, so they are written once and looped over both frames.
for _frame in (train, test):
    _frame['IntakeDatetime'] = pd.to_datetime(_frame['Intake Time'])
    _stamp = _frame['IntakeDatetime'].dt
    _frame['IntakeYear'] = _stamp.year
    _frame['IntakeMonth'] = _stamp.month
    _frame['IntakeHour'] = _stamp.hour
    _frame['IntakeDow'] = _stamp.dayofweek
    # dayofweek: Monday=0 ... Sunday=6, so 5 and 6 are Saturday/Sunday
    _frame['IsWeekend'] = _frame['IntakeDow'].isin([5, 6]).astype(int)

# Raw text and the intermediate datetime column are not used as features
_intake_helper_cols = ['Intake Time', 'IntakeDatetime']
train = train.drop(columns=_intake_helper_cols)
test = test.drop(columns=_intake_helper_cols)

In [None]:
import numpy as np
#Breed Parsing
def process_breed(breed):
    """Split a raw breed description into primary breed, secondary breed and a mix flag.

    Parameters
    ----------
    breed : str
        Raw breed text, e.g. "Labrador Retriever Mix" or "Pit Bull/Boxer".

    Returns
    -------
    tuple of (str, str, int)
        primary   -- text before the last "/" (the whole cleaned text when
                     there is no "/").
        secondary -- text after the last "/"; "Unknown" for a mix with no
                     named second breed; "None" for a single, non-mix breed.
        is_mix    -- 1 when the text carried a trailing "Mix" marker, still
                     contains "mix", or names two breeds; otherwise 0.
    """
    breed = breed.strip()
    # This breed name contains a "/" of its own; neutralise it so it is not
    # mistaken for a primary/secondary separator.
    breed = breed.replace("Black/Tan Hound", "Black Tan Hound")

    # Strip a trailing "Mix" but remember that it was there. The flag has to
    # be captured here: once the suffix is gone, the text no longer shows
    # that the animal is a mix.
    breed, mix_suffix_count = re.subn(r'\s+Mix$', '', breed, flags=re.IGNORECASE)
    breed = breed.strip()

    parts = breed.split("/")
    if len(parts) > 2:
        # More than two names: everything before the last "/" stays together
        # as the primary breed.
        parts = ["/".join(parts[:-1]), parts[-1]]
    primary = parts[0].strip()
    secondary = parts[1].strip() if len(parts) > 1 else None

    named_as_mix = mix_suffix_count > 0 or "mix" in breed.lower()
    is_mix = 1 if (named_as_mix or len(parts) > 1) else 0

    if secondary is None:
        secondary = "Unknown" if is_mix else "None"
    return primary, secondary, is_mix


# Parse every breed once per frame, then unpack the (primary, secondary, is_mix)
# tuples into their own columns.
_breed_fields = ('Breed_Primary', 'Breed_Secondary', 'IsMix')

breed_info_train = train['Breed'].apply(process_breed)
for _pos, _col in enumerate(_breed_fields):
    train[_col] = breed_info_train.apply(lambda parsed, _pos=_pos: parsed[_pos])

breed_info_test = test['Breed'].apply(process_breed)
for _pos, _col in enumerate(_breed_fields):
    test[_col] = breed_info_test.apply(lambda parsed, _pos=_pos: parsed[_pos])

# Quick look at the parsed result
print(train[['Breed', 'Breed_Primary', 'Breed_Secondary', 'IsMix']].head(5))


In [None]:
# Collapse infrequent primary breeds: anything seen fewer than 100 times in
# train becomes "Other". The keep-list comes from train only and is then
# applied unchanged to test.
primary_counts = train['Breed_Primary'].value_counts()
common_primaries = set(primary_counts.loc[primary_counts.ge(100)].index)
train['Breed_Primary_Cat'] = train['Breed_Primary'].where(
    train['Breed_Primary'].isin(common_primaries), "Other")
test['Breed_Primary_Cat'] = test['Breed_Primary'].where(
    test['Breed_Primary'].isin(common_primaries), "Other")

# Same treatment for the secondary breed
secondary_counts = train['Breed_Secondary'].value_counts()
common_secondaries = set(secondary_counts.loc[secondary_counts.ge(100)].index)
train['Breed_Secondary_Cat'] = train['Breed_Secondary'].where(
    train['Breed_Secondary'].isin(common_secondaries), "Other")
test['Breed_Secondary_Cat'] = test['Breed_Secondary'].where(
    test['Breed_Secondary'].isin(common_secondaries), "Other")

print("Unique primary breed categories (grouped):", train['Breed_Primary_Cat'].nunique())
print("Unique secondary breed categories (grouped):", train['Breed_Secondary_Cat'].nunique())

In [None]:
# Coat-pattern keywords (collected by printing the raw colour values)
patterns = ["tabby","brindle","tortie","torbie","calico","tricolor","merle","point","tick"]

def process_color(color):
    """Break a raw colour description into base colours and pattern flags.

    Returns a 4-tuple:
      base_primary   -- first "/"-separated colour, title-cased, with a trailing
                        pattern word removed ("Unknown" when empty)
      base_secondary -- second colour treated the same way, or "None" if absent
      is_multi       -- 1 if two colours are listed or the text names an
                        inherently multi-coloured pattern, else 0
      pattern_flags  -- dict mapping every entry of `patterns` to 0/1
    """
    lowered = color.strip().lower()

    # One 0/1 flag per known pattern keyword (substring match)
    flags = {}
    for keyword in patterns:
        flags[keyword] = int(keyword in lowered)

    def _drop_trailing_pattern(text):
        # Remove a pattern word that ends the colour, e.g. "brown tabby" -> "brown"
        if text is None:
            return None
        for suffix in (" tabby", " brindle", " merle", " point", " tick"):
            if text.endswith(suffix):
                text = text[:-len(suffix)]
        return text.strip()

    # Only the first two "/"-separated colours are used
    pieces = [piece.strip() for piece in lowered.split('/')][:2]
    first = pieces[0]
    second = pieces[1] if len(pieces) == 2 else None

    if first:
        base_primary = _drop_trailing_pattern(first).title()
    else:
        base_primary = "Unknown"

    if second:
        base_secondary = _drop_trailing_pattern(second).title()
    else:
        base_secondary = "None"

    # A single listed colour still counts as multi-coloured for these patterns
    inherently_multi = any(word in lowered for word in ("calico", "tricolor", "tortie", "torbie"))
    is_multi = 1 if (len(pieces) > 1 or inherently_multi) else 0

    return base_primary, base_secondary, is_multi, flags

# Parse the colour text once per frame ...
color_info_train = train['Color'].apply(process_color)
color_info_test = test['Color'].apply(process_color)

# ... then unpack each parsed tuple into columns, identically for both frames
_color_fields = ('Color_Primary', 'Color_Secondary', 'IsMultiColor')
for _frame, _parsed in ((train, color_info_train), (test, color_info_test)):
    for _pos, _col in enumerate(_color_fields):
        _frame[_col] = _parsed.apply(lambda item, _pos=_pos: item[_pos])
    # Element 3 of each tuple is the {pattern: 0/1} dict -> one column per pattern
    for p in patterns:
        _frame['Pattern_' + p.capitalize()] = _parsed.apply(lambda item, _key=p: item[3][_key])

# Spot-check a few parsed rows
print(train[['Color','Color_Primary','Color_Secondary','IsMultiColor','Pattern_Tabby','Pattern_Calico']].head(5))

In [None]:
def extract_city(location):
    """Pull the city name out of a free-text found-location string.

    Parameters
    ----------
    location : str
        E.g. "2501 Magin Meadow Dr in Austin (TX)", "Austin (TX)" or
        "Outside Jurisdiction".

    Returns
    -------
    str
        "Outside" when the text contains "Outside"; otherwise the text after
        the last " in " with any trailing " (..)" part removed; otherwise the
        text with a trailing "(TX)" removed; otherwise the stripped text itself.
    """
    loc = location.strip()
    if "Outside" in loc:
        return "Outside"

    marker = loc.rfind(" in ")
    if marker != -1:
        # Keep what follows the last " in ", then cut a trailing " (TX)"-style
        # suffix only if one is really there. Slicing with an unchecked
        # rfind() result of -1 would silently drop the last character of the
        # city name.
        city = loc[marker + len(" in "):]
        paren = city.rfind(" (")
        if paren != -1:
            city = city[:paren]
        return city.strip()

    # No " in ": the text is just the city, optionally followed by the state code
    if loc.endswith("(TX)"):
        return loc[:-len("(TX)")].rstrip()
    return loc

# City where each animal was found, for train and test
train['Found_City'] = train['Found Location'].apply(extract_city)
test['Found_City'] = test['Found Location'].apply(extract_city)

# Cities seen fewer than `min_threshold` times in train are pooled together
min_threshold = 100
city_counts = train['Found_City'].value_counts()
cities_to_keep = [name for name, seen in city_counts.items() if seen >= min_threshold]

def group_city(city, cities_to_keep=cities_to_keep):
    """Return `city` if it is on the keep-list, else the pooled label "Other Found City"."""
    if city in cities_to_keep:
        return city
    return "Other Found City"

# Grouped city column (keep-list derived from train, applied to both frames)
train['Found_City_Grouped'] = train['Found_City'].apply(group_city)
test['Found_City_Grouped'] = test['Found_City'].apply(group_city)

# Check how the grouping came out
print(train['Found_City_Grouped'].value_counts())

# Separate 0/1 indicator for animals found in Austin itself
train['Found_In_Austin'] = train['Found_City'].eq('Austin').astype(int)
test['Found_In_Austin'] = test['Found_City'].eq('Austin').astype(int)

# The ungrouped city is only an intermediate
train = train.drop(columns=['Found_City'])
test = test.drop(columns=['Found_City'])

In [None]:
# Split Sex upon Intake into Gender and Fixed(spayed/neutered) status
def split_sex(sex_str):
    """Split a sex-upon-intake value into (gender, fixed).

    Parameters
    ----------
    sex_str : str or missing value
        E.g. "Neutered Male", "Spayed Female", "Intact Female", "Unknown".

    Returns
    -------
    tuple of (str, str)
        gender -- "Female", "Male" or "Unknown"
        fixed  -- "Yes" (text starts with neutered/spayed), "No" (starts with
                  intact) or "Unknown"
        Missing or non-text input gives ("Unknown", "Unknown") instead of
        raising.
    """
    # None / NaN cannot be lower-cased; treat them like an explicit "Unknown".
    if not isinstance(sex_str, str):
        return "Unknown", "Unknown"

    # Strip first so leading whitespace cannot defeat the startswith checks.
    s = sex_str.strip().lower()

    if s.startswith(("neutered", "spayed")):
        fixed = "Yes"
    elif s.startswith("intact"):
        fixed = "No"
    else:
        fixed = "Unknown"

    # "female" must be tested before "male": "female" contains "male".
    if "female" in s:
        gender = "Female"
    elif "male" in s:
        gender = "Male"
    else:
        gender = "Unknown"
    return gender, fixed

# Parse the sex text once per frame, then unpack the (gender, fixed) pairs
_sex_pairs_train = train['Sex upon Intake'].apply(split_sex)
train['Gender'] = _sex_pairs_train.apply(lambda pair: pair[0])
train['Fixed'] = _sex_pairs_train.apply(lambda pair: pair[1])

_sex_pairs_test = test['Sex upon Intake'].apply(split_sex)
test['Gender'] = _sex_pairs_test.apply(lambda pair: pair[0])
test['Fixed'] = _sex_pairs_test.apply(lambda pair: pair[1])

# The combined text column has been replaced by the two columns above
train = train.drop(columns=['Sex upon Intake'])
test = test.drop(columns=['Sex upon Intake'])
print(train[['Gender','Fixed']].head(5))


In [None]:
# Target vector, taken before the column is removed from the feature frame
y = train['Outcome Type']

# Raw text / ID columns and the un-grouped parsed columns are not model inputs
columns_to_drop = [
    'Outcome Type',     # target variable
    'Breed',            # raw text breed
    'Color',            # raw text color
    'Found Location',   # raw text found location
    'Id',
    'Breed_Primary',
    'Breed_Secondary',
    'Color_Primary',
    'Color_Secondary',
]

# errors='ignore' skips any listed column a frame does not have
train_features = train.drop(columns=columns_to_drop, errors='ignore')
test_features = test.drop(columns=columns_to_drop, errors='ignore')

# Align columns: a column present in only one frame is added to the other as 0.
# The differences are sorted so the resulting column order is the same on
# every run (iterating a raw set difference is not order-stable).
train_cols = set(train_features.columns)
test_cols = set(test_features.columns)

missing_in_test = sorted(train_cols - test_cols)
for c in missing_in_test:
    test_features[c] = 0

missing_in_train = sorted(test_cols - train_cols)
for c in missing_in_train:
    train_features[c] = 0

# Same column order in both frames
test_features = test_features[train_features.columns]


# Remaining text / categorical columns get one-hot encoded
categorical_cols = train_features.select_dtypes(include=['object', 'category']).columns.tolist()

print("Categorical columns for one-hot encoding:", categorical_cols)


# Give train and test the same category list per column so both produce the
# same dummy columns.
for col in categorical_cols:
    # Missing values are excluded from the category list: pandas does not
    # accept a null category. They stay missing in the Categorical and, with
    # dummy_na=False below, become an all-zero row for that column's dummies.
    combined_cats = pd.concat([train_features[col], test_features[col]], axis=0).dropna().unique()
    train_features[col] = pd.Categorical(train_features[col], categories=combined_cats)
    test_features[col] = pd.Categorical(test_features[col], categories=combined_cats)

# Train rows first, test rows after; later cells split this back by row count
full_data = pd.concat([train_features, test_features], axis=0, ignore_index=True)
full_dummies = pd.get_dummies(full_data, columns=categorical_cols, drop_first=False, dummy_na=False) # Set dummy_na=True if NaNs should have their own column
print("Total features after one-hot:", full_dummies.shape[1])

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, classification_report
from lightgbm import LGBMClassifier
import optuna
from optuna.integration import LightGBMPruningCallback

# Median imputation. The medians are learned from the TRAINING rows only and
# then applied to every row, so no statistic of the test set leaks into the
# features. (full_dummies holds the train rows first, then the test rows.)
n_train_rows = len(train_features)
imputer = SimpleImputer(strategy='median')
imputer.fit(full_dummies.iloc[:n_train_rows])
full_dummies_imputed = imputer.transform(full_dummies)
full_dummies = pd.DataFrame(full_dummies_imputed, columns=full_dummies.columns)


# Split encoded data back into train/test
X_train_enc = full_dummies.iloc[:n_train_rows, :].copy()
X_test_enc = full_dummies.iloc[n_train_rows:, :].copy()

# Standardize features (scaler statistics come from train only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_enc)
X_test_scaled = scaler.transform(X_test_enc)


# Encode target labels to numeric indices (alphabetical class order)
classes = sorted(y.unique())
class_to_idx = {cls: idx for idx, cls in enumerate(classes)}
y_num = y.map(class_to_idx)
print("Target classes:", classes)
print("Encoded target mapping:", class_to_idx)


# Class weights: n_samples / (n_classes * class_count), so rarer classes
# receive proportionally larger weights.
class_counts = y_num.value_counts().to_dict()
num_classes = len(classes)
total_samples = len(y_num)
class_weights_map = {cls_idx: total_samples / (num_classes * count) for cls_idx, count in class_counts.items()}
class_weights_dict = class_weights_map

print("Class weights:", class_weights_dict)

# Stratified hold-out split (25%) used for evaluation and for Optuna
X_tr, X_val, y_tr, y_val = train_test_split(X_train_scaled, y_num, test_size=0.25, stratify=y_num, random_state=42)

# Optuna objective fn for hyperparam optimization
def _fixed_lgbm_params(class_weights_dict):
    """Return the LightGBM settings that are NOT tuned.

    Shared by every Optuna trial and by the final model, so the model that is
    finally trained uses exactly the configuration that was searched.
    """
    return {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': len(classes),
        'class_weight': class_weights_dict,
        # Row subsampling (`subsample`) only takes effect when bagging is
        # switched on with a non-zero frequency; without this the tuned
        # `subsample` value would be ignored.
        'subsample_freq': 1,
        'random_state': 42,
        'n_jobs': -1,
    }


# Optuna objective fn for hyperparam optimization
def objective(trial, X_tr, y_tr, X_val, y_val, class_weights_dict):
    """Train one LightGBM candidate and score it on the validation split.

    Parameters
    ----------
    trial : optuna trial
        Supplies the hyperparameter suggestions and receives pruning reports.
    X_tr, y_tr : training features / encoded labels
    X_val, y_val : validation features / encoded labels
    class_weights_dict : dict
        Class index -> weight, passed to the classifier as `class_weight`.

    Returns
    -------
    float
        1 - balanced accuracy on the validation split (lower is better,
        matching a study created with direction='minimize').
    """
    params = {
        **_fixed_lgbm_params(class_weights_dict),
        # Hyperparameters to optimize
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
    }

    model = LGBMClassifier(**params)
    # Reports the validation log-loss after each boosting round so the
    # study's pruner can stop unpromising trials early.
    pruning_callback = LightGBMPruningCallback(trial, 'multi_logloss')

    model.fit(X_tr, y_tr,
              eval_set=[(X_val, y_val)],
              eval_metric='multi_logloss',
              callbacks=[pruning_callback],
             )

    val_preds = model.predict(X_val)
    accuracy = balanced_accuracy_score(y_val, val_preds)

    return 1.0 - accuracy


# Optuna run
study_name = 'lgbm-animal-outcome-optimization'
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10, interval_steps=1)
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
sampler = optuna.samplers.TPESampler(seed=42)

study = optuna.create_study(direction='minimize',
                           study_name=study_name,
                           pruner=pruner,
                           sampler=sampler,
                           )


n_trials = 50
study.optimize(lambda trial: objective(trial, X_tr, y_tr, X_val, y_val, class_weights_dict),
               n_trials=n_trials,
               )


print("\nOptuna Finished")
print(f"Number of finished trials: {len(study.trials)}")
print("Best trial:")
best_trial = study.best_trial

# The study minimises (1 - balanced accuracy); convert back so the printed
# number matches its label.
print(f"Value (Balanced Accuracy): {1.0 - best_trial.value:.4f}")
print("Params: ")
for key, value in best_trial.params.items():
    print(f" {key}: {value}")

print("\nTraining Final Model with Best Parameters")
# Work on a copy so the parameters recorded in the study are left untouched
best_params = dict(best_trial.params)
best_params.update(_fixed_lgbm_params(class_weights_dict))

# Hold-out check: fit on the training split only and score on the validation
# split. Scoring a model that has already been trained on the validation rows
# would report an over-optimistic number.
print("\nValidation Evaluation")
holdout_model = LGBMClassifier(**best_params)
holdout_model.fit(X_tr, y_tr)
final_val_preds = holdout_model.predict(X_val)

print("Balanced Accuracy on val of final model:", balanced_accuracy_score(y_val, final_val_preds))
print("Calssification Report:")
print(classification_report(y_val, final_val_preds, target_names=classes))

# The model used for test predictions is refit on all labelled data
final_model = LGBMClassifier(**best_params)
print("Training on full training data")
final_model.fit(X_train_scaled, y_num)

In [None]:
# Final predictions
print("\nTest set Prediction:")
test_predictions_encoded = final_model.predict(X_test_scaled)

# Map the numeric class indices back to the original outcome labels
idx_to_class = {idx: cls for cls, idx in class_to_idx.items()}
test_predictions_labels = pd.Series(test_predictions_encoded).map(idx_to_class)

# Sanity checks: one prediction per test row, and every index mapped to a label
assert len(test_predictions_labels) == len(test), "prediction count does not match test rows"
assert test_predictions_labels.notna().all(), "some predicted indices have no class label"

print("Sample test predictions:")
print(test_predictions_labels.head())

# Pair ids and predictions by POSITION. Building the frame from two Series
# would align them on index labels, which silently mismatches rows whenever
# the test index is not a plain 0..n-1 range.
submission = pd.DataFrame({
    'Id': test['Id'].to_numpy(),
    'Outcome Type': test_predictions_labels.to_numpy(),
})
print("\nSubmission file:")
print(submission.head())
submission.to_csv('new_optimized_submission.csv', index=False)