In [None]:
import pandas as pd

# Read the raw training and test tables from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print("Train shape:", train.shape, " Test shape:", test.shape)
print("Train columns:", train.columns.tolist())
print("Test columns:", test.columns.tolist())

# Columns removed from train only. 'Outcome Subtype' is dropped when the
# file actually contains it.
train_only_cols = ['Name', 'Outcome Time']
if 'Outcome Subtype' in train.columns:
    train_only_cols.append('Outcome Subtype')
train = train.drop(columns=train_only_cols)

# 'Date of Birth' is removed from both frames.
train = train.drop(columns='Date of Birth')
test = test.drop(columns='Date of Birth')
print("Train columns after drop:", train.columns.tolist())
print("Test columns after drop:", test.columns.tolist())


In [None]:
# Fix missing values
# Impute the sex column in BOTH frames: split_sex() is applied to test as
# well as train later on, so test needs the same imputation as train.
train['Sex upon Intake'] = train['Sex upon Intake'].fillna('Unknown')
test['Sex upon Intake'] = test['Sex upon Intake'].fillna('Unknown')

# Training rows without an intake age are removed. Test rows are never
# dropped; only train is filtered here.
missing_age_count = train['Age upon Intake'].isna().sum()
if missing_age_count > 0:
    # .copy() makes the filtered frame independent of the original, so the
    # column assignments in later cells do not write to a slice.
    train = train[~train['Age upon Intake'].isna()].copy()
print("Remaining missing values in train:", train.isnull().sum().to_dict())


In [None]:
import re
#Age Parsing
def age_to_days(age_str):
    """Convert an age description such as "2 years" into a number of days.

    Parameters
    ----------
    age_str : str or missing value
        Text of the form "<integer> <unit>" where the unit is day, week,
        month or year (singular or plural, any letter case).

    Returns
    -------
    int or None
        The age in days, using 30 days per month and 365 days per year.
        None is returned when the value is missing, is "unknown", does not
        consist of exactly two whitespace-separated tokens, has a count that
        is not an integer, or uses an unrecognised unit.
    """
    if pd.isna(age_str):
        return None

    # str() keeps a stray non-string value from raising on .lower().
    tokens = str(age_str).lower().split()
    # "unknown" (a single token) and any other malformed text end up here.
    if len(tokens) != 2:
        return None

    count_text, unit = tokens
    try:
        count = int(count_text)
    except ValueError:
        # An unparseable count is missing data. Treating it as 0 would make
        # the animal look like a newborn.
        return None

    days_per_unit = {'day': 1, 'week': 7, 'month': 30, 'year': 365}
    factor = days_per_unit.get(unit.rstrip('s'))
    if factor is None:
        return None
    return count * factor

# Derive the numeric AgeDays feature from the free-text intake age.
for frame in (train, test):
    frame['AgeDays'] = frame['Age upon Intake'].apply(age_to_days)
print(train[['Age upon Intake','AgeDays']].head(3))

# The text column is dropped once AgeDays exists.
train = train.drop(columns='Age upon Intake')
test = test.drop(columns='Age upon Intake')


In [None]:
# Derive calendar features from the intake timestamp, identically for both
# frames.
for frame in (train, test):
    intake_ts = pd.to_datetime(frame['Intake Time'])
    frame['IntakeYear'] = intake_ts.dt.year
    frame['IntakeMonth'] = intake_ts.dt.month
    frame['IntakeHour'] = intake_ts.dt.hour
    frame['IntakeDow'] = intake_ts.dt.dayofweek  # 0=Monday
    # Saturday (5) and Sunday (6) count as the weekend.
    frame['IsWeekend'] = frame['IntakeDow'].isin([5, 6]).astype(int)

# The raw timestamp text is not kept as a feature.
train = train.drop(columns='Intake Time')
test = test.drop(columns='Intake Time')

In [None]:
import numpy as np

def process_breed(breed):
    """Split a breed description into primary breed, secondary breed and a mix flag.

    Parameters
    ----------
    breed : str
        Breed text such as "Labrador Retriever Mix" or "Pit Bull/Boxer".

    Returns
    -------
    tuple of (str, str, int)
        ``(primary, secondary, is_mix)``. ``is_mix`` is 1 when the text ended
        in "Mix" (any letter case), still contains "mix", or names more than
        one breed separated by "/". ``secondary`` is the last "/"-separated
        breed when there is one, "Unknown" for a mix with no second breed
        named, and "None" for a single, non-mix breed.
    """
    breed = breed.strip()
    # Special case: keep "Black/Tan Hound" together so its "/" is not read
    # as a breed separator.
    breed = breed.replace("Black/Tan Hound", "Black Tan Hound")

    # Remember whether a trailing "Mix" was present BEFORE removing it.
    # Checking afterwards would never see the marker.
    breed, mix_suffix_count = re.subn(r'\s+Mix$', '', breed, flags=re.IGNORECASE)
    breed = breed.strip()

    parts = breed.split("/")
    if len(parts) > 2:
        # With three or more names, everything but the last is the primary.
        parts = ["/".join(parts[:-1]), parts[-1]]
    primary = parts[0].strip()
    secondary = parts[1].strip() if len(parts) > 1 else None

    is_mix = 1 if (mix_suffix_count > 0 or "mix" in breed.lower() or len(parts) > 1) else 0
    if secondary is None:
        # A mix without a named second breed gets "Unknown"; a single,
        # non-mix breed gets "None".
        secondary = "Unknown" if is_mix else "None"
    return primary, secondary, is_mix

# Parse the breed text of each frame, then spread the resulting
# (primary, secondary, is_mix) tuples over three columns.
breed_info_train = train['Breed'].apply(process_breed)
breed_info_test = test['Breed'].apply(process_breed)

for frame, parsed in ((train, breed_info_train), (test, breed_info_test)):
    for position, column in enumerate(('Breed_Primary', 'Breed_Secondary', 'IsMix')):
        frame[column] = [record[position] for record in parsed]

# Head output
print(train[['Breed','Breed_Primary','Breed_Secondary','IsMix']].head(5))


In [None]:
def _collapse_rare(values, keep):
    """Return `values` with every entry not found in `keep` replaced by "Other"."""
    return values.where(values.isin(keep), "Other")

# Primary breeds seen at least 100 times in train keep their own category.
# The set of kept categories is decided on train and reused for test.
primary_counts = train['Breed_Primary'].value_counts()
common_primaries = set(primary_counts[primary_counts >= 100].index)
train['Breed_Primary_Cat'] = _collapse_rare(train['Breed_Primary'], common_primaries)
test['Breed_Primary_Cat'] = _collapse_rare(test['Breed_Primary'], common_primaries)

# Same rule for secondary breeds.
secondary_counts = train['Breed_Secondary'].value_counts()
common_secondaries = set(secondary_counts[secondary_counts >= 100].index)
train['Breed_Secondary_Cat'] = _collapse_rare(train['Breed_Secondary'], common_secondaries)
test['Breed_Secondary_Cat'] = _collapse_rare(test['Breed_Secondary'], common_secondaries)

print("Unique primary breed categories (grouped):", train['Breed_Primary_Cat'].nunique())
print("Unique secondary breed categories (grouped):", train['Breed_Secondary_Cat'].nunique())


In [None]:
patterns = ["tabby","brindle","tortie","torbie","calico","tricolor","merle","point","tick"]

def process_color(color):
    """Break a color description into base colors, a multi-color flag and pattern flags.

    Parameters
    ----------
    color : str
        Color text, optionally several colors separated by "/".

    Returns
    -------
    tuple of (str, str, int, dict)
        ``(primary, secondary, is_multi, pattern_flags)``. The base colors are
        title-cased with a trailing pattern word removed. ``primary`` is
        "Unknown" when the first segment is empty and ``secondary`` is "None"
        when there is no usable second segment. Only the first two
        "/"-separated segments are used. ``is_multi`` is 1 when two segments
        are present or a single segment contains calico, tricolor, tortie or
        torbie. ``pattern_flags`` maps each entry of ``patterns`` to 1 or 0
        depending on whether it occurs anywhere in the text.
    """
    normalized = color.strip().lower()

    flags = {}
    for keyword in patterns:
        flags[keyword] = int(keyword in normalized)

    def base_name(segment, fallback):
        # An empty or missing segment maps straight to the fallback label.
        if not segment:
            return fallback
        for word in ("tabby", "brindle", "merle", "point", "tick"):
            suffix = " " + word
            if segment.endswith(suffix):
                segment = segment[:len(segment) - len(suffix)]
        return segment.strip().title()

    # Anything past the second "/"-separated segment is ignored.
    segments = [chunk.strip() for chunk in normalized.split('/')][:2]
    first = segments[0]
    second = segments[1] if len(segments) == 2 else None

    if len(segments) == 2:
        is_multi = 1
    elif any(word in normalized for word in ("calico", "tricolor", "tortie", "torbie")):
        # These single-word coats are multi-colored by themselves.
        is_multi = 1
    else:
        is_multi = 0

    return base_name(first, "Unknown"), base_name(second, "None"), is_multi, flags

def _attach_color_features(frame, parsed):
    """Write the tuples produced by process_color into columns of `frame`."""
    frame['Color_Primary'] = [record[0] for record in parsed]
    frame['Color_Secondary'] = [record[1] for record in parsed]
    frame['IsMultiColor'] = [record[2] for record in parsed]
    # One 0/1 indicator column per pattern keyword.
    for keyword in patterns:
        frame['Pattern_' + keyword.capitalize()] = [record[3][keyword] for record in parsed]

# Parse the color text and attach the derived columns to each frame.
color_info_train = train['Color'].apply(process_color)
_attach_color_features(train, color_info_train)

color_info_test = test['Color'].apply(process_color)
_attach_color_features(test, color_info_test)

# Show sample of parsed color features
print(train[['Color','Color_Primary','Color_Secondary','IsMultiColor','Pattern_Tabby','Pattern_Calico']].head(5))


In [None]:
def extract_city(location):
    """Extract the city name from a found-location description.

    Parameters
    ----------
    location : str
        Location text, e.g. "2501 Magin Meadow Dr in Austin (TX)",
        "Austin (TX)" or "Outside Jurisdiction".

    Returns
    -------
    str
        "Outside" when the text contains "Outside". Otherwise the text after
        the last " in " with any trailing parenthesised part removed. If
        there is no " in ", the text with a trailing "(TX)" removed. If none
        of these apply, the stripped text unchanged.
    """
    loc = location.strip()
    if "Outside" in loc:
        return "Outside"

    in_pos = loc.rfind(" in ")
    if in_pos != -1:
        city = loc[in_pos + 4:]
        # Only cut at a " (" that follows the city. If there is none, keep
        # the whole remainder instead of slicing with -1, which would drop
        # the last character.
        paren_pos = city.rfind(" (")
        if paren_pos != -1:
            city = city[:paren_pos]
        return city.strip()

    if loc.endswith("(TX)"):
        # Strip by suffix length so "Austin(TX)" (no space) also works.
        return loc[:-len("(TX)")].rstrip()
    return loc

# Reduce each found-location description to a city name.
for frame in (train, test):
    frame['Found_City'] = frame['Found Location'].apply(extract_city)

# Cities seen at least `min_threshold` times in train keep their own
# category. The frequencies come from train only.
min_threshold = 100
city_counts = train['Found_City'].value_counts()
frequent_mask = city_counts >= min_threshold
cities_to_keep = city_counts[frequent_mask].index.tolist()

def group_city(city, cities_to_keep=cities_to_keep):
    """Return `city` when it is in `cities_to_keep`, else "Other Found City".

    The default for `cities_to_keep` is bound when this function is defined,
    so the cell must be re-run after the module-level list changes.
    """
    if city in cities_to_keep:
        return city
    return "Other Found City"

# Add the grouped city category and an Austin indicator to both frames.
for frame in (train, test):
    frame['Found_City_Grouped'] = frame['Found_City'].apply(group_city)
    frame['Found_In_Austin'] = frame['Found_City'].eq('Austin').astype(int)

print(train['Found_City_Grouped'].value_counts())

# The ungrouped city column is dropped once both features exist.
train = train.drop(columns=['Found_City'])
test = test.drop(columns=['Found_City'])

In [None]:
#Handle sex and neuter status
def split_sex(sex_str):
    """Split a combined sex description into gender and neuter status.

    Parameters
    ----------
    sex_str : str or missing value
        Text such as "Neutered Male", "Spayed Female", "Intact Male" or
        "Unknown".

    Returns
    -------
    tuple of (str, str)
        ``(gender, fixed)``. ``gender`` is "Female", "Male" or "Unknown".
        ``fixed`` is "Yes" for neutered/spayed, "No" for intact and
        "Unknown" otherwise. Missing or non-string input gives
        ("Unknown", "Unknown").
    """
    # NaN/None reach here when a frame was not imputed. Treat them as
    # unknown instead of raising on .lower().
    if not isinstance(sex_str, str):
        return "Unknown", "Unknown"

    s = sex_str.strip().lower()
    if s.startswith(("neutered", "spayed")):
        fixed = "Yes"
    elif s.startswith("intact"):
        fixed = "No"
    else:
        fixed = "Unknown"

    # "female" must be tested first because it contains "male".
    if "female" in s:
        gender = "Female"
    elif "male" in s:
        gender = "Male"
    else:
        gender = "Unknown"
    return gender, fixed

# Split the combined sex text into separate Gender and Fixed columns.
for frame in (train, test):
    sex_pairs = frame['Sex upon Intake'].apply(split_sex)
    frame['Gender'] = [gender for gender, fixed in sex_pairs]
    frame['Fixed'] = [fixed for gender, fixed in sex_pairs]

train = train.drop(columns='Sex upon Intake')
test = test.drop(columns='Sex upon Intake')
print(train[['Gender','Fixed']].head(5))

In [None]:
# Target variable: Outcome Type
y = train['Outcome Type']

# Raw text columns, the row identifier, and the ungrouped breed/color
# columns are not used as model inputs.
non_feature_cols = [
    'Breed',
    'Color',
    'Found Location',
    'Id',
    'Breed_Primary',
    'Breed_Secondary',
    'Color_Primary',
    'Color_Secondary'
]

# Drop all unnecessary columns. The target is removed from train only;
# it is not dropped from test.
columns_to_drop = ['Outcome Type'] + non_feature_cols

train_features = train.drop(columns=columns_to_drop)
test_features = test.drop(columns=non_feature_cols)


print("Remaining object columns:", train_features.select_dtypes(include=['object']).columns.tolist())

# One-hot encode train and test together so both get the same dummy
# columns. Rows are stacked train-first and split back by position below.
n_train_rows = len(train_features)
full_data = pd.concat([train_features, test_features], axis=0, ignore_index=True)
categorical_cols = [
    'Animal Type', 'Intake Type', 'Intake Condition', 'Found_City_Grouped',
    'Breed_Primary_Cat', 'Breed_Secondary_Cat', 'Gender', 'Fixed',
    'IntakeYear', 'IntakeMonth',
]
full_dummies = pd.get_dummies(full_data, columns=categorical_cols, drop_first=False)
print("Total features after one-hot:", full_dummies.shape[1])

# The first n_train_rows rows are train; the remainder is test.
X_train_enc = full_dummies.iloc[:n_train_rows, :].copy()
X_test_enc = full_dummies.iloc[n_train_rows:, :].copy()


from sklearn.preprocessing import StandardScaler
# Standardize the encoded features. The scaler is fitted on the training
# rows only and then applied to both train and test.
scaler = StandardScaler().fit(X_train_enc)
X_train_scaled = scaler.transform(X_train_enc)
X_test_scaled = scaler.transform(X_test_enc)

# The PCA step below is disabled because it gave worse balanced accuracy.
# Apply PCA (retain 95% variance)
# from sklearn.decomposition import PCA
# pca = PCA(n_components=0.95, random_state=42)
# X_train_pca = pca.fit_transform(X_train_scaled)
# X_test_pca = pca.transform(X_test_scaled)
# print("Original feature count:", X_train_enc.shape[1], 
#       "Reduced feature count:", X_train_pca.shape[1])

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, classification_report

# Map each outcome label to an integer index, with labels in sorted order.
classes = sorted(y.unique())
class_to_idx = dict(zip(classes, range(len(classes))))
y_num = y.map(class_to_idx)

# Per-class weight = total_samples / (num_classes * class_count), so a
# class with fewer rows gets a larger weight.
class_counts = y_num.value_counts().to_dict()
num_classes = len(classes)
total_samples = len(y_num)
class_weights = {}
for cls_idx, count in class_counts.items():
    class_weights[cls_idx] = total_samples/(num_classes*count)
print("Class weights:", class_weights)


# Hold out 20% of the training rows, stratified by class, for validation.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_scaled,
    y_num,
    test_size=0.2,
    stratify=y_num,
    random_state=42,
)

# Fit a class-weighted LightGBM model and track log-loss on the held-out rows.
model = LGBMClassifier(
    n_estimators=200,
    class_weight=class_weights,
    random_state=42,
)
model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], eval_metric='multi_logloss')

# Report validation metrics.
val_preds = model.predict(X_val)
val_balanced_accuracy = balanced_accuracy_score(y_val, val_preds)
print("Balanced Accuracy on val:", val_balanced_accuracy)
print("Validation classification report:")
print(classification_report(y_val, val_preds, target_names=classes))

In [None]:
# Refit on every training row using the scaled features (X_train_scaled,
# not the disabled PCA output). This model produces the submission.
final_model = LGBMClassifier(
    n_estimators=300,
    class_weight=class_weights,
    random_state=42,
)
final_model.fit(X_train_scaled, y_num)

In [None]:

# Predict class indices for the test rows.
test_pred_idx = final_model.predict(X_test_scaled)

# Translate the integer indices back to the original outcome labels.
idx_to_class = {position: label for label, position in class_to_idx.items()}
test_pred_labels = list(map(idx_to_class.__getitem__, test_pred_idx))

# Build the submission table and write it to disk.
output = pd.DataFrame({'Id': test['Id'], 'Outcome Type': test_pred_labels})
print(output.head(5))

output.to_csv('next_submission.csv', index=False)