# Case Studies in Machine Learning Final Paper

# Import Data from data.austintexas.gov
### AAC Outcomes: https://data.austintexas.gov/Health-and-Community-Services/Austin-Animal-Center-Outcomes/9t4d-g238/about_data
### AAC Intakes: https://data.austintexas.gov/Health-and-Community-Services/Austin-Animal-Center-Intakes/wter-evkm/about_data

# Load the data, merge the intake and outcome tables, then drop duplicated columns

In [None]:
from datetime import date, datetime
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Download date of the dataset snapshot used throughout the notebook
download_date = date(2024, 11, 5)

# The same date as a YYYYMMDD string, used inside the data file names
download_datestr = download_date.strftime('%Y%m%d')


def load_animal_data(tabletype='processed', download_datestr=download_datestr):
    """Load the Austin Animal Center data at one of three processing stages.

    Parameters
    ----------
    tabletype : str
        'raw'       -- read the intake and outcome CSV files, inner-join them on
                       "Animal ID", collapse the columns that agree in both
                       tables, write the joined pickle and return the frame.
        'joined'    -- read the joined pickle written by the 'raw' stage.
        'processed' -- read the pickle whose name ends in ``_processed.pkl``.
    download_datestr : str
        Date string (YYYYMMDD) embedded in every file name under ``data/``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``tabletype`` is not one of the three values above.
    """
    data_dir = 'data'
    joined_path = os.path.join(data_dir, f'Austin_Animal_Center_Joined_{download_datestr}.pkl')

    if tabletype == 'raw':
        # Insert date string in YYYYMMDD format into the filename
        outcomes_filename = os.path.join(data_dir, f'Austin_Animal_Center_Outcomes_{download_datestr}.csv')
        intakes_filename = os.path.join(data_dir, f'Austin_Animal_Center_Intakes_{download_datestr}.csv')

        df_outcomes = pd.read_csv(outcomes_filename)
        df_intakes = pd.read_csv(intakes_filename)

        # Join dataframes on Animal ID; columns present in both tables get a suffix
        pd.options.display.max_columns = 50
        df = pd.merge(df_intakes, df_outcomes, on=["Animal ID"], suffixes=('_intake', '_outcome'))

        # Collapse columns that carry the same information in both tables
        for pref in ("Name", "Animal Type", "Breed", "Color"):
            intake_col = df[pref + "_intake"]
            outcome_col = df[pref + "_outcome"]
            # Compare only rows where both sides have a value. Dropping NaNs from
            # each side separately leaves differently-labelled Series, which
            # pandas refuses to compare.
            both_present = intake_col.notna() & outcome_col.notna()
            if (intake_col[both_present] == outcome_col[both_present]).all():
                df[pref] = intake_col
                df = df.drop(columns=[pref + "_intake", pref + "_outcome"])

        df.to_pickle(joined_path)
        return df

    # NOTE: pickle files can execute code when loaded; only read files produced
    # by this notebook.
    if tabletype == 'joined':
        return pd.read_pickle(joined_path)

    if tabletype == 'processed':
        processed_path = os.path.join(data_dir, f'Austin_Animal_Center_Joined_{download_datestr}_processed.pkl')
        return pd.read_pickle(processed_path)

    raise ValueError(
        f"Unknown tabletype {tabletype!r}; expected 'raw', 'joined' or 'processed'"
    )

# Build the joined table from the raw CSV exports, then preview the first rows
df_joined = load_animal_data('raw', download_datestr)
display(df_joined.head(n=5))

# Feature engineering

### Duration of Stay in Shelter

In [None]:
# Calculate the duration between intake and outcome
timestamp_format = "%m/%d/%Y %I:%M:%S %p"
df_joined["DateTime_outcome"] = pd.to_datetime(df_joined["DateTime_outcome"], format=timestamp_format)
df_joined["DateTime_intake"] = pd.to_datetime(df_joined["DateTime_intake"], format=timestamp_format)
df_joined["duration_in_shelter"] = df_joined["DateTime_outcome"] - df_joined["DateTime_intake"]
df_joined['intake_month'] = df_joined['DateTime_intake'].dt.month
df_joined['intake_year'] = df_joined['DateTime_intake'].dt.year

# Drop rows with missing values in these columns, and only keep rows whose
# intake sex mentions Male or Female
df_joined = df_joined.dropna(axis=0, subset=["Sex upon Outcome", "Sex upon Intake", "Outcome Type", "Age upon Outcome"])
# .copy() makes the filtered frame independent, so the column assignments
# below never write into a slice of the previous frame
df_joined = df_joined.loc[df_joined["Sex upon Intake"].str.contains("Male|Female")].copy()

# Treat an intake condition of Other as Unknown
df_joined["Intake Condition"] = df_joined["Intake Condition"].replace("Other", "Unknown")

# Convert Age upon Intake from strings of the form "X <unit>" to numeric years
age_strings = df_joined['Age upon Intake'].str.split(' ', expand=True)
age_strings.columns = ['age', 'unit']
# Cast to float up front: the conversion produces fractional years, which an
# integer column cannot hold
age_value = pd.to_numeric(age_strings['age']).astype(float)
age_unit = age_strings['unit'].str.replace('s', '', regex=False)
units_per_year = {'month': 12, 'week': 52, 'day': 365}
# Any unit not listed above is left undivided (divisor 1)
age_divisor = age_unit.map(units_per_year).fillna(1)
df_joined['age_upon_intake_years'] = age_value / age_divisor

# Replace missing names with the placeholder 'Unknown'
df_joined['Name'] = df_joined['Name'].fillna('Unknown')

# Binary flag: fixed if either the intake or the outcome record says Neutered/Spayed
fixed_at_intake = df_joined['Sex upon Intake'].str.contains('Neutered|Spayed')
fixed_at_outcome = df_joined['Sex upon Outcome'].str.contains('Neutered|Spayed')
df_joined['fixed'] = fixed_at_intake | fixed_at_outcome

# True means female, False means male
df_joined['sex'] = df_joined['Sex upon Intake'].str.contains('Female')

# Remove rows whose outcome precedes the intake (negative duration); this is a
# data quality issue in the joined table
has_negative_duration = df_joined['duration_in_shelter'] < pd.Timedelta(0)
df_joined = df_joined.drop(df_joined.loc[has_negative_duration].index)

# Display unique values of the categorical variables
categorical_columns = ["Intake Type", "Intake Condition", "Outcome Type", "Outcome Subtype", "Animal Type"]

for col in categorical_columns:
    print(f"Unique values for {col}: {df_joined[col].unique()}")
    print("\n")

# Display the number of missing values in each column
print(df_joined.isna().sum())
display(df_joined.head())

In [None]:
import seaborn as sns
from datetime import timedelta



df_joined["MonthYear_intake_datetime"] = [datetime(year=y,month=m,day=1) for [y,m] in zip(df_joined['DateTime_intake'].dt.year, df_joined['DateTime_intake'].dt.month)]
# df_joined['MonthYear_intake_datetime'].dt.date.value_counts().sort_index().plot(kind='line',color='b')

df_joined["MonthYear_outcome_datetime"] = [datetime(year=y,month=m,day=1) for [y,m] in zip(df_joined['DateTime_outcome'].dt.year, df_joined['DateTime_outcome'].dt.month)]
# df_joined['MonthYear_outcome_datetime'].dt.date.value_counts().sort_index().plot(kind='line',color='b',linestyle='--')



%matplotlib inline
%reload_ext autoreload
%autoreload 2

sns.set(style="darkgrid")
fgh = plt.figure(figsize=(20,4))
sns.histplot(data=df_joined, x='MonthYear_intake_datetime', hue='Animal Type', element='poly', fill=False, binwidth=30)
sns.histplot(data=df_joined, x='MonthYear_outcome_datetime', hue='Animal Type', element='poly', fill=False, linestyle='--', binwidth=30)

fgh.savefig('Intake_Outcome_AnimalType_Timeline.png')

def func(pct, allvals):
    """Format a pie-wedge label as a percentage plus the absolute count.

    Parameters
    ----------
    pct : float
        Wedge size in percent, as passed by matplotlib's ``autopct`` hook.
    allvals : array-like
        All values shown in the pie; their sum is the 100% reference.

    Returns
    -------
    str
        Two lines: the percentage with one decimal, then the rounded count in
        parentheses.
    """
    absolute = int(np.round(pct / 100. * np.sum(allvals)))
    # The values are counts of animals, so no unit suffix is attached
    return f"{pct:.1f}%\n({absolute:d})"


def _count_pie(ax, counts, labels, legend_title, title):
    """Draw one pie of ``counts`` on ``ax`` with a side legend and bold wedge labels."""
    wedges, _texts, autotexts = ax.pie(counts, autopct=lambda pct: func(pct, counts), textprops=dict(color="w"))
    ax.legend(wedges, labels, title=legend_title, loc="center left", bbox_to_anchor=(0.9, 0, 0.5, 1))
    plt.setp(autotexts, size=12, weight="bold")
    ax.set_title(title, fontsize=20)


fgh, axs = plt.subplots(1, 2, figsize=(16, 8))

# Left: share of every outcome type
data = df_joined['Outcome Type'].value_counts()
labels = data.index.to_list()
_count_pie(axs[0], data, labels, "Outcome Type", "Outcome Types")

# Right: adoption versus everything else
df_joined['Adoption_Boolean'] = df_joined['Outcome Type'] == 'Adoption'
data = df_joined['Adoption_Boolean'].value_counts()
# value_counts orders by frequency, so derive each label from the index value
# instead of assuming that True comes first
labels = ['Adoption' if is_adoption else 'Not Adoption' for is_adoption in data.index]
_count_pie(axs[1], data, labels, "Outcome", "Positive vs Negative outcome")

plt.show()

fgh.savefig('OutcomeType_Pie.png')


In [None]:
# Bar chart of how often each outcome type occurs
outcome_type_counts = df_joined['Outcome Type'].value_counts()
outcome_type_counts.plot(kind='bar', color='b')

In [None]:
def _duration_violin(frame, hue, out_path, max_days=200):
    """Violin plot of shelter duration per animal type, split by ``hue``.

    Only rows of ``frame`` with a duration of at most ``max_days`` days are
    drawn. Each x tick label is extended with the count and fraction of that
    animal type in the full ``df_joined`` table. The figure is saved to
    ``out_path``.

    Returns the figure, the axes and the list of new tick label strings.
    """
    # Build the mask from the frame being plotted, so the mask and the data
    # always share the same index
    short_stays = frame.loc[frame['duration_in_shelter'] <= timedelta(days=max_days)]

    fig = plt.figure(figsize=(16, 8))
    axis = sns.violinplot(data=short_stays, x='Animal Type', y='duration_in_shelter', hue=hue, palette='deep')
    axis.set_title('Duration In Shelter Violin Plots', fontsize=20)

    tick_labels = axis.get_xticklabels()
    new_labels = []
    for tick in tick_labels:
        is_type = df_joined['Animal Type'] == tick.get_text()
        tick.set_text(f"{tick.get_text()}(Count={np.sum(is_type)} : Pct={np.mean(is_type):.2f})")
        new_labels.append(tick.get_text())
    axis.set_xticklabels(tick_labels)

    fig.savefig(out_path)
    return fig, axis, new_labels


# Duration by intake type
df_joined_tmp = df_joined.loc[df_joined['Intake Type'].isin(['Stray', 'Owner Surrender', 'Public Assist','Abandoned'])]
fgh, ax, xticks_new = _duration_violin(df_joined_tmp, 'Intake Type', 'DurationInShelter_IntakeType_Violin.png')

# Duration by outcome type
df_joined_tmp = df_joined.loc[df_joined['Outcome Type'].isin(['Adoption', 'Transfer', 'Return to Owner','Euthanasia'])]
fgh, ax, xticks_new = _duration_violin(df_joined_tmp, 'Outcome Type', 'DurationInShelter_OutcomeType_Violin.png')


In [None]:
# Show the current contents of xticks_new, which is defined in the violin-plot cell
xticks_new

In [None]:
# Monthly cat intakes (solid) and outcomes (dashed), once per grouping column
df_joined_cats = df_joined.loc[df_joined['Animal Type'] == 'Cat']
cat_figures = (
    ('Intake Type', 'CatIntakeTypes_histplot.png'),
    ('Outcome Type', 'CatOutcomeTypes_histplot.png'),
)
for hue_column, png_name in cat_figures:
    fgh = plt.figure(figsize=(20,4))
    sns.histplot(data=df_joined_cats, x='MonthYear_intake_datetime', hue=hue_column, element='poly', fill=False, binwidth=30)
    sns.histplot(data=df_joined_cats, x='MonthYear_outcome_datetime', hue=hue_column, element='poly', fill=False, linestyle='--', binwidth=30)
    fgh.savefig(png_name)

In [None]:
# Bar chart of outcome type counts
plt.figure()
outcome_type_counts = df_joined['Outcome Type'].value_counts()
outcome_type_counts.plot(kind='bar')
plt.title('Outcome Type Counts')

# Bar chart of adoption versus any other outcome
df_joined['Adoption_Boolean'] = df_joined['Outcome Type'] == 'Adoption'
plt.figure()
adoption_counts = df_joined['Adoption_Boolean'].value_counts()
adoption_counts.plot(kind='bar')
plt.title('Adoption Successes')

# Monthly intake (solid) and outcome (dashed) counts for all animals
fgh = plt.figure(figsize=(20,4))
intake_stamps = df_joined['DateTime_intake']
df_joined["MonthYear_intake_datetime"] = [
    datetime(year=yr, month=mo, day=1)
    for yr, mo in zip(intake_stamps.dt.year, intake_stamps.dt.month)
]
df_joined['MonthYear_intake_datetime'].dt.date.value_counts().sort_index().plot(kind='line', color='b')
ax = fgh.axes

outcome_stamps = df_joined['DateTime_outcome']
df_joined["MonthYear_outcome_datetime"] = [
    datetime(year=yr, month=mo, day=1)
    for yr, mo in zip(outcome_stamps.dt.year, outcome_stamps.dt.month)
]
df_joined['MonthYear_outcome_datetime'].dt.date.value_counts().sort_index().plot(kind='line', color='b', linestyle='--')


# The same two curves per animal type, one colour per type
animal_types = df_joined['Animal Type'].unique()
linecolors = 'rgcmk'
month_layers = (
    ('MonthYear_intake_datetime', {}),
    ('MonthYear_outcome_datetime', {'linestyle': '--'}),
)
for at, clr in zip(animal_types, linecolors):
    of_this_type = df_joined['Animal Type'] == at
    for month_column, line_style in month_layers:
        monthly = df_joined[month_column].loc[of_this_type].dt.date.value_counts().sort_index()
        monthly.plot(kind='line', color=clr, **line_style)



In [None]:
# Look at cat seasonality: monthly intake (solid) and outcome (dashed) counts
# for cats, one colour per outcome type
fgh = plt.figure(figsize=(20,4))
# Select cats, matching the heading and the variable name
df_joined_cats = df_joined.loc[df_joined['Animal Type'] == 'Cat']

outtypes = df_joined_cats['Outcome Type'].unique()
linecolors = 'rgcmkyb'
# NOTE: zip stops at the shorter input, so only as many outcome types as there
# are colours in `linecolors` are drawn
for outcome_type, clr in zip(outtypes, linecolors):
    of_this_outcome = df_joined_cats['Outcome Type'] == outcome_type
    intake_per_month = df_joined_cats['MonthYear_intake_datetime'].loc[of_this_outcome].value_counts().sort_index()
    outcome_per_month = df_joined_cats['MonthYear_outcome_datetime'].loc[of_this_outcome].value_counts().sort_index()
    intake_per_month.plot(kind='line', color=clr, label=f'{outcome_type} (intake)')
    outcome_per_month.plot(kind='line', color=clr, linestyle='--', label=f'{outcome_type} (outcome)')

plt.legend()
print(outtypes)


In [None]:
# Inspect the joined table (the original cell referenced the undefined name `df_join`)
df_joined

## Sex and Fixed status as well as age upon intake

In [None]:
# Histograms of days spent in the shelter for adopted animals:
# top = all stays, bottom = stays of at most 50 days
fgh, axes = plt.subplots(nrows=2, ncols=1)
axes = axes.flatten()
df_joineda = df_joined.loc[df_joined['Outcome Type'] == 'Adoption']
adoption_durations = df_joineda['duration_in_shelter']
axes[0].hist(adoption_durations.dt.days, bins=200)
at_most_50_days = adoption_durations.loc[adoption_durations.dt.days <= 50]
axes[1].hist(at_most_50_days.dt.days, bins=200)

In [None]:
# Extract the month and year from the intake date


## Color in RGB

In [None]:
# Process Colors
def process_color(color):
    """Split a colour description into lowercase word tokens.

    Slashes are treated like spaces, so 'Black/White' and 'Black White' give
    the same tokens. Values that are not strings (for example NaN for a
    missing colour) yield an empty list instead of raising.

    Parameters
    ----------
    color : str or any
        Raw colour description.

    Returns
    -------
    list of str
        The lowercase tokens, without empty strings.
    """
    if not isinstance(color, str):
        return []
    # split() without an argument collapses runs of whitespace, so inputs such
    # as 'Black / White' do not produce empty tokens
    return color.lower().replace('/', ' ').split()

def color_dict():
    """Return a new mapping from colour word to its [R, G, B] triplet (0-255)."""
    named_rgb = (
        ('buff', (218, 160, 109)),
        ('white', (255, 255, 255)),
        ('fawn', (196, 164, 132)),
        ('apricot', (251, 206, 177)),
        ('black', (0, 0, 0)),
        ('silver', (192, 192, 192)),
        ('brindle', (130, 119, 107)),
        ('pink', (255, 192, 203)),
        ('torbie', (255, 184, 90)),
        ('lynx', (162, 113, 80)),
        ('sable', (110, 64, 60)),
        ('gray', (128, 128, 128)),
        ('blue', (0, 0, 255)),
        ('calico', (210, 170, 133)),
        ('red', (255, 0, 0)),
        ('flame', (226, 88, 34)),
        ('liver', (83, 75, 79)),
        ('lilac', (200, 162, 200)),
        ('chocolate', (123, 63, 0)),
        ('yellow', (255, 255, 0)),
        ('cream', (255, 253, 208)),
        ('orange', (255, 165, 0)),
        ('tiger', (203, 113, 25)),
        ('gold', (255, 215, 0)),
        ('tan', (210, 180, 140)),
        ('brown', (165, 42, 42)),
        ('ruddy', (255, 0, 40)),
        ('seal', (50, 20, 20)),
        ('green', (0, 255, 0)),
        ('smoke', (132, 136, 132)),
    )
    # Fresh lists on every call, so callers may modify the result freely
    return {name: list(rgb) for name, rgb in named_rgb}
    
def map_colors(colors, color_dict):
    """Average the RGB triplets of all recognised colour words.

    Parameters
    ----------
    colors : iterable of str
        Colour tokens; tokens that are not keys of ``color_dict`` are ignored.
    color_dict : dict
        Mapping from colour word to an [R, G, B] triplet.

    Returns
    -------
    numpy.ndarray or list
        The element-wise mean of the matched triplets, or a list of three NaNs
        when no token matched.
    """
    matched = [color_dict[token] for token in colors if token in color_dict]
    if matched:
        return np.mean(matched, axis=0)
    return [np.nan, np.nan, np.nan]
            
    
# Tokenise every colour description, average the RGB values of the known
# colour words, and append the result as three numeric columns
color_tokens = df_joined['Color'].apply(process_color)
rgb_means = color_tokens.apply(map_colors, color_dict=color_dict())
rgb_frame = pd.DataFrame(
    rgb_means.to_list(),
    columns=['Color_R', 'Color_G', 'Color_B'],
    index=df_joined.index,
)
df_joined = pd.concat([df_joined, rgb_frame], axis=1)

## Location found

In [None]:
from transformers import BertTokenizer, BertModel
import pickle
import time
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def load_bert_models(LOAD_FROM_FILES=True):
    """Return the BERT tokenizer and model, from local pickles or from the hub.

    Parameters
    ----------
    LOAD_FROM_FILES : bool
        If True, unpickle ``bert_tokenizer.pkl`` and ``bert_model.pkl`` from the
        working directory. If False, load 'bert-base-uncased' with
        ``from_pretrained`` and write both objects to those two pickle files.

    Returns
    -------
    tuple
        ``(bert_tokenizer, bert_model)``
    """
    tokenizer_path = 'bert_tokenizer.pkl'
    model_path = 'bert_model.pkl'

    if LOAD_FROM_FILES:
        # NOTE: unpickling can execute arbitrary code; only load pickle files
        # written by this function.
        with open(tokenizer_path, 'rb') as f:
            bert_tokenizer = pickle.load(f)
        with open(model_path, 'rb') as f:
            bert_model = pickle.load(f)
    else:
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        bert_model = BertModel.from_pretrained('bert-base-uncased')
        with open(tokenizer_path, 'wb') as f:
            pickle.dump(bert_tokenizer, f)
        # Write to the same path the loading branch reads from
        with open(model_path, 'wb') as f:
            pickle.dump(bert_model, f)

    return (bert_tokenizer, bert_model)

def get_bert_embeddings(bert_tokenizer, bert_model, text, batch_size=10000):
    """Embed each text as the mean of its token vectors, processed in batches.

    Parameters
    ----------
    bert_tokenizer : callable
        Called as ``bert_tokenizer(batch, return_tensors='pt', padding=True)``;
        its result must provide an ``'attention_mask'`` entry and be usable as
        keyword arguments for ``bert_model``.
    bert_model : callable
        Its output must expose ``last_hidden_state``.
    text : sequence of str
        Texts to embed; sliced into chunks of ``batch_size``.
    batch_size : int
        Number of texts per forward pass.

    Returns
    -------
    torch.Tensor
        One row per text, in input order.
    """
    embeds_list = []
    time_list = []
    for i in range(0, len(text), batch_size):
        start = time.time()
        text_batch = text[i:i+batch_size]
        bert_inputs = bert_tokenizer(text_batch, return_tensors='pt', padding=True)

        # Calculate the bert outputs
        with torch.no_grad():
            bert_outputs = bert_model(**bert_inputs)

        # Final embedding vector is the average of the real token vectors.
        # Padded positions are masked out, so a text's embedding does not
        # depend on the longest text that shares its batch.
        hidden = bert_outputs.last_hidden_state
        token_mask = bert_inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = token_mask.sum(dim=1).clamp(min=1)
        avg_hidden_state = (hidden * token_mask).sum(dim=1) / token_counts

        embeds_list.append(avg_hidden_state)
        end = time.time()
        time_list.append(end-start)

        # Size estimate assumes 4 bytes per stored value
        embeds_size = np.sum([np.prod(x.shape) for x in embeds_list])*4/(1024**3)
        print(f"Processed {i + len(text_batch)}/{len(text)} texts. Elapsed Time = {np.sum(time_list)}s. Avg Time = {np.mean(time_list)}. Embeddings Size = {embeds_size} GB",end='\r')

    embeds_tensor = torch.vstack(embeds_list)
    return(embeds_tensor)

def pca_reduction(embeddings, var_threshold=0.90):
    """Standardise ``embeddings`` and keep the leading principal components.

    The number of components kept is the smallest one whose cumulative
    explained-variance ratio reaches ``var_threshold``; at least one component
    is always kept.

    Parameters
    ----------
    embeddings : array-like
        Passed to ``StandardScaler.fit_transform``.
    var_threshold : float
        Target cumulative explained-variance ratio.

    Returns
    -------
    numpy.ndarray
        The PCA-transformed data restricted to the kept components.
    """
    scaled = StandardScaler().fit_transform(embeddings)

    pca = PCA()
    embeddings_pca = pca.fit_transform(scaled)

    cumulative = np.cumsum(pca.explained_variance_ratio_)
    # Index of the first component at which the threshold is reached; +1 turns
    # it into a count, so that component is included. A strict `<` mask would
    # stop one component short of the target.
    n_keep = int(np.searchsorted(cumulative, var_threshold)) + 1
    # Guard against a threshold the cumulative sum never reaches
    n_keep = min(n_keep, embeddings_pca.shape[1])
    return(embeddings_pca[:, :n_keep])


In [None]:
bert_tokenizer, bert_model = load_bert_models(LOAD_FROM_FILES=True)
# df_processed = pd.read_pickle(os.path.join('data', f'Austin_Animal_Center_Joined_{download_datestr}_processed.pkl'))

# Process Breed: flag mixed breeds, then embed the lowercase breed text with
# the ' Mix' marker removed
df_joined['Breed_mixed'] = df_joined['Breed'].str.endswith(' Mix')
# regex=False: the pattern is a literal string on every pandas version
df_joined['Breed_processed'] = df_joined['Breed'].str.replace(' Mix', '', regex=False)
df_joined['Breed_processed'] = df_joined['Breed_processed'].str.lower()
breed_list = df_joined['Breed_processed'].to_list()
breed_embeddings = get_bert_embeddings(bert_tokenizer, bert_model, breed_list, batch_size=1000)
# breed_embeddings = pca_reduction(breed_embeddings, var_threshold=0.9)

# Process Name: lowercase and strip asterisks before embedding
df_joined['Name_processed'] = df_joined['Name'].str.lower()
# regex=False is required here: '*' alone is not a valid regular expression
df_joined['Name_processed'] = df_joined['Name_processed'].str.replace('*', '', regex=False)
name_list = df_joined['Name_processed'].to_list()
name_embeddings = get_bert_embeddings(bert_tokenizer, bert_model, name_list, batch_size=1000)
# name_embeddings = pca_reduction(name_embeddings, var_threshold=0.9)

In [None]:
# Wrap both embedding tensors in DataFrames that share df_joined's index, with
# one numbered column per embedding dimension
name_columns = [f'name_embedding_{i}' for i in range(name_embeddings.shape[1])]
name_embeddings = pd.DataFrame(
    name_embeddings.numpy(),
    index=df_joined.index,
    columns=name_columns,
)

breed_columns = [f'breed_embedding_{i}' for i in range(breed_embeddings.shape[1])]
breed_embeddings = pd.DataFrame(
    breed_embeddings.numpy(),
    index=df_joined.index,
    columns=breed_columns,
)

In [None]:
# Attach the name and breed embedding columns (joined on the shared index)
df_joined = df_joined.join(name_embeddings)
df_joined = df_joined.join(breed_embeddings)

# Save to pickle. The file name uses the YYYYMMDD string (download_datestr),
# the same name the cells that read the processed table look for.
df_joined.to_pickle(os.path.join('data', f'Austin_Animal_Center_Joined_{download_datestr}_processed.pkl'))

In [None]:
# Plot the cumulative variance explained by the PCA components of the name and
# breed embeddings stored in the processed table
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

ndim_bert = 768
name_embedding_cols = [f'name_embedding_{i}' for i in range(ndim_bert)]
breed_embedding_cols = [f'breed_embedding_{i}' for i in range(ndim_bert)]
df_processed = pd.read_pickle(os.path.join('data', f'Austin_Animal_Center_Joined_{download_datestr}_processed.pkl'))
sc = StandardScaler()

# Standardise each embedding block; fit_transform refits the scaler each time
embeddings_name = df_processed[name_embedding_cols].values
embeddings_breed = df_processed[breed_embedding_cols].values
embeddings_name_norm = sc.fit_transform(embeddings_name)
embeddings_breed_norm = sc.fit_transform(embeddings_breed)

pca_name = PCA()
pca_breed = PCA()
embeddings_name_pca = pca_name.fit_transform(embeddings_name_norm)
embeddings_breed_pca = pca_breed.fit_transform(embeddings_breed_norm)

# embeddings_pca = embeddings_pca[:,np.cumsum(pca.explained_variance_ratio_) < var_threshold]

# One cumulative curve per embedding type
plt.figure()
plt.plot(np.cumsum(pca_name.explained_variance_ratio_))
plt.plot(np.cumsum(pca_breed.explained_variance_ratio_))

# Plot horizontal dashed lines at a y value of 0.9 and 0.95
plt.axhline(y=0.9, color='gray', linestyle='--')
plt.axhline(y=0.95, color='r', linestyle='--')
plt.xlabel('Number of Components')
plt.ylabel('Variance Explained')
plt.legend(['Name','Breed','90% Explained','95% Explained'])
# Full file name with an explicit extension, like the other saved figures
plt.savefig('PCA_variance_explained_name_breed.png')


In [None]:
# from geopy.geocoders import Nominatim

# geolocator = Nominatim(user_agent="csmlFinalProject")

# # Get the Austin Animal Shelter Location
# getLoc = geolocator.geocode("Austin, Texas")

# # printing address
# print(getLoc.address)

# # printing latitude and longitude
# print("Latitude = ", getLoc.latitude)
# print("Longitude = ", getLoc.longitude)

# for addy in df_joined['Found Location'].iloc[0:50]:
#     print(addy)
#     # getLoc = geolocator.geocode(addy)
#     # print(getLoc.address)

# address_list = []
# for addy in df_joined['Found Location'].str.split(' '):    
#     city_str = addy[-2]
#     state_str = addy[-1][1:3]
#     addr_str = addy[0:-3]
    
#     # First try the address type of string

#     loc = geolocator.geocode()

# street, city, county, state, country, or postalcode

# df_joined['city_found'] = city_str
# print(df_joined['city_found'].unique())


# df_joined['state_found'] = state_str
# df_joined['state_found'] = df_joined['state_found'].replace('ur', 'Outside State')
# print(df_joined['state_found'].unique())

# for addy in addys:
#     for word in addy:        
#         if word.lower() == 'and':
#             print(addy)


In [None]:
# Run Cross validation on Adoption classification performance
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Drop columns that are not needed for classification

In [None]:
# Number of intakes per calendar month, printed in month order
intake_months = df_joined['intake_month']
intake_month_counts = intake_months.value_counts(sort=False)
print(intake_month_counts.sort_index(ascending=True))


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
ndim_bert = 768
name_embedding_cols = [f'name_embedding_{i}' for i in range(ndim_bert)]
breed_embedding_cols = [f'breed_embedding_{i}' for i in range(ndim_bert)]
# NOTE(review): 'city_found' and 'state_found' are only assigned in
# commented-out code in this notebook -- confirm they exist before fitting.
# NOTE(review): the list holds raw string/datetime columns and 'Outcome Type'
# itself; these are passed through unchanged to the SVC -- confirm the intended
# feature set and target.
standard_features = ['Intake Type', 'Intake Condition', 'Sex upon Intake', 'Age upon Intake',
       'DateTime_outcome', 'Date of Birth', 'Outcome Type', 'Outcome Subtype',
       'Sex upon Outcome', 'Age upon Outcome', 'Name', 'Animal Type', 'Breed',
       'Color', 'duration_in_shelter', 'age_upon_intake_years', 'fixed', 'sex',
       'Color_R', 'Color_G', 'Color_B', 'city_found', 'state_found',
       'Breed_mixed']

categorical_features = []
# my_features = 

# Different Pipelines
# 1) Basic Pipeline with all categorical features
# 2) Basic Pipeline with one-hot-encoded categorical features
# 3) Pipeline with categorical and embedded/engineered features
# 4) Pipeline with one-hot-encoded and embedded/engineered features

# Scale then PCA-reduce each embedding block, pass the remaining features
# through unchanged, and feed everything to an SVC. An earlier two-stage
# definition of `pipe` was removed: it was overwritten by this one before use,
# and its second ColumnTransformer selected columns by name from the output of
# the first.
pipe_embeds = Pipeline([("scaler", StandardScaler()), ("pca", PCA())])
pipe = Pipeline(
    [
        ('ct',
            ColumnTransformer(
             [("name_feats", pipe_embeds, name_embedding_cols),
              ("breed_feats", pipe_embeds, breed_embedding_cols),
             ("pass", "passthrough", standard_features)])
        ),
         ('svc', SVC())
    ]
)

In [None]:
# Display the pipeline object built in the previous cell
pipe


In [None]:
import numpy as np
from matplotlib import pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.svm import SVC

# How many times the comparison is repeated, each with a different CV seed
NUM_TRIALS = 30

# Iris features and labels
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

# Hyper-parameter grid searched by GridSearchCV
p_grid = {"C": [1, 10, 100], "gamma": [0.01, 0.1]}

# Support Vector Classifier with an "rbf" kernel
svm = SVC(kernel="rbf")

# One score per trial for each evaluation scheme
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

for i in range(NUM_TRIALS):
    # Inner and outer splitters are built with the same seed for this trial
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)

    # Non-nested: the grid search's own best CV score is reported directly
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=outer_cv)
    clf.fit(X_iris, y_iris)
    non_nested_scores[i] = clf.best_score_

    # Nested: the grid search (inner folds) is itself cross-validated on the
    # outer folds
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv)
    nested_score = cross_val_score(clf, X=X_iris, y=y_iris, cv=outer_cv)
    nested_scores[i] = nested_score.mean()

score_difference = non_nested_scores - nested_scores

mean_difference = score_difference.mean()
std_difference = score_difference.std()
print(
    "Average difference of {:6f} with std. dev. of {:6f}.".format(
        mean_difference, std_difference
    )
)

# Top panel: per-trial scores of both evaluation schemes
plt.figure()
plt.subplot(2, 1, 1)
non_nested_scores_line = plt.plot(non_nested_scores, color="r")[0]
nested_line = plt.plot(nested_scores, color="b")[0]
plt.ylabel("score", fontsize="14")
score_handles = [non_nested_scores_line, nested_line]
score_labels = ["Non-Nested CV", "Nested CV"]
plt.legend(score_handles, score_labels, bbox_to_anchor=(0, 0.4, 0.5, 0))
plt.title(
    "Non-Nested and Nested Cross Validation on Iris Dataset",
    x=0.5,
    y=1.1,
    fontsize="15",
)

# Bottom panel: bar chart of the per-trial score difference
plt.subplot(2, 1, 2)
trial_numbers = range(NUM_TRIALS)
difference_plot = plt.bar(trial_numbers, score_difference)
plt.xlabel("Individual Trial #")
plt.legend(
    [difference_plot],
    ["Non-Nested CV - Nested CV Score"],
    bbox_to_anchor=(0, 1, 0.8, 0),
)
plt.ylabel("score difference", fontsize="14")

plt.show()