In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import sqlite3

In [10]:
# Load the raw breed CSV into a DataFrame
csv_file = "Resources/akc-data-latest.csv"
df = pd.read_csv(csv_file)

# Connect to SQLite (creates the database file if it doesn't exist)
conn = sqlite3.connect("breed_info_database.db")
try:
    # Write the DataFrame to a SQL table, replacing any table left by an earlier run
    df.to_sql("breed_info_table", conn, if_exists="replace", index=False)
finally:
    # Release the database handle even if the write above raises
    conn.close()

print("Success")


Success


In [11]:
# Read the whole breed table back out of SQLite.
# A single query string is defined and actually executed, so the statement
# shown here is the statement that runs.
query = "SELECT * FROM breed_info_table"

conn = sqlite3.connect("breed_info_database.db")
try:
    breed_df = pd.read_sql(query, conn)
finally:
    # Release the database handle even if the read above raises
    conn.close()

# Preview the loaded rows
breed_df.head()

Unnamed: 0.1,Unnamed: 0,description,temperament,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,...,grooming_frequency_value,grooming_frequency_category,shedding_value,shedding_category,energy_level_value,energy_level_category,trainability_value,trainability_category,demeanor_value,demeanor_category
0,Affenpinscher,The Affen’s apish look has been described many...,"Confident, Famously Funny, Fearless",148,22.86,29.21,3.175147,4.535924,12.0,15.0,...,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.6,Regular Exercise,0.8,Easy Training,1.0,Outgoing
1,Afghan Hound,"The Afghan Hound is an ancient breed, his whol...","Dignified, Profoundly Loyal, Aristocratic",113,63.5,68.58,22.679619,27.215542,12.0,15.0,...,0.8,Daily Brushing,0.2,Infrequent,0.8,Energetic,0.2,May be Stubborn,0.2,Aloof/Wary
2,Airedale Terrier,The Airedale Terrier is the largest of all ter...,"Friendly, Clever, Courageous",60,58.42,58.42,22.679619,31.751466,11.0,14.0,...,0.6,2-3 Times a Week Brushing,0.4,Occasional,0.6,Regular Exercise,1.0,Eager to Please,0.8,Friendly
3,Akita,"Akitas are burly, heavy-boned spitz-type dogs ...","Courageous, Dignified, Profoundly Loyal",47,60.96,71.12,31.751466,58.967008,10.0,13.0,...,0.8,Daily Brushing,0.6,Seasonal,0.8,Energetic,1.0,Eager to Please,0.6,Alert/Responsive
4,Alaskan Malamute,The Alaskan Malamute stands 23 to 25 inches at...,"Affectionate, Loyal, Playful",58,58.42,63.5,34.019428,38.555351,10.0,14.0,...,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.8,Energetic,0.4,Independent,0.8,Friendly


In [12]:
# Label the first column "Dog Breed" (rebinding the frame instead of mutating it in place)
breed_df = breed_df.rename(columns={breed_df.columns[0]: "Dog Breed"})

# Show the first rows with the new column label
breed_df.head()

Unnamed: 0,Dog Breed,description,temperament,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,...,grooming_frequency_value,grooming_frequency_category,shedding_value,shedding_category,energy_level_value,energy_level_category,trainability_value,trainability_category,demeanor_value,demeanor_category
0,Affenpinscher,The Affen’s apish look has been described many...,"Confident, Famously Funny, Fearless",148,22.86,29.21,3.175147,4.535924,12.0,15.0,...,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.6,Regular Exercise,0.8,Easy Training,1.0,Outgoing
1,Afghan Hound,"The Afghan Hound is an ancient breed, his whol...","Dignified, Profoundly Loyal, Aristocratic",113,63.5,68.58,22.679619,27.215542,12.0,15.0,...,0.8,Daily Brushing,0.2,Infrequent,0.8,Energetic,0.2,May be Stubborn,0.2,Aloof/Wary
2,Airedale Terrier,The Airedale Terrier is the largest of all ter...,"Friendly, Clever, Courageous",60,58.42,58.42,22.679619,31.751466,11.0,14.0,...,0.6,2-3 Times a Week Brushing,0.4,Occasional,0.6,Regular Exercise,1.0,Eager to Please,0.8,Friendly
3,Akita,"Akitas are burly, heavy-boned spitz-type dogs ...","Courageous, Dignified, Profoundly Loyal",47,60.96,71.12,31.751466,58.967008,10.0,13.0,...,0.8,Daily Brushing,0.6,Seasonal,0.8,Energetic,1.0,Eager to Please,0.6,Alert/Responsive
4,Alaskan Malamute,The Alaskan Malamute stands 23 to 25 inches at...,"Affectionate, Loyal, Playful",58,58.42,63.5,34.019428,38.555351,10.0,14.0,...,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.8,Energetic,0.4,Independent,0.8,Friendly


In [13]:
# Print each column's dtype and non-null count to see where values are missing
breed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Dog Breed                    277 non-null    object 
 1   description                  277 non-null    object 
 2   temperament                  276 non-null    object 
 3   popularity                   198 non-null    object 
 4   min_height                   277 non-null    float64
 5   max_height                   277 non-null    float64
 6   min_weight                   275 non-null    float64
 7   max_weight                   275 non-null    float64
 8   min_expectancy               274 non-null    float64
 9   max_expectancy               274 non-null    float64
 10  group                        277 non-null    object 
 11  grooming_frequency_value     270 non-null    float64
 12  grooming_frequency_category  270 non-null    object 
 13  shedding_value      

In [14]:
# Tally the rows that have a NaN in at least one column
missing_rows_count = breed_df.isnull().any(axis="columns").sum()
print(f"\nNumber of rows with missing values: {missing_rows_count}")


Number of rows with missing values: 89


In [15]:
# Columns retained for the cleaned export, grouped by kind and kept in output order
identity_columns = ['Dog Breed', 'description', 'temperament', 'group']
range_columns = [
    'min_height', 'max_height', 'min_weight', 'max_weight',
    'min_expectancy', 'max_expectancy'
]
# Each trait has a numeric "_value" column followed by its "_category" label column
trait_columns = [
    'grooming_frequency_value', 'grooming_frequency_category',
    'shedding_value', 'shedding_category',
    'energy_level_value', 'energy_level_category',
    'trainability_value', 'trainability_category',
    'demeanor_value', 'demeanor_category'
]
columns_to_keep = identity_columns + range_columns + trait_columns

# Narrow the frame to exactly those columns
breed_df = breed_df.loc[:, columns_to_keep]

# Keep only the rows in which every retained column is populated
breed_df_cleaned_oana = breed_df.dropna(axis=0, how="any")

# Show the first rows of the cleaned frame
breed_df_cleaned_oana.head()

Unnamed: 0,Dog Breed,description,temperament,group,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,grooming_frequency_value,grooming_frequency_category,shedding_value,shedding_category,energy_level_value,energy_level_category,trainability_value,trainability_category,demeanor_value,demeanor_category
0,Affenpinscher,The Affen’s apish look has been described many...,"Confident, Famously Funny, Fearless",Toy Group,22.86,29.21,3.175147,4.535924,12.0,15.0,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.6,Regular Exercise,0.8,Easy Training,1.0,Outgoing
1,Afghan Hound,"The Afghan Hound is an ancient breed, his whol...","Dignified, Profoundly Loyal, Aristocratic",Hound Group,63.5,68.58,22.679619,27.215542,12.0,15.0,0.8,Daily Brushing,0.2,Infrequent,0.8,Energetic,0.2,May be Stubborn,0.2,Aloof/Wary
2,Airedale Terrier,The Airedale Terrier is the largest of all ter...,"Friendly, Clever, Courageous",Terrier Group,58.42,58.42,22.679619,31.751466,11.0,14.0,0.6,2-3 Times a Week Brushing,0.4,Occasional,0.6,Regular Exercise,1.0,Eager to Please,0.8,Friendly
3,Akita,"Akitas are burly, heavy-boned spitz-type dogs ...","Courageous, Dignified, Profoundly Loyal",Working Group,60.96,71.12,31.751466,58.967008,10.0,13.0,0.8,Daily Brushing,0.6,Seasonal,0.8,Energetic,1.0,Eager to Please,0.6,Alert/Responsive
4,Alaskan Malamute,The Alaskan Malamute stands 23 to 25 inches at...,"Affectionate, Loyal, Playful",Working Group,58.42,63.5,34.019428,38.555351,10.0,14.0,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.8,Energetic,0.4,Independent,0.8,Friendly


In [16]:
# Write the cleaned frame to the Resources folder, leaving the row index out of the file
cleaned_csv_path = "Resources/oana-akc-data.csv"
breed_df_cleaned_oana.to_csv(cleaned_csv_path, index=False)

print("Cleaned CSV file has been saved successfully.")

Cleaned CSV file has been saved successfully.


In [17]:
# Columns carried forward: the breed name, the numeric range columns,
# and the category label for each trait
measurement_columns = [
    'min_height', 'max_height',
    'min_weight', 'max_weight',
    'min_expectancy', 'max_expectancy'
]
trait_label_columns = [
    'grooming_frequency_category', 'shedding_category', 'energy_level_category',
    'trainability_category', 'demeanor_category'
]
columns_to_keep = ['Dog Breed'] + measurement_columns + trait_label_columns

# Narrow the frame to exactly those columns
breed_df = breed_df.loc[:, columns_to_keep]

# Show the first rows of the narrowed frame
breed_df.head()


Unnamed: 0,Dog Breed,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,grooming_frequency_category,shedding_category,energy_level_category,trainability_category,demeanor_category
0,Affenpinscher,22.86,29.21,3.175147,4.535924,12.0,15.0,2-3 Times a Week Brushing,Seasonal,Regular Exercise,Easy Training,Outgoing
1,Afghan Hound,63.5,68.58,22.679619,27.215542,12.0,15.0,Daily Brushing,Infrequent,Energetic,May be Stubborn,Aloof/Wary
2,Airedale Terrier,58.42,58.42,22.679619,31.751466,11.0,14.0,2-3 Times a Week Brushing,Occasional,Regular Exercise,Eager to Please,Friendly
3,Akita,60.96,71.12,31.751466,58.967008,10.0,13.0,Daily Brushing,Seasonal,Energetic,Eager to Please,Alert/Responsive
4,Alaskan Malamute,58.42,63.5,34.019428,38.555351,10.0,14.0,2-3 Times a Week Brushing,Seasonal,Energetic,Independent,Friendly


In [18]:
# Keep only the rows in which every column is populated
breed_df_cleaned = breed_df.dropna(axis=0, how="any")

# Report how many rows survived the drop
rows_after_dropping = len(breed_df_cleaned)
print(f"\nNumber of rows after dropping missing values: {rows_after_dropping}")



Number of rows after dropping missing values: 236


In [19]:
# Show the first ten rows that remain once rows with blanks are removed
breed_df_cleaned.iloc[:10]

Unnamed: 0,Dog Breed,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,grooming_frequency_category,shedding_category,energy_level_category,trainability_category,demeanor_category
0,Affenpinscher,22.86,29.21,3.175147,4.535924,12.0,15.0,2-3 Times a Week Brushing,Seasonal,Regular Exercise,Easy Training,Outgoing
1,Afghan Hound,63.5,68.58,22.679619,27.215542,12.0,15.0,Daily Brushing,Infrequent,Energetic,May be Stubborn,Aloof/Wary
2,Airedale Terrier,58.42,58.42,22.679619,31.751466,11.0,14.0,2-3 Times a Week Brushing,Occasional,Regular Exercise,Eager to Please,Friendly
3,Akita,60.96,71.12,31.751466,58.967008,10.0,13.0,Daily Brushing,Seasonal,Energetic,Eager to Please,Alert/Responsive
4,Alaskan Malamute,58.42,63.5,34.019428,38.555351,10.0,14.0,2-3 Times a Week Brushing,Seasonal,Energetic,Independent,Friendly
5,American Bulldog,50.8,63.5,27.215542,45.359237,10.0,12.0,Occasional Bath/Brush,Seasonal,Energetic,Agreeable,Alert/Responsive
6,American English Coonhound,58.42,66.04,20.411657,29.483504,11.0,12.0,Occasional Bath/Brush,Occasional,Energetic,Agreeable,Alert/Responsive
7,American Eskimo Dog,22.86,48.26,2.721554,15.875733,13.0,15.0,Weekly Brushing,Seasonal,Energetic,Eager to Please,Outgoing
8,American Foxhound,53.34,63.5,27.215542,31.751466,11.0,13.0,Occasional Bath/Brush,Seasonal,Energetic,Independent,Friendly
10,American Leopard Hound,53.34,68.58,20.411657,31.751466,12.0,15.0,Weekly Brushing,Seasonal,Energetic,Eager to Please,Alert/Responsive


In [20]:
# Categorical trait columns to one-hot encode; every other column is passed through unchanged
categorical_columns = ['grooming_frequency_category', 'shedding_category', 'energy_level_category', 'trainability_category', 'demeanor_category']

# Build the ColumnTransformer that one-hot encodes only the selected columns
encoder = ColumnTransformer(
    transformers=[
        ('breed_encoder', OneHotEncoder(), categorical_columns)  # Apply OneHotEncoder only on the selected columns
    ],
    remainder='passthrough',  # Keep non-categorical columns as is
    # Force a dense result instead of relying on the density heuristic: the
    # passthrough block includes the string 'Dog Breed' column, which is not
    # numeric and so does not belong in a sparse matrix.
    sparse_threshold=0,
    # Emit plain column names without "<transformer>__" prefixes
    verbose_feature_names_out=False,
)

# Fit the encoder and transform the cleaned data in one step
encoded_data = encoder.fit_transform(breed_df_cleaned)

# Names of the one-hot columns only (kept under this name for any later use)
encoded_columns = encoder.named_transformers_['breed_encoder'].get_feature_names_out(categorical_columns)

# Ask the fitted transformer for the names of ALL output columns, in output
# order, rather than re-deriving that order by hand
all_output_columns = encoder.get_feature_names_out()

# The transformer returns one object-dtype array because strings and floats are
# mixed; infer_objects() restores float dtypes for the numeric columns.
encoded_df = pd.DataFrame(encoded_data, columns=all_output_columns).infer_objects()

# Display the first few rows of the encoded DataFrame
encoded_df.head()


Unnamed: 0,grooming_frequency_category_2-3 Times a Week Brushing,grooming_frequency_category_Daily Brushing,grooming_frequency_category_Occasional Bath/Brush,grooming_frequency_category_Specialty/Professional,grooming_frequency_category_Weekly Brushing,shedding_category_Frequent,shedding_category_Infrequent,shedding_category_Occasional,shedding_category_Regularly,shedding_category_Seasonal,...,demeanor_category_Friendly,demeanor_category_Outgoing,demeanor_category_Reserved with Strangers,Dog Breed,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,Affenpinscher,22.86,29.21,3.175147,4.535924,12.0,15.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,Afghan Hound,63.5,68.58,22.679619,27.215542,12.0,15.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,Airedale Terrier,58.42,58.42,22.679619,31.751466,11.0,14.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,Akita,60.96,71.12,31.751466,58.967008,10.0,13.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,Alaskan Malamute,58.42,63.5,34.019428,38.555351,10.0,14.0
