In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession

In [2]:
# Initialize Spark Session
# NOTE: every line in this cell is commented out, so nothing here executes.
# The dataset is read with pandas (pd.read_csv) in a later cell instead.
# NOTE(review): the CSV path used below ("akc-data-latest.csv") has no
# "Resources/" prefix, unlike the path the pandas cell reads from -- confirm
# the intended location before re-enabling this cell.
# spark = SparkSession.builder \
#     .appName("Dog Breed Prediction") \
#     .getOrCreate() 

# # Load CSV file into a Spark DataFrame (first row is the header; column types are inferred)
# df = spark.read.csv("akc-data-latest.csv", header=True, inferSchema=True)

# # Show the first few rows
# df.show(5)

# # Print the schema of the DataFrame
# df.printSchema()

# # Convert Spark DataFrame to Pandas DataFrame (if needed for local analysis)
# df_pandas = df.toPandas()

# # Write the DataFrame back to a CSV file (overwrite if exists)
# df.write.csv("output-folder/akc-data-cleaned", header=True, mode="overwrite")


In [6]:
# Location of the raw AKC breed dataset, relative to the notebook's working directory
file_path = "Resources/akc-data-latest.csv"

# Read the CSV into a Pandas DataFrame and keep it untouched under its own name,
# so the raw data stays available for inspection after the rename below.
raw_breed_df = pd.read_csv(file_path)

# Give the first column (it holds the breed names) a descriptive label.
# rename() without inplace=True returns a new frame instead of mutating the
# raw one, which keeps this step side-effect free.
breed_df = raw_breed_df.rename(columns={raw_breed_df.columns[0]: "Dog Breed"})

# Preview the first rows of the renamed frame
breed_df.head()

Unnamed: 0,Dog Breed,description,temperament,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,...,grooming_frequency_value,grooming_frequency_category,shedding_value,shedding_category,energy_level_value,energy_level_category,trainability_value,trainability_category,demeanor_value,demeanor_category
0,Affenpinscher,The Affen’s apish look has been described many...,"Confident, Famously Funny, Fearless",148,22.86,29.21,3.175147,4.535924,12.0,15.0,...,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.6,Regular Exercise,0.8,Easy Training,1.0,Outgoing
1,Afghan Hound,"The Afghan Hound is an ancient breed, his whol...","Dignified, Profoundly Loyal, Aristocratic",113,63.5,68.58,22.679619,27.215542,12.0,15.0,...,0.8,Daily Brushing,0.2,Infrequent,0.8,Energetic,0.2,May be Stubborn,0.2,Aloof/Wary
2,Airedale Terrier,The Airedale Terrier is the largest of all ter...,"Friendly, Clever, Courageous",60,58.42,58.42,22.679619,31.751466,11.0,14.0,...,0.6,2-3 Times a Week Brushing,0.4,Occasional,0.6,Regular Exercise,1.0,Eager to Please,0.8,Friendly
3,Akita,"Akitas are burly, heavy-boned spitz-type dogs ...","Courageous, Dignified, Profoundly Loyal",47,60.96,71.12,31.751466,58.967008,10.0,13.0,...,0.8,Daily Brushing,0.6,Seasonal,0.8,Energetic,1.0,Eager to Please,0.6,Alert/Responsive
4,Alaskan Malamute,The Alaskan Malamute stands 23 to 25 inches at...,"Affectionate, Loyal, Playful",58,58.42,63.5,34.019428,38.555351,10.0,14.0,...,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.8,Energetic,0.4,Independent,0.8,Friendly


In [9]:
# Per-column tally of non-missing cells (shows which columns have gaps)
breed_df.notna().sum()

Dog Breed                      277
description                    277
temperament                    276
popularity                     198
min_height                     277
max_height                     277
min_weight                     275
max_weight                     275
min_expectancy                 274
max_expectancy                 274
group                          277
grooming_frequency_value       270
grooming_frequency_category    270
shedding_value                 257
shedding_category              257
energy_level_value             271
energy_level_category          271
trainability_value             253
trainability_category          253
demeanor_value                 252
demeanor_category              252
dtype: int64

In [7]:
# Flag every row holding at least one NaN, then tally the flagged rows
rows_with_gaps = breed_df.isnull().any(axis="columns")
missing_rows_count = rows_with_gaps.sum()
print(f"\nNumber of rows with missing values: {missing_rows_count}")


Number of rows with missing values: 89


In [10]:
# Keep only fully populated rows: a single NaN in any column removes the row
breed_df_cleaned = breed_df.dropna(axis="index", how="any")

# Report how many rows survive the filter
rows_after_dropping = len(breed_df_cleaned)
print(f"\nNumber of rows after dropping missing values: {rows_after_dropping}")



Number of rows after dropping missing values: 188


In [18]:
# Show the first 60 rows that remain once incomplete rows have been removed
breed_df_cleaned.iloc[:60]

Unnamed: 0,Dog Breed,description,temperament,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,...,grooming_frequency_value,grooming_frequency_category,shedding_value,shedding_category,energy_level_value,energy_level_category,trainability_value,trainability_category,demeanor_value,demeanor_category
0,Affenpinscher,The Affen’s apish look has been described many...,"Confident, Famously Funny, Fearless",148,22.86,29.21,3.175147,4.535924,12.0,15.0,...,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.6,Regular Exercise,0.8,Easy Training,1.0,Outgoing
1,Afghan Hound,"The Afghan Hound is an ancient breed, his whol...","Dignified, Profoundly Loyal, Aristocratic",113,63.5,68.58,22.679619,27.215542,12.0,15.0,...,0.8,Daily Brushing,0.2,Infrequent,0.8,Energetic,0.2,May be Stubborn,0.2,Aloof/Wary
2,Airedale Terrier,The Airedale Terrier is the largest of all ter...,"Friendly, Clever, Courageous",60,58.42,58.42,22.679619,31.751466,11.0,14.0,...,0.6,2-3 Times a Week Brushing,0.4,Occasional,0.6,Regular Exercise,1.0,Eager to Please,0.8,Friendly
3,Akita,"Akitas are burly, heavy-boned spitz-type dogs ...","Courageous, Dignified, Profoundly Loyal",47,60.96,71.12,31.751466,58.967008,10.0,13.0,...,0.8,Daily Brushing,0.6,Seasonal,0.8,Energetic,1.0,Eager to Please,0.6,Alert/Responsive
4,Alaskan Malamute,The Alaskan Malamute stands 23 to 25 inches at...,"Affectionate, Loyal, Playful",58,58.42,63.5,34.019428,38.555351,10.0,14.0,...,0.6,2-3 Times a Week Brushing,0.6,Seasonal,0.8,Energetic,0.4,Independent,0.8,Friendly
6,American English Coonhound,"Standing as high as 26 inches at the shoulder,...","Sweet, Mellow, Sociable",175,58.42,66.04,20.411657,29.483504,11.0,12.0,...,0.2,Occasional Bath/Brush,0.4,Occasional,0.8,Energetic,0.6,Agreeable,0.6,Alert/Responsive
7,American Eskimo Dog,The American Eskimo Dog comes in three sizes—s...,"Playful, Perky, Smart",122,22.86,48.26,2.721554,15.875733,13.0,15.0,...,0.4,Weekly Brushing,0.6,Seasonal,0.8,Energetic,1.0,Eager to Please,1.0,Outgoing
8,American Foxhound,"American Foxhounds are sleek, rangy hunters kn...","Independent, Easy-Going, Sweet-Tempered",186,53.34,63.5,27.215542,31.751466,11.0,13.0,...,0.2,Occasional Bath/Brush,0.6,Seasonal,0.8,Energetic,0.4,Independent,0.8,Friendly
11,American Staffordshire Terrier,"AmStaffs are stocky, muscular bull-type terrie...","Confident, Smart, Good-Natured",85,43.18,48.26,18.143695,31.751466,12.0,16.0,...,0.2,Occasional Bath/Brush,0.4,Occasional,0.6,Regular Exercise,0.6,Agreeable,0.6,Alert/Responsive
12,American Water Spaniel,American Water Spaniels are muscular midsize g...,"Eager, Happy, Charming",166,38.1,45.72,11.339809,20.411657,10.0,14.0,...,0.6,2-3 Times a Week Brushing,0.2,Infrequent,0.6,Regular Exercise,1.0,Eager to Please,0.8,Friendly


In [19]:
# Persist the cleaned frame in the Resources folder; index=False keeps the
# DataFrame's row index out of the written file.
cleaned_csv_path = "Resources/akc-data-cleaned.csv"
breed_df_cleaned.to_csv(cleaned_csv_path, index=False)

print("Cleaned CSV file has been saved successfully.")

Cleaned CSV file has been saved successfully.
