# Load dataset

In [13]:
import numpy
import kagglehub
import pandas as pd
import os

# Download the latest version of the dataset. The returned `path` is used
# below as the local directory that holds the downloaded files.
path = kagglehub.dataset_download("lainguyn123/animal-planet")

# Collect the CSV files in the download directory. os.listdir() returns
# entries in arbitrary order, so sort them to make "the first CSV"
# deterministic across runs and platforms.
csv_files = sorted(f for f in os.listdir(path) if f.endswith(".csv"))

# Fail with a clear message instead of an opaque IndexError when the
# download directory contains no CSV file at its top level.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Load the first CSV (alphabetically); any additional CSV files are ignored.
df = pd.read_csv(os.path.join(path, csv_files[0]))

# Descriptive statistics of the dataset

In [21]:
# Overview of the frame: dimensions, column names and one sample record.
n_rows_cols = df.shape  # (row count, column count)
column_list = ", ".join(df.columns)
first_record = df.head(1)

print("Dataset shape:", n_rows_cols, sep="\n")
print("\nColumns:\n", column_list)
print("\nFirst row of data:", first_record, sep="\n")
print("\nData description:")

# Keep describe() as the cell's final expression so the notebook renders it.
df.describe(include='all')  # summary statistics across all columns

Dataset shape:
(29357, 22)

Columns:
 Name, Kingdom, Phylum, Subphylum, Class, Order, Suborder, Family, Genus, Species, Population size, Life span, Top speed, Weight, Height, Length, Attributes, Distribution, Habits, Diet, Mating_Habits, Population

First row of data:
        Name   Kingdom    Phylum   Subphylum     Class      Order    Suborder  \
0  Grey Wolf  Animalia  Chordata  Vertebrata  Mammalia  Carnivora  Caniformia   

    Family  Genus      Species  ... Top speed    Weight    Height      Length  \
0  Canidae  Canis  Canis lupus  ...   75 km/h  16-60 kg  80-85 cm  105-160 cm   

                                          Attributes  \
0  Nocturnal,Carnivore,Scavenger,Terrestrial,Curs...   

                                        Distribution  \
0  {'Geography': {'Continents': 'Asia, Europe, No...   

                                              Habits                  Diet  \
0  {'Group name': 'pack, route, rout', 'Lifestyle...  Carnivore,,Scavenger   

                      

Unnamed: 0,Name,Kingdom,Phylum,Subphylum,Class,Order,Suborder,Family,Genus,Species,...,Top speed,Weight,Height,Length,Attributes,Distribution,Habits,Diet,Mating_Habits,Population
count,29357,29357,28525,10752,29357,28920,2960,29354,29351,29357,...,595,2873,336,2947,29357,29357,29357,2644,29357,29357
unique,29357,416,2,2,12,123,16,858,6194,28688,...,138,2255,254,2122,3133,8441,2179,308,2713,41
top,Junín canastero,Animalia,Chordata,Vertebrata,Aves,Passeriformes,Serpentes,Muridae,Conus,Canidae,...,40 km/h,2-3 kg,1 m,1.5-2 m,"Terrestrial,Not a migrant,starts with","{'Geography': {'Countries': ''}, 'Biome': [], ...",{},Carnivore,{},"{'Population trend': 'Stable', 'Population sta..."
freq,1,28525,23659,10152,10114,5863,1132,764,535,60,...,48,14,6,11,8463,3329,18667,365,26268,6901


# Missing values

In [23]:
# Tally the NaN/None entries in each column, then report them under a heading.
missing_counts = df.isna().sum()
print("Missing values per column:", missing_counts, sep="\n")

Missing values per column:
Name                   0
Kingdom                0
Phylum               832
Subphylum          18605
Class                  0
Order                437
Suborder           26397
Family                 3
Genus                  6
Species                0
Population size    27551
Life span          27495
Top speed          28762
Weight             26484
Height             29021
Length             26410
Attributes             0
Distribution           0
Habits                 0
Diet               26713
Mating_Habits          0
Population             0
dtype: int64


In [24]:
# The mean of the boolean NaN mask is the missing fraction per column;
# scale it by 100 to express it as a percentage.
missing_fraction = df.isna().mean()
missing_pct = missing_fraction * 100
print("\nPercentage of missing values per column:", missing_pct, sep="\n")


Percentage of missing values per column:
Name                0.000000
Kingdom             0.000000
Phylum              2.834077
Subphylum          63.375004
Class               0.000000
Order               1.488572
Suborder           89.917226
Family              0.010219
Genus               0.020438
Species             0.000000
Population size    93.848145
Life span          93.657390
Top speed          97.973226
Weight             90.213578
Height             98.855469
Length             89.961508
Attributes          0.000000
Distribution        0.000000
Habits              0.000000
Diet               90.993630
Mating_Habits       0.000000
Population          0.000000
dtype: float64


In [34]:
# Rows that exactly repeat an earlier row (all columns equal).
duplicate_row_count = df.duplicated().sum()
print("\nNumber of duplicated rows:", duplicate_row_count)

# Number of distinct non-null values in each column.
unique_counts = df.nunique()
print("\nUnique values per column:\n", unique_counts)


Number of duplicated rows: 0

Unique values per column:
 Name               29357
Kingdom              416
Phylum                 2
Subphylum              2
Class                 12
Order                123
Suborder              16
Family               858
Genus               6194
Species            28688
Population size      652
Life span            606
Top speed            138
Weight              2255
Height               254
Length              2122
Attributes          3133
Distribution        8441
Habits              2179
Diet                 308
Mating_Habits       2713
Population            41
dtype: int64
