In [36]:
import pandas as pd
import sys

def clean_data(file_path: str) -> pd.DataFrame:
    """Load a tree-sample CSV and return a cleaned copy of it.

    The cleaning steps run in this order:

    1. Read the CSV, using its first column as the index.
    2. Drop rows whose column values exactly repeat an earlier row
       (the index is not part of the comparison).
    3. Forward-fill missing values, i.e. replace each NaN with the last
       non-missing value above it in the same column. A NaN in the very
       first row has nothing above it and therefore stays NaN.
    4. Capitalize the ``species`` column (first character upper-case,
       the rest lower-case).

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Cleaned DataFrame. A fresh object is built at every
        step; nothing is mutated in place.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        KeyError: If the file has no ``species`` column.
    """
    raw = pd.read_csv(file_path, index_col=0)

    return (
        raw
        .drop_duplicates()
        # ``fillna(method='ffill')`` is deprecated since pandas 2.1;
        # ``ffill()`` is the supported spelling and yields the same result.
        .ffill()
        # ``assign`` replaces the column on a new frame, so no
        # SettingWithCopy issue can arise from the de-duplicated slice.
        .assign(species=lambda frame: frame["species"].str.capitalize())
    )


In [None]:
# Load the untouched sample; the first CSV column is used as the index.
df = pd.read_csv("../data/trees_sampled.csv", index_col=0)

# Report how many NaN cells each column holds before any cleaning.
print("Missing values before cleaning:", df.isna().sum(), sep="\n")

# Report how many rows exactly repeat an earlier row.
duplicates = df.duplicated(keep="first").sum()
print(f"Number of duplicates: {duplicates}")


Missing values before cleaning:
species                            0
diameter at breast height [mm]     0
height [dm]                       27
volume with bark [m3]              0
dtype: int64
Number of duplicates: 2


In [37]:
# Run the cleaning routine on the same file inspected earlier.
path = "../data/trees_sampled.csv"
new_df = clean_data(path)

# Per-column NaN counts on the cleaned frame.
print("Missing values after cleaning:", new_df.isna().sum(), sep="\n")

# Rows that still exactly repeat an earlier row.
duplicates = new_df.duplicated(keep="first").sum()
print(f"Number of duplicates: {duplicates}")

# Distinct species labels, to eyeball the capitalization.
print("Unique species after cleaning:", new_df["species"].unique(), sep="\n")

Missing values after cleaning:
species                           0
diameter at breast height [mm]    0
height [dm]                       0
volume with bark [m3]             0
dtype: int64
Number of duplicates: 0
Unique species after cleaning:
['Norway spruce' 'Sitka spruce' 'Other spruces' 'Scots pine'
 'Mountain pine' 'European black pine' 'Pinus cembra' 'Eastern white pine'
 'Other pines' 'Silver fir' 'Grand fir' 'Other firs' 'Douglas fir'
 'European larch' 'Japanese larch' 'Other coniferous trees' 'European yew'
 'Beech' 'English oak' 'Sessile oak' 'Northern red oak' 'Common ash'
 'Hornbeam' 'Sycamore maple' 'Norway maple' 'Field maple' 'Linden tree'
 'Black locust' 'Elm, native species' 'Chestnut'
 'Misc. deciduous trees with long life expectancy' 'Service tree'
 'Common whitebeam' 'Silver birch'
 'Betula pubescens + betula pubescens var. glabrata' 'Black alder'
 'Grey alder' 'Common aspen' 'European black poplar' 'Grey poplar'
 'Silver poplar' 'Balsam poplar' 'European rowan' 'Wil