# NYC Trees

###### This Jupyter notebook cleans the NYC Trees dataset and saves the cleaned result to a CSV file.

In [9]:
# Importing the necessary libraries for this project
import pandas as pd 
import numpy as np
import time

In [10]:
def latin_family(i):
    """
    Return the first space-separated word of a latin species name.

    :parameter i: the value to inspect; it is converted with str() before use
    :return: the text that precedes the first space of str(i), or i itself
             (unchanged) when str(i) equals "nan"
    """
    text = str(i)

    # Missing values stringify to "nan"; hand them back untouched.
    if text == "nan":
        return i

    # partition() yields the text before the first space (or everything when
    # no space is present).
    first_word, _, _ = text.partition(" ")
    return first_word
    
    
def plant_family(i):
    """
    Return the last space-separated word of a common plant name.

    :parameter i: the value to inspect; it is converted with str() before use
    :return: the text that follows the last space of str(i), or i itself
             (unchanged) when str(i) equals "nan"
    """
    text = str(i)

    # Missing values stringify to "nan"; hand them back untouched.
    if text == "nan":
        return i

    # rpartition() yields the text after the last space (or everything when
    # no space is present).
    _, _, last_word = text.rpartition(" ")
    return last_word
    
def check_prob(i):
    """
    Normalise a comma-separated problems value into a sorted, de-duplicated string.

    Each comma-separated token is stripped of surrounding whitespace and mapped
    to a category label when it contains one of the known keywords
    (case-insensitive, first match wins in the order listed below). Tokens that
    match no keyword are kept as they are (stripped). Empty tokens, e.g. from a
    trailing comma, are ignored.

    :parameter i: the raw problems value; it is converted with str() before use
    :return: i itself (unchanged) when str(i) is "nan" or "None"; otherwise the
             unique labels sorted alphabetically and joined with ", "
    """
    prob = str(i)
    if prob in ("nan", "None"):
        return i

    # Order matters: a token is labelled by the first keyword it contains.
    keyword_labels = (
        ("stone", "Stones"),
        ("trunk", "Trunk"),
        ("branch", "Branch"),
        ("root", "Root"),
        ("wires", "Wires"),
        ("metal", "Metal"),
        ("sneaker", "Sneakers"),
    )

    labels = set()
    for raw_token in prob.split(","):
        # Strip so that "Stones, Foo" does not yield " Foo" and so that
        # " Foo" / "Foo" collapse into a single entry.
        token = raw_token.strip()
        if not token:
            continue

        lowered = token.lower()
        for keyword, label in keyword_labels:
            if keyword in lowered:
                labels.add(label)
                break
        else:
            # No known keyword: keep the token itself.
            labels.add(token)

    return ', '.join(sorted(labels))

In [13]:
# Default destination of the cleaned CSV (kept identical to the original
# hard-coded, Windows-style relative path so existing callers are unaffected).
_DEFAULT_OUTPUT = r'datasets\NYC-Trees-1st-version'

# Integer codes assigned to the categorical survey columns (label -> code).
# NOTE(review): recent pandas versions may parse the literal text "None" as a
# missing value in read_csv, in which case the "None" -> 0 entries never match;
# confirm against the installed pandas version.
_CATEGORY_CODES = (
    ('curb_loc', {"OnCurb": 0, "OffsetFromCurb": 1}),
    ('status', {"Alive": 0, "Stump": 1, "Dead": 2}),
    ('health', {"Good": 0, "Fair": 1, "Poor": 2}),
    ('steward', {"None": 0, "1or2": 1, "3or4": 2, "4orMore": 3}),
    ('guards', {"None": 0, "Helpful": 1, "Harmful": 2, "Unsure": 3}),
    ('sidewalk', {"NoDamage": 0, "Damage": 1}),
    ('user_type', {"Volunteer": 0, "TreesCount Staff": 1, "NYC Parks Staff": 2}),
)

# Summary flag column -> the indicator columns it is derived from.
_PROBLEM_GROUPS = (
    ('root_problems', ['root_stone', 'root_grate', 'root_other']),
    ('trunk_problems', ['trnk_wire', 'trnk_light', 'trnk_other']),
    ('branch_problems', ['brnch_ligh', 'brnch_shoe', 'brnch_othe']),
)

# Lower-cased text values that count as a positive indicator.
_TRUTHY_TEXT = ["yes", "y", "true", "1"]


def _indicator_to_bool(column):
    """Convert one indicator column (bool, numeric or text) to a boolean Series."""
    if pd.api.types.is_bool_dtype(column):
        return column.fillna(False).astype(bool)
    if pd.api.types.is_numeric_dtype(column):
        # Missing numeric values count as "no problem", matching any(skipna=True).
        return column.fillna(0).astype(bool)
    # Text columns: a plain truthiness test would treat every non-empty string
    # (including a negative answer such as "No") as True, so compare the text.
    # NOTE(review): presumably the survey encodes these as "Yes"/"No" - confirm.
    return column.astype(str).str.strip().str.lower().isin(_TRUTHY_TEXT)


def _any_problem(df, columns):
    """Return a 0/1 integer Series that is 1 where any of `columns` is positive."""
    flags = pd.concat([_indicator_to_bool(df[name]) for name in columns], axis=1)
    return flags.any(axis=1).astype("int64")


def clean_dataset(data, output_path=_DEFAULT_OUTPUT):
    """
    Clean the tree dataset read from `data` and write the result as CSV.

    Steps performed:
      * parse 'created_at' and add 'day', 'month' and 'year' columns;
      * drop 'tree_id', 'block_id' and 'state';
      * replace the labels of the categorical columns with integer codes;
      * add 'spc_latin_family' / 'spc_common_family' via latin_family / plant_family;
      * collapse the root, trunk and branch indicator columns into the 0/1
        columns 'root_problems', 'trunk_problems' and 'branch_problems';
      * add 'problem(s)', the output of check_prob applied to 'problems'.

    :parameter data: path (or anything pd.read_csv accepts) of the raw CSV
    :parameter output_path: where the cleaned CSV is written; defaults to the
                            module constant _DEFAULT_OUTPUT
    :return: a message string reporting the elapsed time in seconds
    :raises KeyError: if an expected column is missing from the input
    """
    import os

    start = time.perf_counter()
    print("We will now start cleaning your dataset.")

    df = pd.read_csv(data)
    print(f"Reading dataset from: {data}")

    df['created_at'] = pd.to_datetime(df['created_at'])
    df['day'] = df['created_at'].dt.day
    df['month'] = df['created_at'].dt.month
    df['year'] = df['created_at'].dt.year
    df = df.drop(columns=['tree_id', 'block_id'])

    # Assign the result back instead of calling replace(..., inplace=True) on
    # df[column]: the chained in-place form is deprecated and does not modify
    # df when pandas Copy-on-Write is active.
    for column, codes in _CATEGORY_CODES:
        df[column] = df[column].replace(codes)

    df['spc_latin_family'] = df['spc_latin'].apply(latin_family)
    df['spc_common_family'] = df['spc_common'].apply(plant_family)

    # One summary flag per group; the source indicator columns are then dropped.
    for flag_name, columns in _PROBLEM_GROUPS:
        df[flag_name] = _any_problem(df, columns)
        df = df.drop(columns=columns)

    df = df.drop(columns=['state'])

    df['problem(s)'] = df['problems'].apply(check_prob)

    # Make sure the destination directory exists so to_csv does not fail.
    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    df.to_csv(output_path, index=False)
    end = time.perf_counter()
    return(f"Successfully cleaned the data! It took a total of {end-start} seconds.")


In [14]:
# Path of the raw CSV to clean; the status string returned by clean_dataset
# is the last expression of the cell and is therefore shown as its output.
data = 'assets/data_100000.csv'
clean_dataset(data)

We will now start cleaning your dataset.
Reading dataset from: assets/data_100000.csv


'Successfully cleaned the data! It took a total of 6.026623725891113 seconds.'