In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path

import re

# Import Data

In [2]:
# Directory holding every input CSV; change it here to relocate all datasets
RESOURCES_DIR = Path("Resources")

# Base datasets
squirrel_2020 = RESOURCES_DIR / "squirrel_data_2020.csv"
squirrel_2018 = RESOURCES_DIR / "squirrel_data_2018.csv"

# Additional dataset from 2020
parks_2020 = RESOURCES_DIR / "park_data_2020.csv"

# 2020 Dataset - Squirrels

In [3]:
# Load the 2020 squirrel CSV into a DataFrame
# NOTE(review): the "unicode_escape" codec also interprets backslash escape
# sequences found in the data - confirm the file's real encoding.
s2020_df = pd.read_csv(
    filepath_or_buffer=squirrel_2020,
    encoding="unicode_escape",
)

# Report the (rows, columns) shape, then preview the first five rows
print(f"2020 Squirrel Dataset: {s2020_df.shape}")
s2020_df.head(n=5)

2020 Squirrel Dataset: (433, 16)


Unnamed: 0,Area Name,Area ID,Park Name,Park ID,Squirrel ID,Primary Fur Color,Highlights in Fur Color,Color Notes,Location,Above Ground (Height in Feet),Specific Location,Activities,Interactions with Humans,Other Notes or Observations,Squirrel Latitude (DD.DDDDDD),Squirrel Longitude (-DD.DDDDDD)
0,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-01,Gray,White,,Ground Plane,,,Foraging,Indifferent,,40.85941,-73.933936
1,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-02,Gray,White,,Ground Plane,,,Foraging,Indifferent,Looks skinny,40.859436,-73.933937
2,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-03,Gray,White,,Ground Plane,,,"Eating, Digging something",Indifferent,,40.859416,-73.933894
3,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-04,Gray,White,,Ground Plane,,,Running,Indifferent,,40.859418,-73.933895
4,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,,Ground Plane,,,"Running, Eating",Indifferent,She left food,40.859493,-73.93359


In [6]:
# Summarise the 2020 frame as loaded: index range plus, for every column,
# its non-null count and dtype. Columns whose non-null count is below the
# total number of entries contain missing values.
s2020_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433 entries, 0 to 432
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Area Name                        433 non-null    object 
 1   Area ID                          433 non-null    object 
 2   Park Name                        433 non-null    object 
 3   Park ID                          433 non-null    int64  
 4   Squirrel ID                      433 non-null    object 
 5   Primary Fur Color                432 non-null    object 
 6   Highlights in Fur Color          339 non-null    object 
 7   Color Notes                      10 non-null     object 
 8   Location                         399 non-null    object 
 9   Above Ground (Height in Feet)    112 non-null    object 
 10  Specific Location                89 non-null     object 
 11  Activities                       378 non-null    object 
 12  Interactions with Huma

In [7]:
# Minimum-requirement columns: a row is kept only if none of these is null.
# ("Other Notes or Observations" is not part of the required set.)
dropna_columns = [
    "Highlights in Fur Color",
    "Activities",
    "Interactions with Humans",
    "Squirrel Latitude (DD.DDDDDD)",
    "Squirrel Longitude (-DD.DDDDDD)",
]

# Discard every row that is missing a value in any required column
s2020_nonull = s2020_df.dropna(
    how="any",
    subset=dropna_columns,
)

# Report the remaining (rows, columns) shape, then preview the first five rows
print(f"{s2020_nonull.shape}")
s2020_nonull.head(n=5)

(192, 16)


Unnamed: 0,Area Name,Area ID,Park Name,Park ID,Squirrel ID,Primary Fur Color,Highlights in Fur Color,Color Notes,Location,Above Ground (Height in Feet),Specific Location,Activities,Interactions with Humans,Other Notes or Observations,Squirrel Latitude (DD.DDDDDD),Squirrel Longitude (-DD.DDDDDD)
0,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-01,Gray,White,,Ground Plane,,,Foraging,Indifferent,,40.85941,-73.933936
1,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-02,Gray,White,,Ground Plane,,,Foraging,Indifferent,Looks skinny,40.859436,-73.933937
2,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-03,Gray,White,,Ground Plane,,,"Eating, Digging something",Indifferent,,40.859416,-73.933894
3,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-04,Gray,White,,Ground Plane,,,Running,Indifferent,,40.859418,-73.933895
4,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,,Ground Plane,,,"Running, Eating",Indifferent,She left food,40.859493,-73.93359


In [9]:
# Re-check the non-null counts and dtypes on the filtered frame, to see which
# columns are fully populated now that rows with nulls in the required
# columns have been dropped.
s2020_nonull.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 0 to 432
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Area Name                        192 non-null    object 
 1   Area ID                          192 non-null    object 
 2   Park Name                        192 non-null    object 
 3   Park ID                          192 non-null    int64  
 4   Squirrel ID                      192 non-null    object 
 5   Primary Fur Color                192 non-null    object 
 6   Highlights in Fur Color          192 non-null    object 
 7   Color Notes                      5 non-null      object 
 8   Location                         192 non-null    object 
 9   Above Ground (Height in Feet)    58 non-null     object 
 10  Specific Location                28 non-null     object 
 11  Activities                       192 non-null    object 
 12  Interactions with Huma

In [10]:
# Rename the columns
cols_df = pd.DataFrame(s2020_nonull.columns, columns=["name"])

# Define the regex pattern. It is a raw string so that "\(" reaches the regex
# engine as an escape instead of being an invalid string escape:
# a literal "(", the shortest run of any non-newline characters (captured),
# then a literal ")".
pattern = r'\((.*?)\)'

# The same pattern, also swallowing the whitespace that precedes the bracket
bracketed_note = re.compile(r"\s*" + pattern)


def clean_column_name(name: str) -> str:
    """Return a snake_case column name with bracketed annotations removed.

    Every "(...)" group, together with any whitespace directly before it, is
    deleted as a whole substring. Surrounding whitespace is then trimmed, the
    remaining spaces become underscores and the result is lowercased, e.g.
    "Above Ground (Height in Feet)" -> "above_ground".
    """
    # re.sub deletes exactly the bracketed text. str.strip() must not be used
    # for this: its argument is a *set of characters*, so it can also eat
    # letters that belong to the name ("Height (Feet)" would become "Heigh").
    without_brackets = bracketed_note.sub("", name).strip()

    # Use underscore and cast to lowercase
    return without_brackets.replace(" ", "_").lower()


new_columns = [clean_column_name(name) for name in cols_df["name"]]

# Update the columns
s2020_nonull.columns = new_columns

s2020_nonull.head()

Unnamed: 0,area_name,area_id,park_name,park_id,squirrel_id,primary_fur_color,highlights_in_fur_color,color_notes,location,above_ground,specific_location,activities,interactions_with_humans,other_notes_or_observations,squirrel_latitude,squirrel_longitude
0,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-01,Gray,White,,Ground Plane,,,Foraging,Indifferent,,40.85941,-73.933936
1,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-02,Gray,White,,Ground Plane,,,Foraging,Indifferent,Looks skinny,40.859436,-73.933937
2,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-03,Gray,White,,Ground Plane,,,"Eating, Digging something",Indifferent,,40.859416,-73.933894
3,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-04,Gray,White,,Ground Plane,,,Running,Indifferent,,40.859418,-73.933895
4,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,,Ground Plane,,,"Running, Eating",Indifferent,She left food,40.859493,-73.93359
