In [19]:
# Import dependencies
import pandas as pd
from pathlib import Path
import re

# Import Data

In [20]:
# Base datasets
squirrel_2020 = Path("Resources/squirrel_data_2020.csv")
squirrel_2018 = Path("Resources/squirrel_data_2018.csv")

# Additional dataset from 2020
parks_2020 = Path("Resources/park_data_2020.csv")

# 2020 Dataset - Squirrels

In [87]:
# Create dataframe from CSV
s2020_df = pd.read_csv(squirrel_2020, encoding="unicode_escape")

# Display DataFrame and its shape
print(f"2020 Squirrel Dataset: {s2020_df.shape}")
s2020_df.head()

2020 Squirrel Dataset: (433, 16)


Unnamed: 0,Area Name,Area ID,Park Name,Park ID,Squirrel ID,Primary Fur Color,Highlights in Fur Color,Color Notes,Location,Above Ground (Height in Feet),Specific Location,Activities,Interactions with Humans,Other Notes or Observations,Squirrel Latitude (DD.DDDDDD),Squirrel Longitude (-DD.DDDDDD)
0,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-01,Gray,White,,Ground Plane,,,Foraging,Indifferent,,40.85941,-73.933936
1,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-02,Gray,White,,Ground Plane,,,Foraging,Indifferent,Looks skinny,40.859436,-73.933937
2,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-03,Gray,White,,Ground Plane,,,"Eating, Digging something",Indifferent,,40.859416,-73.933894
3,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-04,Gray,White,,Ground Plane,,,Running,Indifferent,,40.859418,-73.933895
4,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,,Ground Plane,,,"Running, Eating",Indifferent,She left food,40.859493,-73.93359


In [88]:
# Identify the non-null count and data types
s2020_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433 entries, 0 to 432
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Area Name                        433 non-null    object 
 1   Area ID                          433 non-null    object 
 2   Park Name                        433 non-null    object 
 3   Park ID                          433 non-null    int64  
 4   Squirrel ID                      433 non-null    object 
 5   Primary Fur Color                432 non-null    object 
 6   Highlights in Fur Color          339 non-null    object 
 7   Color Notes                      10 non-null     object 
 8   Location                         399 non-null    object 
 9   Above Ground (Height in Feet)    112 non-null    object 
 10  Specific Location                89 non-null     object 
 11  Activities                       378 non-null    object 
 12  Interactions with Huma

In [89]:
# Drop null values for minimum requirement columns
dropna_columns = [
    "Highlights in Fur Color",
    "Activities",
    "Interactions with Humans",
    "Squirrel Latitude (DD.DDDDDD)",
    "Squirrel Longitude (-DD.DDDDDD)"
] # "Other Notes or Observations"

s2020_nonull = s2020_df.dropna(subset=dropna_columns, how="any")

# Display DataFrame and its shape
print(f"{s2020_nonull.shape}")
s2020_nonull.head()

(192, 16)


Unnamed: 0,Area Name,Area ID,Park Name,Park ID,Squirrel ID,Primary Fur Color,Highlights in Fur Color,Color Notes,Location,Above Ground (Height in Feet),Specific Location,Activities,Interactions with Humans,Other Notes or Observations,Squirrel Latitude (DD.DDDDDD),Squirrel Longitude (-DD.DDDDDD)
0,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-01,Gray,White,,Ground Plane,,,Foraging,Indifferent,,40.85941,-73.933936
1,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-02,Gray,White,,Ground Plane,,,Foraging,Indifferent,Looks skinny,40.859436,-73.933937
2,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-03,Gray,White,,Ground Plane,,,"Eating, Digging something",Indifferent,,40.859416,-73.933894
3,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-04,Gray,White,,Ground Plane,,,Running,Indifferent,,40.859418,-73.933895
4,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,,Ground Plane,,,"Running, Eating",Indifferent,She left food,40.859493,-73.93359


In [90]:
# Identify the non-null count and data types
s2020_nonull.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 0 to 432
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Area Name                        192 non-null    object 
 1   Area ID                          192 non-null    object 
 2   Park Name                        192 non-null    object 
 3   Park ID                          192 non-null    int64  
 4   Squirrel ID                      192 non-null    object 
 5   Primary Fur Color                192 non-null    object 
 6   Highlights in Fur Color          192 non-null    object 
 7   Color Notes                      5 non-null      object 
 8   Location                         192 non-null    object 
 9   Above Ground (Height in Feet)    58 non-null     object 
 10  Specific Location                28 non-null     object 
 11  Activities                       192 non-null    object 
 12  Interactions with Huma

In [91]:
# Rename the columns
cols_df = pd.DataFrame(s2020_nonull.columns, columns=["name"])

# Define the regex pattern
pattern = '\((.*?)\)' # \( and \) to escape brackets, *.? matches any character (non-newline) zero or more times

new_columns = []
for row in cols_df["name"]:
    stripped_row = re.findall(pattern, row)

    # Remove the brackets and content
    if len(stripped_row) > 0:
        remove_substring = f' ({stripped_row[0].strip("[]")})'
        row = row.strip(remove_substring)

    # Use underscore and cast to lowercase
    new_columns.append(row.replace(" ", "_").lower())

# Update the columns
s2020_nonull.columns = new_columns

s2020_nonull.head()

Unnamed: 0,area_name,area_id,park_name,park_id,squirrel_id,primary_fur_color,highlights_in_fur_color,color_notes,location,above_ground,specific_location,activities,interactions_with_humans,other_notes_or_observations,squirrel_latitude,squirrel_longitude
0,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-01,Gray,White,,Ground Plane,,,Foraging,Indifferent,,40.85941,-73.933936
1,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-02,Gray,White,,Ground Plane,,,Foraging,Indifferent,Looks skinny,40.859436,-73.933937
2,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-03,Gray,White,,Ground Plane,,,"Eating, Digging something",Indifferent,,40.859416,-73.933894
3,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-04,Gray,White,,Ground Plane,,,Running,Indifferent,,40.859418,-73.933895
4,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,,Ground Plane,,,"Running, Eating",Indifferent,She left food,40.859493,-73.93359


In [127]:
activities = []

# Split the activities column
for sighting in s2020_nonull['activities']:
    activities.append(sighting.split(", "))

# Flatten the list
all_activities = [value.lower() for row in activities for value in row]

# Get the unique activities
unique_activities = set(all_activities)
unique_activities

# 2018 equivalent activities
# parsed_list = ["running", "chasing", "climbing", "eating", "foraging"]

activities_df = pd.DataFrame(all_activities)
activities_df.value_counts()

[['Foraging'],
 ['Foraging'],
 ['Eating', 'Digging something'],
 ['Running'],
 ['Running', 'Eating'],
 ['Climbing'],
 ['Foraging'],
 ['Climbing'],
 ['Foraging'],
 ['Eating', 'Digging'],
 ['Eating', 'Digging'],
 ['Running'],
 ['Running'],
 ['Foraging'],
 ['Running'],
 ['Running'],
 ['Foraging'],
 ['Foraging', 'Nesting/gathering leaves'],
 ['Chasing'],
 ['Running'],
 ['Eating'],
 ['Climbing'],
 ['Sleeping'],
 ['Running'],
 ['Running'],
 ['Eating'],
 ['Running'],
 ['Foraging'],
 ['Climbing'],
 ['Foraging'],
 ['Foraging'],
 ['Foraging'],
 ['Eating'],
 ['Sitting'],
 ['Chasing', 'Climbing'],
 ['Chasing', 'Climbing', 'Eating'],
 ['Running'],
 ['Running'],
 ['Running'],
 ['Running', 'Chasing', 'Climbing'],
 ['Climbing', 'Foraging'],
 ['Vocalization at us'],
 ['Running', 'Foraging'],
 ['Running', 'Eating'],
 ['Eating', 'Foraging'],
 ['Climbing'],
 ['Foraging'],
 ['Eating'],
 ['Foraging'],
 ['Climbing', 'Eating'],
 ['Chasing', 'Climbing'],
 ['Chasing', 'Climbing'],
 ['Running'],
 ['Foraging'],
 

In [132]:
activities_list_df = pd.Series(activities)
activities_list_df

0                        [Foraging]
1                        [Foraging]
2       [Eating, Digging something]
3                         [Running]
4                 [Running, Eating]
                   ...             
187                      [Foraging]
188             [Running, Foraging]
189                      [Climbing]
190    [Running, Chasing, Climbing]
191                      [Foraging]
Length: 192, dtype: object

In [123]:
# Combine all the "eating"
# s2020_nonull[s2020_nonull['activities'].str.startswith("Eating (")]['activities']
s2020_nonull[s2020_nonull['squirrel_id'] == "C-18-01"]

Unnamed: 0,area_name,area_id,park_name,park_id,squirrel_id,primary_fur_color,highlights_in_fur_color,color_notes,location,above_ground,specific_location,activities,interactions_with_humans,other_notes_or_observations,squirrel_latitude,squirrel_longitude
330,LOWER MANHATTAN,C,Teardrop Park,18,C-18-01,Gray,Cinnamon,,Ground Plane,,,"Running, Eating (or pretending to eat)",Runs From,"Ran up tree and down tree, tail vibration ÛÓ ...",40.716335,-74.015612


In [126]:
s2020_nonull[s2020_nonull['activities'].str.endswith("Eating (or pretending to eat)")]

Unnamed: 0,area_name,area_id,park_name,park_id,squirrel_id,primary_fur_color,highlights_in_fur_color,color_notes,location,above_ground,specific_location,activities,interactions_with_humans,other_notes_or_observations,squirrel_latitude,squirrel_longitude
330,LOWER MANHATTAN,C,Teardrop Park,18,C-18-01,Gray,Cinnamon,,Ground Plane,,,"Running, Eating (or pretending to eat)",Runs From,"Ran up tree and down tree, tail vibration ÛÓ ...",40.716335,-74.015612


# 2018 Dataset - Squirrels

In [27]:
# Create dataframe from CSV
s2018_df = pd.read_csv(squirrel_2018)

# Display DataFrame and its shape
print(f"2018 Dataset: {s2018_df.shape}")
s2018_df.head()

2018 Dataset: (3023, 31)


Unnamed: 0,X,Y,Unique Squirrel ID,Hectare,Shift,Date,Hectare Squirrel Number,Age,Primary Fur Color,Highlight Fur Color,...,Kuks,Quaas,Moans,Tail flags,Tail twitches,Approaches,Indifferent,Runs from,Other Interactions,Lat/Long
0,-73.956134,40.794082,37F-PM-1014-03,37F,PM,10142018,3,,,,...,False,False,False,False,False,False,False,False,,POINT (-73.9561344937861 40.7940823884086)
1,-73.968857,40.783783,21B-AM-1019-04,21B,AM,10192018,4,,,,...,False,False,False,False,False,False,False,False,,POINT (-73.9688574691102 40.7837825208444)
2,-73.974281,40.775534,11B-PM-1014-08,11B,PM,10142018,8,,Gray,,...,False,False,False,False,False,False,False,False,,POINT (-73.97428114848522 40.775533619083)
3,-73.959641,40.790313,32E-PM-1017-14,32E,PM,10172018,14,Adult,Gray,,...,False,False,False,False,False,False,False,True,,POINT (-73.9596413903948 40.7903128889029)
4,-73.970268,40.776213,13E-AM-1017-05,13E,AM,10172018,5,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894)


In [28]:
# Identify the non-null count and data types
s2018_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3023 entries, 0 to 3022
Data columns (total 31 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   X                                           3023 non-null   float64
 1   Y                                           3023 non-null   float64
 2   Unique Squirrel ID                          3023 non-null   object 
 3   Hectare                                     3023 non-null   object 
 4   Shift                                       3023 non-null   object 
 5   Date                                        3023 non-null   int64  
 6   Hectare Squirrel Number                     3023 non-null   int64  
 7   Age                                         2902 non-null   object 
 8   Primary Fur Color                           2968 non-null   object 
 9   Highlight Fur Color                         1937 non-null   object 
 10  Combination 

In [29]:
# 2018 Dataset

# Drop null values for minimum requirement columns
s2018_nonull = s2018_df.dropna(
    subset=["Primary Fur Color", "Highlight Fur Color", "Running", "Chasing", "Climbing", "Eating", "Foraging", "Approaches", "Indifferent", "Runs from", "X", "Y"],
    how="any")

# NOTE: If you add "Other Interactions", dataset is reduced to 170 rows
# Number of unique values for "Other Notes or Observations"
print(f'Unique "other interactions": {s2018_nonull["Other Interactions"].nunique()}') # 152 unique interactions

# Display DataFrame and its shape
print(f"{s2018_nonull.shape}")
s2018_nonull.head()

Unique "other interactions": 152
(1937, 31)


Unnamed: 0,X,Y,Unique Squirrel ID,Hectare,Shift,Date,Hectare Squirrel Number,Age,Primary Fur Color,Highlight Fur Color,...,Kuks,Quaas,Moans,Tail flags,Tail twitches,Approaches,Indifferent,Runs from,Other Interactions,Lat/Long
4,-73.970268,40.776213,13E-AM-1017-05,13E,AM,10172018,5,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894)
5,-73.968361,40.772591,11H-AM-1010-03,11H,AM,10102018,3,Adult,Cinnamon,White,...,False,False,False,False,True,False,True,False,,POINT (-73.9683613516225 40.7725908847499)
9,-73.97225,40.774288,11D-AM-1010-03,11D,AM,10102018,3,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,True,False,,POINT (-73.9722500196844 40.7742879599026)
10,-73.969506,40.782351,20B-PM-1013-05,20B,PM,10132018,5,Adult,Gray,White,...,False,False,False,False,False,False,True,False,,POINT (-73.9695063535333 40.7823507678183)
12,-73.953217,40.791967,36I-PM-1007-01,36I,PM,10072018,1,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,True,False,,POINT (-73.9532170504865 40.7919669739962)


In [32]:
# Rename the columns
cols_df = pd.DataFrame(s2018_nonull.columns, columns=["name"])

# Define the regex pattern
pattern = '\((.*?)\)' # \( and \) to escape brackets, *.? matches any character (non-newline) zero or more times

new_columns = []
for row in cols_df["name"]:
    # Use underscore and cast to lowercase
    new_columns.append(row.replace(" ", "_").lower())

# Update the columns
s2018_nonull.columns = new_columns

s2018_nonull.head()

Unnamed: 0,x,y,unique_squirrel_id,hectare,shift,date,hectare_squirrel_number,age,primary_fur_color,highlight_fur_color,...,kuks,quaas,moans,tail_flags,tail_twitches,approaches,indifferent,runs_from,other_interactions,lat/long
4,-73.970268,40.776213,13E-AM-1017-05,13E,AM,10172018,5,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894)
5,-73.968361,40.772591,11H-AM-1010-03,11H,AM,10102018,3,Adult,Cinnamon,White,...,False,False,False,False,True,False,True,False,,POINT (-73.9683613516225 40.7725908847499)
9,-73.97225,40.774288,11D-AM-1010-03,11D,AM,10102018,3,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,True,False,,POINT (-73.9722500196844 40.7742879599026)
10,-73.969506,40.782351,20B-PM-1013-05,20B,PM,10132018,5,Adult,Gray,White,...,False,False,False,False,False,False,True,False,,POINT (-73.9695063535333 40.7823507678183)
12,-73.953217,40.791967,36I-PM-1007-01,36I,PM,10072018,1,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,True,False,,POINT (-73.9532170504865 40.7919669739962)


In [35]:
## Delete unwanted columns but maintain original copy
s2018_drop_columns = s2018_nonull.drop(columns=['hectare', 'shift', 'hectare_squirrel_number', 'age', 'other_interactions'], inplace=False)

# Rename columns to match 2020 dataset
s2018_rename_columns = s2018_drop_columns.rename(columns={"x": "squirrel_longitude", "y": "squirrel_latitude", "highlight_fur_colour": "highlights_in_fur_color", "unique_squirrel_id": "squirrel_id"})

s2018_rename_columns.head()

Unnamed: 0,squirrel_longitude,squirrel_latitude,squirrel_id,date,primary_fur_color,highlight_fur_color,combination_of_primary_and_highlight_color,color_notes,location,above_ground_sighter_measurement,...,other_activities,kuks,quaas,moans,tail_flags,tail_twitches,approaches,indifferent,runs_from,lat/long
4,-73.970268,40.776213,13E-AM-1017-05,10172018,Gray,Cinnamon,Gray+Cinnamon,,Above Ground,,...,,False,False,False,False,False,False,False,False,POINT (-73.9702676472613 40.7762126854894)
5,-73.968361,40.772591,11H-AM-1010-03,10102018,Cinnamon,White,Cinnamon+White,,,,...,,False,False,False,False,True,False,True,False,POINT (-73.9683613516225 40.7725908847499)
9,-73.97225,40.774288,11D-AM-1010-03,10102018,Gray,Cinnamon,Gray+Cinnamon,,Above Ground,30,...,grooming,False,False,False,False,False,False,True,False,POINT (-73.9722500196844 40.7742879599026)
10,-73.969506,40.782351,20B-PM-1013-05,10132018,Gray,White,Gray+White,,Ground Plane,FALSE,...,,False,False,False,False,False,False,True,False,POINT (-73.9695063535333 40.7823507678183)
12,-73.953217,40.791967,36I-PM-1007-01,10072018,Gray,Cinnamon,Gray+Cinnamon,,Ground Plane,FALSE,...,,False,False,False,False,False,False,True,False,POINT (-73.9532170504865 40.7919669739962)


In [34]:
# Check the "other activities" column
other_activities = pd.DataFrame(s2018_nonull["other_activities"].value_counts())
other_activities.head(50)

Unnamed: 0,other_activities
digging,14
sitting,9
playing,8
burying,6
nut in mouth,4
grooming,3
cleaning,3
walking,3
hopping,3
watching,3
