In [113]:
# Import dependencies
import pandas as pd
from pathlib import Path

import re

In [24]:
squirrel_2020 = Path("Resources/squirrel_data_2020.csv")
squirrel_2018 = Path("Resources/squirrel_data_2018.csv")

# Additional dataset from 2020
parks_2020 = Path("Resources/park_data_2020.csv")

## 2020 Dataset

In [143]:
# Create dataframe from CSV
s2020_df = pd.read_csv(squirrel_2020, encoding="unicode_escape")

# Display DataFrame and its shape
print(f"2020 Squirrel Dataset: {s2020_df.shape}")
s2020_df.head()

2020 Squirrel Dataset: (433, 16)


Unnamed: 0,Area Name,Area ID,Park Name,Park ID,Squirrel ID,Primary Fur Color,Highlights in Fur Color,Color Notes,Location,Above Ground (Height in Feet),Specific Location,Activities,Interactions with Humans,Other Notes or Observations,Squirrel Latitude (DD.DDDDDD),Squirrel Longitude (-DD.DDDDDD)
0,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-01,Gray,White,,Ground Plane,,,Foraging,Indifferent,,40.85941,-73.933936
1,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-02,Gray,White,,Ground Plane,,,Foraging,Indifferent,Looks skinny,40.859436,-73.933937
2,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-03,Gray,White,,Ground Plane,,,"Eating, Digging something",Indifferent,,40.859416,-73.933894
3,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-04,Gray,White,,Ground Plane,,,Running,Indifferent,,40.859418,-73.933895
4,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,,Ground Plane,,,"Running, Eating",Indifferent,She left food,40.859493,-73.93359


In [144]:
# Print columns
s2020_df.columns

Index(['Area Name', 'Area ID', 'Park Name', 'Park ID', 'Squirrel ID',
       'Primary Fur Color', 'Highlights in Fur Color', 'Color Notes',
       'Location', 'Above Ground (Height in Feet)', 'Specific Location',
       'Activities', 'Interactions with Humans', 'Other Notes or Observations',
       'Squirrel Latitude (DD.DDDDDD)', 'Squirrel Longitude (-DD.DDDDDD)'],
      dtype='object')

In [145]:
# Create dataframe from CSV
parks_df = pd.read_csv(parks_2020, encoding="unicode_escape")

# Display DataFrame and its shape
print(f"2020 Park Dataset: {parks_df.shape}")
parks_df.head()

2020 Park Dataset: (25, 15)


Unnamed: 0,Area Name,Area ID,Park Name,Park ID,Date,Start Time,End Time,"Total Time (in minutes, if available)",Park Conditions,Other Animal Sightings,Litter,Temperature & Weather,Number of Squirrels,Squirrel Sighter(s),Number of Sighters
0,UPPER MANHATTAN,A,Fort Tryon Park,1.0,3/1/20,3:14:00 PM,4:05:00 PM,51,Busy,"Humans, Dogs, Pigeons, Cardinals",Some,"43 degrees, sunny",12,"01, 02, 03, 04",4
1,UPPER MANHATTAN,A,J. Hood Wright Park,2.0,3/1/20,3:30:00 PM,4:00:00 PM,30,Calm,"Humans, Hawks, Dogs, Pigeons, Rat","Some, in trees","cold, clear",24,"05, 06",2
2,UPPER MANHATTAN,A,Highbridge Park,3.0,3/1/20,3:21:00 PM,4:15:00 PM,54,"Calm, pick-up baseball game","Humans, Dogs (3, all on leashes), Downy Woodpe...","Some, especially caught in wooded area in East...",43 degrees,16,"07, 08, 09",3
3,UPPER MANHATTAN,A,St. Nicholas Park,4.0,3/1/20,3:15:00 PM,3:45:00 PM,30,Calm,"Humans, Dogs","Some, backside of park","43 degrees, clear",15,"10, 11, 12",3
4,UPPER MANHATTAN,A,Riverside Park (section near Grant Memorial),5.0,3/1/20,3:15:00 PM,3:45:00 PM,30,Calm,"Humans, Dogs",,,28,"13, 14, 15",3


In [146]:
# Print columns
parks_df.columns

Index(['Area Name', 'Area ID', 'Park Name', 'Park ID', 'Date', 'Start Time',
       'End Time', 'Total Time (in minutes, if available)', 'Park Conditions',
       'Other Animal Sightings', 'Litter', 'Temperature & Weather',
       'Number of Squirrels', 'Squirrel Sighter(s)', 'Number of Sighters'],
      dtype='object')

In [147]:
# Identify the non-null count and data types
s2020_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433 entries, 0 to 432
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Area Name                        433 non-null    object 
 1   Area ID                          433 non-null    object 
 2   Park Name                        433 non-null    object 
 3   Park ID                          433 non-null    int64  
 4   Squirrel ID                      433 non-null    object 
 5   Primary Fur Color                432 non-null    object 
 6   Highlights in Fur Color          339 non-null    object 
 7   Color Notes                      10 non-null     object 
 8   Location                         399 non-null    object 
 9   Above Ground (Height in Feet)    112 non-null    object 
 10  Specific Location                89 non-null     object 
 11  Activities                       378 non-null    object 
 12  Interactions with Huma

### Minimum requirements:
- lat and long
- park ID (link to park CSV)
- squirrel ID (unique?)
- primary colour
- DROP NULL for: highlights in fur color

NOTE: Data collected 01 March 2020 (3:00PM to 4:38PM)

In [148]:
# Confirm "Squirrel ID" is unique, for use as a primary key later
s2020_df['Squirrel ID'].nunique()

433

In [149]:
# Drop null values for minimum requirement columns
s2020_nonull = s2020_df.dropna(
    subset=["Highlights in Fur Color", "Activities", "Interactions with Humans", "Squirrel Latitude (DD.DDDDDD)", "Squirrel Longitude (-DD.DDDDDD)"],
    how="any")

# Number of unique values for "Other Notes or Observations"
print(f'Unique "other interactions": {s2020_nonull["Other Notes or Observations"].nunique()}') # 76 unique interactions

# Display DataFrame and its shape
print(f"{s2020_nonull.shape}")
s2020_nonull.head()

Unique "other interactions": 76
(192, 16)


Unnamed: 0,Area Name,Area ID,Park Name,Park ID,Squirrel ID,Primary Fur Color,Highlights in Fur Color,Color Notes,Location,Above Ground (Height in Feet),Specific Location,Activities,Interactions with Humans,Other Notes or Observations,Squirrel Latitude (DD.DDDDDD),Squirrel Longitude (-DD.DDDDDD)
0,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-01,Gray,White,,Ground Plane,,,Foraging,Indifferent,,40.85941,-73.933936
1,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-02,Gray,White,,Ground Plane,,,Foraging,Indifferent,Looks skinny,40.859436,-73.933937
2,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-03,Gray,White,,Ground Plane,,,"Eating, Digging something",Indifferent,,40.859416,-73.933894
3,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-04,Gray,White,,Ground Plane,,,Running,Indifferent,,40.859418,-73.933895
4,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,,Ground Plane,,,"Running, Eating",Indifferent,She left food,40.859493,-73.93359


## 2018 Dataset

In [67]:
# Create dataframe from CSV
s2018_df = pd.read_csv(squirrel_2018)

# Display DataFrame and its shape
print(f"2018 Dataset: {s2018_df.shape}")
s2018_df.head()

2018 Dataset: (3023, 31)


Unnamed: 0,X,Y,Unique Squirrel ID,Hectare,Shift,Date,Hectare Squirrel Number,Age,Primary Fur Color,Highlight Fur Color,...,Kuks,Quaas,Moans,Tail flags,Tail twitches,Approaches,Indifferent,Runs from,Other Interactions,Lat/Long
0,-73.956134,40.794082,37F-PM-1014-03,37F,PM,10142018,3,,,,...,False,False,False,False,False,False,False,False,,POINT (-73.9561344937861 40.7940823884086)
1,-73.968857,40.783783,21B-AM-1019-04,21B,AM,10192018,4,,,,...,False,False,False,False,False,False,False,False,,POINT (-73.9688574691102 40.7837825208444)
2,-73.974281,40.775534,11B-PM-1014-08,11B,PM,10142018,8,,Gray,,...,False,False,False,False,False,False,False,False,,POINT (-73.97428114848522 40.775533619083)
3,-73.959641,40.790313,32E-PM-1017-14,32E,PM,10172018,14,Adult,Gray,,...,False,False,False,False,False,False,False,True,,POINT (-73.9596413903948 40.7903128889029)
4,-73.970268,40.776213,13E-AM-1017-05,13E,AM,10172018,5,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894)


In [68]:
# Print columns
s2018_df.columns

Index(['X', 'Y', 'Unique Squirrel ID', 'Hectare', 'Shift', 'Date',
       'Hectare Squirrel Number', 'Age', 'Primary Fur Color',
       'Highlight Fur Color', 'Combination of Primary and Highlight Color',
       'Color notes', 'Location', 'Above Ground Sighter Measurement',
       'Specific Location', 'Running', 'Chasing', 'Climbing', 'Eating',
       'Foraging', 'Other Activities', 'Kuks', 'Quaas', 'Moans', 'Tail flags',
       'Tail twitches', 'Approaches', 'Indifferent', 'Runs from',
       'Other Interactions', 'Lat/Long'],
      dtype='object')

In [69]:
s2018_nonull = s2018_df.dropna(
    subset=["Primary Fur Color", "Highlight Fur Color", "Running", "Chasing", "Climbing", "Eating", "Foraging", "Approaches", "Indifferent", "Runs from", "X", "Y"],
    how="any") # NOTE: If you add "Other Interactions", dataset is reduced to 170 rows

# Number of unique values for "Other Notes or Observations"
print(f'Unique "other interactions": {s2018_nonull["Other Interactions"].nunique()}') # 152 unique interactions

# Display DataFrame and its shape
print(f"{s2018_nonull.shape}")
s2018_nonull.head()

Unique "other interactions": 152
(1937, 31)


Unnamed: 0,X,Y,Unique Squirrel ID,Hectare,Shift,Date,Hectare Squirrel Number,Age,Primary Fur Color,Highlight Fur Color,...,Kuks,Quaas,Moans,Tail flags,Tail twitches,Approaches,Indifferent,Runs from,Other Interactions,Lat/Long
4,-73.970268,40.776213,13E-AM-1017-05,13E,AM,10172018,5,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894)
5,-73.968361,40.772591,11H-AM-1010-03,11H,AM,10102018,3,Adult,Cinnamon,White,...,False,False,False,False,True,False,True,False,,POINT (-73.9683613516225 40.7725908847499)
9,-73.97225,40.774288,11D-AM-1010-03,11D,AM,10102018,3,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,True,False,,POINT (-73.9722500196844 40.7742879599026)
10,-73.969506,40.782351,20B-PM-1013-05,20B,PM,10132018,5,Adult,Gray,White,...,False,False,False,False,False,False,True,False,,POINT (-73.9695063535333 40.7823507678183)
12,-73.953217,40.791967,36I-PM-1007-01,36I,PM,10072018,1,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,True,False,,POINT (-73.9532170504865 40.7919669739962)


## Observations

In [115]:
s2018_nonull["Above Ground Sighter Measurement"].unique()

array([nan, '30', 'FALSE', '10', '6', '24', '8', '25', '3', '4', '20',
       '7', '5', '15', '12', '35', '2', '1', '65', '13', '40', '18', '55',
       '9', '0', '50', '16', '33', '11', '28', '45', '17', '31', '14',
       '19'], dtype=object)

In [116]:
s2020_nonull["Above Ground (Height in Feet)"].unique()

array([nan, '10', '2', '15', '20', '< 1', '3', '6\x89ÛÒ18', '4',
       '20\x89ÛÒ40', '2\x89ÛÒ6', '6', '8', '9', '5', '10\x89ÛÒ12', '12',
       '14', '75'], dtype=object)

In [126]:
text = r"\x89ÛÒ"
decoded_text = bytes(text, "utf-8").decode("unicode_escape")
print(decoded_text)

ÃÃ


In [129]:
text = r"\x89ÛÒ"
decoded_text = bytes(text, "utf-8").decode("unicode_escape")

for character in decoded_text:
    print(f"{character}: {character.encode('utf-8').hex()}")

: c289
Ã: c383
: c29b
Ã: c383
: c292


# Data Cleaning

In [152]:
# Rename columns for 2020 dataset

# Convert columns to a DataFrame
cols_df = pd.DataFrame(s2020_nonull.columns, columns=["name"])

# Define the regex pattern
pattern = '\((.*?)\)' # \( and \) to escape brackets, *.? matches any character (non-newline) zero or more times

new_columns = []
for row in cols_df["name"]:
    stripped_row = re.findall(pattern, row)

    # Remove the brackets and content
    if len(stripped_row) > 0:
        remove_substring = f' ({stripped_row[0].strip("[]")})'
        row = row.strip(remove_substring)

    # Use underscore and cast to lowercase
    new_columns.append(row.replace(" ", "_").lower())

# Update the columns
s2020_nonull.columns = new_columns

s2020_nonull.head()

Unnamed: 0,area_name,area_id,park_name,park_id,squirrel_id,primary_fur_color,highlights_in_fur_color,color_notes,location,above_ground,specific_location,activities,interactions_with_humans,other_notes_or_observations,squirrel_latitude,squirrel_longitude
0,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-01,Gray,White,,Ground Plane,,,Foraging,Indifferent,,40.85941,-73.933936
1,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-02,Gray,White,,Ground Plane,,,Foraging,Indifferent,Looks skinny,40.859436,-73.933937
2,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-03,Gray,White,,Ground Plane,,,"Eating, Digging something",Indifferent,,40.859416,-73.933894
3,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-04,Gray,White,,Ground Plane,,,Running,Indifferent,,40.859418,-73.933895
4,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,,Ground Plane,,,"Running, Eating",Indifferent,She left food,40.859493,-73.93359
