In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import re
import numpy as np
from datetime import datetime

# Import Data

In [2]:
# Base datasets
squirrel_2020 = Path("resources/raw/squirrel_data_2020.csv")
squirrel_2018 = Path("resources/raw/squirrel_data_2018.csv")

# Additional dataset from 2020
parks_2020 = Path("resources/raw/park_data_2020.csv")

# 2020 Dataset - Squirrels

In [3]:
# Create dataframe from CSV
s2020_df = pd.read_csv(squirrel_2020, encoding="unicode_escape")

# Display DataFrame and its shape
print(f"2020 Squirrel Dataset: {s2020_df.shape}")
s2020_df.head()

2020 Squirrel Dataset: (433, 16)


Unnamed: 0,Area Name,Area ID,Park Name,Park ID,Squirrel ID,Primary Fur Color,Highlights in Fur Color,Color Notes,Location,Above Ground (Height in Feet),Specific Location,Activities,Interactions with Humans,Other Notes or Observations,Squirrel Latitude (DD.DDDDDD),Squirrel Longitude (-DD.DDDDDD)
0,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-01,Gray,White,,Ground Plane,,,Foraging,Indifferent,,40.85941,-73.933936
1,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-02,Gray,White,,Ground Plane,,,Foraging,Indifferent,Looks skinny,40.859436,-73.933937
2,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-03,Gray,White,,Ground Plane,,,"Eating, Digging something",Indifferent,,40.859416,-73.933894
3,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-04,Gray,White,,Ground Plane,,,Running,Indifferent,,40.859418,-73.933895
4,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,,Ground Plane,,,"Running, Eating",Indifferent,She left food,40.859493,-73.93359


## Drop rows and columns

In [4]:
# Drop null values for minimum requirement columns
dropna_columns = [
    "Primary Fur Color",
    "Highlights in Fur Color",
    "Activities",
    "Interactions with Humans",
    "Squirrel Latitude (DD.DDDDDD)",
    "Squirrel Longitude (-DD.DDDDDD)"
]

s2020_nonull = s2020_df.dropna(subset=dropna_columns, how="any")

# Display DataFrame and its shape
print(f"{s2020_nonull.shape}")
s2020_nonull.head()

(192, 16)


Unnamed: 0,Area Name,Area ID,Park Name,Park ID,Squirrel ID,Primary Fur Color,Highlights in Fur Color,Color Notes,Location,Above Ground (Height in Feet),Specific Location,Activities,Interactions with Humans,Other Notes or Observations,Squirrel Latitude (DD.DDDDDD),Squirrel Longitude (-DD.DDDDDD)
0,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-01,Gray,White,,Ground Plane,,,Foraging,Indifferent,,40.85941,-73.933936
1,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-02,Gray,White,,Ground Plane,,,Foraging,Indifferent,Looks skinny,40.859436,-73.933937
2,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-03,Gray,White,,Ground Plane,,,"Eating, Digging something",Indifferent,,40.859416,-73.933894
3,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-04,Gray,White,,Ground Plane,,,Running,Indifferent,,40.859418,-73.933895
4,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,,Ground Plane,,,"Running, Eating",Indifferent,She left food,40.859493,-73.93359


In [5]:
# Drop columns
drop_columns = ['Area Name', 'Area ID', "Color Notes", "Location", "Above Ground (Height in Feet)", "Specific Location"]

reduced_2020 = s2020_nonull.drop(columns=drop_columns)
reduced_2020.head()

Unnamed: 0,Park Name,Park ID,Squirrel ID,Primary Fur Color,Highlights in Fur Color,Activities,Interactions with Humans,Other Notes or Observations,Squirrel Latitude (DD.DDDDDD),Squirrel Longitude (-DD.DDDDDD)
0,Fort Tryon Park,1,A-01-01,Gray,White,Foraging,Indifferent,,40.85941,-73.933936
1,Fort Tryon Park,1,A-01-02,Gray,White,Foraging,Indifferent,Looks skinny,40.859436,-73.933937
2,Fort Tryon Park,1,A-01-03,Gray,White,"Eating, Digging something",Indifferent,,40.859416,-73.933894
3,Fort Tryon Park,1,A-01-04,Gray,White,Running,Indifferent,,40.859418,-73.933895
4,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,"Running, Eating",Indifferent,She left food,40.859493,-73.93359


## Rename columns

In [6]:
# Rename the columns
cols_df = pd.DataFrame(reduced_2020.columns, columns=["name"])

# Define the regex pattern
pattern = '\((.*?)\)' # \( and \) to escape brackets, *.? matches any character (non-newline) zero or more times

new_columns = []
for row in cols_df["name"]:
    stripped_row = re.findall(pattern, row)

    # Remove the brackets and content
    if len(stripped_row) > 0:
        remove_substring = f' ({stripped_row[0].strip("[]")})'
        row = row.strip(remove_substring)

    # Use underscore and cast to lowercase
    new_columns.append(row.replace(" ", "_").lower())

# Update the columns
reduced_2020.columns = new_columns

reduced_2020.head()

Unnamed: 0,park_name,park_id,squirrel_id,primary_fur_color,highlights_in_fur_color,activities,interactions_with_humans,other_notes_or_observations,squirrel_latitude,squirrel_longitude
0,Fort Tryon Park,1,A-01-01,Gray,White,Foraging,Indifferent,,40.85941,-73.933936
1,Fort Tryon Park,1,A-01-02,Gray,White,Foraging,Indifferent,Looks skinny,40.859436,-73.933937
2,Fort Tryon Park,1,A-01-03,Gray,White,"Eating, Digging something",Indifferent,,40.859416,-73.933894
3,Fort Tryon Park,1,A-01-04,Gray,White,Running,Indifferent,,40.859418,-73.933895
4,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,"Running, Eating",Indifferent,She left food,40.859493,-73.93359


In [7]:
# Display dataframe info
reduced_2020.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 0 to 432
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   park_name                    192 non-null    object 
 1   park_id                      192 non-null    int64  
 2   squirrel_id                  192 non-null    object 
 3   primary_fur_color            192 non-null    object 
 4   highlights_in_fur_color      192 non-null    object 
 5   activities                   192 non-null    object 
 6   interactions_with_humans     192 non-null    object 
 7   other_notes_or_observations  96 non-null     object 
 8   squirrel_latitude            192 non-null    float64
 9   squirrel_longitude           192 non-null    float64
dtypes: float64(2), int64(1), object(7)
memory usage: 16.5+ KB


## Clean 'activities' column

In [8]:
# Create a DataFrame of the 'activities' column for cleaning
activities_df = reduced_2020[['squirrel_id', 'activities']].copy()
activities_df.head()

Unnamed: 0,squirrel_id,activities
0,A-01-01,Foraging
1,A-01-02,Foraging
2,A-01-03,"Eating, Digging something"
3,A-01-04,Running
4,A-01-05,"Running, Eating"


In [9]:
row_list = []
for row in activities_df['activities']:
    row_list.append(row.split(", "))

activities_df['activity_list'] = row_list
activities_df.head()

Unnamed: 0,squirrel_id,activities,activity_list
0,A-01-01,Foraging,[Foraging]
1,A-01-02,Foraging,[Foraging]
2,A-01-03,"Eating, Digging something","[Eating, Digging something]"
3,A-01-04,Running,[Running]
4,A-01-05,"Running, Eating","[Running, Eating]"


In [10]:
idx_list = []
for idx, row in enumerate(activities_df['activity_list']):
    for word in row:
        if (word.startswith("Eating ")):
            print(idx, word)
            idx_list.append(idx)

129 Eating (or pretending to eat)
139 Eating (nuts)
178 Eating (bread crumbs)


In [11]:
activities_df.iloc[idx_list]

Unnamed: 0,squirrel_id,activities,activity_list
330,C-18-01,"Running, Eating (or pretending to eat)","[Running, Eating (or pretending to eat)]"
344,C-19-14,"Eating (nuts), Foraging","[Eating (nuts), Foraging]"
415,D-22-27,"Eating (bread crumbs), Foraging","[Eating (bread crumbs), Foraging]"


In [12]:
pattern = 'eating \((.*?)\)'
sitting = ["hanging out", "hanging", "chilling", "sitting in short tree", "sticking out of a tree", "very carefully watching a cat", "watching #2", "posing"]
shouting = ["vocalization at us", "defending tree"]
other_activities = ["self-cleaning", "sleeping", "sleeping (dead?)", "battery"]

for row in activities_df['activity_list']:
    for idx, word in enumerate(row):
        row[idx] = word.lower()
        word = word.lower()

        # Check for "eating (...)"
        bracket = re.findall(pattern, word)
        if (len(bracket) > 0):
            row[idx] = "eating"

        # Check for "digging "
        if (word.startswith("digging ") or word == "burying"):
            row[idx] = "digging"

        # Check for words in the shouting list
        if (word in shouting):
            row[idx] = "shouting"

        # Convert to "foraging"
        if (word == "nesting/gathering leaves"):
            row[idx] = "foraging"

        # Check for words in the sitting list
        if (word in sitting):
            row[idx] = "sitting"

        # Convert to "climbing"
        if (word == "jumping"):
            row[idx] = "climbing"

        # Account for other_activities
        if (word in other_activities):
            row[idx] = "other_activities"

In [13]:
# Check the 'activities' distribution
flat_list = [value for row in activities_df['activity_list'] for value in row]
flat_df = pd.DataFrame(flat_list)
flat_df.value_counts()

foraging            72
climbing            52
eating              48
running             37
chasing             20
shouting            17
sitting             14
digging              5
other_activities     4
dtype: int64

In [14]:
### Create the boolean activity columns

# Create a list of the reduced activity names
column_names = list(set([word.lower() for word in flat_list]))

# Create new columns for each activity
for col_name in column_names:
    activities_df[col_name] = pd.Series(dtype=bool)

# Loop through the 'activity_list' and populate the boolean columns
for idx, row in activities_df.iterrows():
    for word in row['activity_list']:
        for activity in column_names:
            if (row[activity] == True):
                continue
            if (word == activity):
                row[activity] = True
            else:
                row[activity] = False

# Display the DataFrame
activities_df.head()

Unnamed: 0,squirrel_id,activities,activity_list,foraging,eating,digging,climbing,other_activities,sitting,chasing,shouting,running
0,A-01-01,Foraging,[foraging],True,False,False,False,False,False,False,False,False
1,A-01-02,Foraging,[foraging],True,False,False,False,False,False,False,False,False
2,A-01-03,"Eating, Digging something","[eating, digging]",False,True,True,False,False,False,False,False,False
3,A-01-04,Running,[running],False,False,False,False,False,False,False,False,True
4,A-01-05,"Running, Eating","[running, eating]",False,True,False,False,False,False,False,False,True


In [15]:
# Drop the 'activities' and 'activity_list' columns
activities_df = activities_df.drop(columns=['activities', 'activity_list'])
activities_df.head()

Unnamed: 0,squirrel_id,foraging,eating,digging,climbing,other_activities,sitting,chasing,shouting,running
0,A-01-01,True,False,False,False,False,False,False,False,False
1,A-01-02,True,False,False,False,False,False,False,False,False
2,A-01-03,False,True,True,False,False,False,False,False,False
3,A-01-04,False,False,False,False,False,False,False,False,True
4,A-01-05,False,True,False,False,False,False,False,False,True


In [16]:
# Merge with reduced_2020
s2020_updated = pd.merge(reduced_2020, activities_df, on="squirrel_id", how="inner")
s2020_updated.head()

Unnamed: 0,park_name,park_id,squirrel_id,primary_fur_color,highlights_in_fur_color,activities,interactions_with_humans,other_notes_or_observations,squirrel_latitude,squirrel_longitude,foraging,eating,digging,climbing,other_activities,sitting,chasing,shouting,running
0,Fort Tryon Park,1,A-01-01,Gray,White,Foraging,Indifferent,,40.85941,-73.933936,True,False,False,False,False,False,False,False,False
1,Fort Tryon Park,1,A-01-02,Gray,White,Foraging,Indifferent,Looks skinny,40.859436,-73.933937,True,False,False,False,False,False,False,False,False
2,Fort Tryon Park,1,A-01-03,Gray,White,"Eating, Digging something",Indifferent,,40.859416,-73.933894,False,True,True,False,False,False,False,False,False
3,Fort Tryon Park,1,A-01-04,Gray,White,Running,Indifferent,,40.859418,-73.933895,False,False,False,False,False,False,False,False,True
4,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,"Running, Eating",Indifferent,She left food,40.859493,-73.93359,False,True,False,False,False,False,False,False,True


In [17]:
# Drop the 'activities' column
s2020_updated = s2020_updated.drop(columns=['activities'])
s2020_updated.head()

Unnamed: 0,park_name,park_id,squirrel_id,primary_fur_color,highlights_in_fur_color,interactions_with_humans,other_notes_or_observations,squirrel_latitude,squirrel_longitude,foraging,eating,digging,climbing,other_activities,sitting,chasing,shouting,running
0,Fort Tryon Park,1,A-01-01,Gray,White,Indifferent,,40.85941,-73.933936,True,False,False,False,False,False,False,False,False
1,Fort Tryon Park,1,A-01-02,Gray,White,Indifferent,Looks skinny,40.859436,-73.933937,True,False,False,False,False,False,False,False,False
2,Fort Tryon Park,1,A-01-03,Gray,White,Indifferent,,40.859416,-73.933894,False,True,True,False,False,False,False,False,False
3,Fort Tryon Park,1,A-01-04,Gray,White,Indifferent,,40.859418,-73.933895,False,False,False,False,False,False,False,False,True
4,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,Indifferent,She left food,40.859493,-73.93359,False,True,False,False,False,False,False,False,True


## Clean 'interactions_with_humans' column

In [18]:
# Create a DataFrame of the 'activities' column for cleaning
interactions_df = s2020_updated[['squirrel_id', 'interactions_with_humans']].copy()
interactions_df.head()

Unnamed: 0,squirrel_id,interactions_with_humans
0,A-01-01,Indifferent
1,A-01-02,Indifferent
2,A-01-03,Indifferent
3,A-01-04,Indifferent
4,A-01-05,Indifferent


In [19]:
# Identify all the different interactions
print(interactions_df['interactions_with_humans'].value_counts())

# NOTE: 2018 equivalents = 'Approaches' + 'Indifferent' + 'Runs from' + 'Other Interactions'

Indifferent                                      127
Runs From                                         28
Approaches                                        20
Indifferent, Preoccupied by HAAWK                  6
Watches us from tree                               1
Runs From, watchful                                1
Watching us from tree - very interested in us      1
Approaches, watching us                            1
Runs From, watches us in short tree                1
Friendly                                           1
Indifferent, Runs From                             1
Approaches, Runs From                              1
Watching                                           1
Staring                                            1
Skittish to humans                                 1
Name: interactions_with_humans, dtype: int64


In [20]:
# Combine 'child' activities that match 'parent' activities
interact_options = ["approaches", "indifferent", "runs from"]

for idx, row in enumerate(interactions_df['interactions_with_humans']):
    # Convert to lowercase
    row = row.lower()
    
    # Account for rows with multiple interaction types
    multi_option = row.split(", ")
    if len(multi_option) > 1:
        count = 0
        for word in multi_option:
            # Increment if in interact_options
            if (word in interact_options):
                count += 1
            else:
                # Capture: "Indifferent, (not in interact_options)"
                if (row.startswith("indifferent, ")):
                    interactions_df.loc[idx, 'interactions_with_humans'] = ["indifferent"]

                # Captures the rows with "Watch%"
                else:
                    interactions_df.loc[idx, 'interactions_with_humans'] = [(multi_option[0]), "watching"]

            # All options are in interact_options
            if (count == len(multi_option)):
                interactions_df.loc[idx, 'interactions_with_humans'] = multi_option # [word.replace(" ", "_") for word in multi_option]

    # Account for rows with a single interaction
    else:
        # If applicable, filter child to parent
        if (row.startswith("watch") or row == "staring"):
            interactions_df.loc[idx, 'interactions_with_humans'] = ["watching"]
        if (row == "skittish to humans"):
            interactions_df.loc[idx, 'interactions_with_humans'] = ["runs from"]
        if (row == "friendly"):
            interactions_df.loc[idx, 'interactions_with_humans'] = ["approaches"]
        if (row in interact_options):
            # Convert the row to a list, for simplicity later
            interactions_df.loc[idx, 'interactions_with_humans'] = [row] #[row.replace(" ", "_")]

In [21]:
# Check the updated distribution
print(interactions_df['interactions_with_humans'].value_counts())

[indifferent]               133
[runs from]                  29
[approaches]                 21
[watching]                    4
[runs from, watching]         2
[approaches, watching]        1
[indifferent, runs from]      1
[approaches, runs from]       1
Name: interactions_with_humans, dtype: int64


In [22]:
### Create the boolean interaction columns
unique_interactions = list(set([value for row in interactions_df['interactions_with_humans'] for value in row]))

# Create new columns for each interaction
for col_name in unique_interactions:
    interactions_df[col_name] = pd.Series(dtype=bool)

# Loop through the 'activity_list' and populate the boolean columns
for idx, row in interactions_df.iterrows():
    for word in row['interactions_with_humans']:
        for interaction in unique_interactions:
            if (row[interaction] == True):
                continue
            elif (word == interaction):
                row[interaction] = True
            else:
                row[interaction] = False

# Display the DataFrame
interactions_df.head()

Unnamed: 0,squirrel_id,interactions_with_humans,indifferent,runs from,approaches,watching
0,A-01-01,[indifferent],True,False,False,False
1,A-01-02,[indifferent],True,False,False,False
2,A-01-03,[indifferent],True,False,False,False
3,A-01-04,[indifferent],True,False,False,False
4,A-01-05,[indifferent],True,False,False,False


In [23]:
# Drop the 'interactions_with_humans' column
interactions_df = interactions_df.drop(columns=['interactions_with_humans'])
interactions_df.head()

Unnamed: 0,squirrel_id,indifferent,runs from,approaches,watching
0,A-01-01,True,False,False,False
1,A-01-02,True,False,False,False
2,A-01-03,True,False,False,False
3,A-01-04,True,False,False,False
4,A-01-05,True,False,False,False


In [24]:
# Merge with s2020_updated
s2020_booleans = pd.merge(s2020_updated, interactions_df, on="squirrel_id", how="inner")
s2020_booleans.head()

Unnamed: 0,park_name,park_id,squirrel_id,primary_fur_color,highlights_in_fur_color,interactions_with_humans,other_notes_or_observations,squirrel_latitude,squirrel_longitude,foraging,...,climbing,other_activities,sitting,chasing,shouting,running,indifferent,runs from,approaches,watching
0,Fort Tryon Park,1,A-01-01,Gray,White,Indifferent,,40.85941,-73.933936,True,...,False,False,False,False,False,False,True,False,False,False
1,Fort Tryon Park,1,A-01-02,Gray,White,Indifferent,Looks skinny,40.859436,-73.933937,True,...,False,False,False,False,False,False,True,False,False,False
2,Fort Tryon Park,1,A-01-03,Gray,White,Indifferent,,40.859416,-73.933894,False,...,False,False,False,False,False,False,True,False,False,False
3,Fort Tryon Park,1,A-01-04,Gray,White,Indifferent,,40.859418,-73.933895,False,...,False,False,False,False,False,True,True,False,False,False
4,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,Indifferent,She left food,40.859493,-73.93359,False,...,False,False,False,False,False,True,True,False,False,False


In [25]:
# Drop the 'interactions_with_humans' column
s2020_booleans = s2020_booleans.drop(columns=['interactions_with_humans'])
s2020_booleans.head()

Unnamed: 0,park_name,park_id,squirrel_id,primary_fur_color,highlights_in_fur_color,other_notes_or_observations,squirrel_latitude,squirrel_longitude,foraging,eating,...,climbing,other_activities,sitting,chasing,shouting,running,indifferent,runs from,approaches,watching
0,Fort Tryon Park,1,A-01-01,Gray,White,,40.85941,-73.933936,True,False,...,False,False,False,False,False,False,True,False,False,False
1,Fort Tryon Park,1,A-01-02,Gray,White,Looks skinny,40.859436,-73.933937,True,False,...,False,False,False,False,False,False,True,False,False,False
2,Fort Tryon Park,1,A-01-03,Gray,White,,40.859416,-73.933894,False,True,...,False,False,False,False,False,False,True,False,False,False
3,Fort Tryon Park,1,A-01-04,Gray,White,,40.859418,-73.933895,False,False,...,False,False,False,False,False,True,True,False,False,False
4,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,She left food,40.859493,-73.93359,False,True,...,False,False,False,False,False,True,True,False,False,False


## Correct the Data Types

In [26]:
# Confirm boolean columns are of "bool" type
s2020_booleans.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 0 to 191
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   park_name                    192 non-null    object 
 1   park_id                      192 non-null    int64  
 2   squirrel_id                  192 non-null    object 
 3   primary_fur_color            192 non-null    object 
 4   highlights_in_fur_color      192 non-null    object 
 5   other_notes_or_observations  96 non-null     object 
 6   squirrel_latitude            192 non-null    float64
 7   squirrel_longitude           192 non-null    float64
 8   foraging                     192 non-null    object 
 9   eating                       192 non-null    object 
 10  digging                      192 non-null    object 
 11  climbing                     192 non-null    object 
 12  other_activities             192 non-null    object 
 13  sitting             

In [27]:
# Convert boolean columns to "bool" type
bool_columns = s2020_booleans.columns[8:]
s2020_booleans[bool_columns] = s2020_booleans[bool_columns].astype(bool)

In [28]:
s2020_booleans.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 0 to 191
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   park_name                    192 non-null    object 
 1   park_id                      192 non-null    int64  
 2   squirrel_id                  192 non-null    object 
 3   primary_fur_color            192 non-null    object 
 4   highlights_in_fur_color      192 non-null    object 
 5   other_notes_or_observations  96 non-null     object 
 6   squirrel_latitude            192 non-null    float64
 7   squirrel_longitude           192 non-null    float64
 8   foraging                     192 non-null    bool   
 9   eating                       192 non-null    bool   
 10  digging                      192 non-null    bool   
 11  climbing                     192 non-null    bool   
 12  other_activities             192 non-null    bool   
 13  sitting             

# 2018 Dataset - Squirrels

In [29]:
# Create dataframe from CSV
s2018_df = pd.read_csv(squirrel_2018)

# Display DataFrame and its shape
print(f"2018 Dataset: {s2018_df.shape}")
s2018_df.head()

2018 Dataset: (3023, 31)


Unnamed: 0,X,Y,Unique Squirrel ID,Hectare,Shift,Date,Hectare Squirrel Number,Age,Primary Fur Color,Highlight Fur Color,...,Kuks,Quaas,Moans,Tail flags,Tail twitches,Approaches,Indifferent,Runs from,Other Interactions,Lat/Long
0,-73.956134,40.794082,37F-PM-1014-03,37F,PM,10142018,3,,,,...,False,False,False,False,False,False,False,False,,POINT (-73.9561344937861 40.7940823884086)
1,-73.968857,40.783783,21B-AM-1019-04,21B,AM,10192018,4,,,,...,False,False,False,False,False,False,False,False,,POINT (-73.9688574691102 40.7837825208444)
2,-73.974281,40.775534,11B-PM-1014-08,11B,PM,10142018,8,,Gray,,...,False,False,False,False,False,False,False,False,,POINT (-73.97428114848522 40.775533619083)
3,-73.959641,40.790313,32E-PM-1017-14,32E,PM,10172018,14,Adult,Gray,,...,False,False,False,False,False,False,False,True,,POINT (-73.9596413903948 40.7903128889029)
4,-73.970268,40.776213,13E-AM-1017-05,13E,AM,10172018,5,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894)


In [30]:
# Identify the non-null count and data types
s2018_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3023 entries, 0 to 3022
Data columns (total 31 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   X                                           3023 non-null   float64
 1   Y                                           3023 non-null   float64
 2   Unique Squirrel ID                          3023 non-null   object 
 3   Hectare                                     3023 non-null   object 
 4   Shift                                       3023 non-null   object 
 5   Date                                        3023 non-null   int64  
 6   Hectare Squirrel Number                     3023 non-null   int64  
 7   Age                                         2902 non-null   object 
 8   Primary Fur Color                           2968 non-null   object 
 9   Highlight Fur Color                         1937 non-null   object 
 10  Combination 

## Drop rows and columns

In [31]:
# Drop null values for minimum requirement columns
s2018_nonull = s2018_df.dropna(
    subset=[
        "Primary Fur Color",
        "Highlight Fur Color",
        "Running",
        "Chasing",
        "Climbing",
        "Eating",
        "Foraging",
        "Approaches",
        "Indifferent",
        "Runs from",
        "X",
        "Y"],
    how="any").reset_index(drop=True)

# Display DataFrame and its shape
print(f"{s2018_nonull.shape}")
s2018_nonull.head()

(1937, 31)


Unnamed: 0,X,Y,Unique Squirrel ID,Hectare,Shift,Date,Hectare Squirrel Number,Age,Primary Fur Color,Highlight Fur Color,...,Kuks,Quaas,Moans,Tail flags,Tail twitches,Approaches,Indifferent,Runs from,Other Interactions,Lat/Long
0,-73.970268,40.776213,13E-AM-1017-05,13E,AM,10172018,5,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894)
1,-73.968361,40.772591,11H-AM-1010-03,11H,AM,10102018,3,Adult,Cinnamon,White,...,False,False,False,False,True,False,True,False,,POINT (-73.9683613516225 40.7725908847499)
2,-73.97225,40.774288,11D-AM-1010-03,11D,AM,10102018,3,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,True,False,,POINT (-73.9722500196844 40.7742879599026)
3,-73.969506,40.782351,20B-PM-1013-05,20B,PM,10132018,5,Adult,Gray,White,...,False,False,False,False,False,False,True,False,,POINT (-73.9695063535333 40.7823507678183)
4,-73.953217,40.791967,36I-PM-1007-01,36I,PM,10072018,1,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,True,False,,POINT (-73.9532170504865 40.7919669739962)


## Split 'Date' to Year, Month, Day

In [32]:
# Check the distribution of unique values
s2018_nonull['Date'].value_counts()

10132018    306
10142018    258
10072018    249
10062018    211
10102018    203
10122018    175
10082018    143
10172018    134
10182018    115
10192018     97
10202018     46
Name: Date, dtype: int64

In [33]:
# Create a copy of the date column
dates_df = s2018_nonull[['Unique Squirrel ID', 'Date']].copy()
dates_df.head()

Unnamed: 0,Unique Squirrel ID,Date
0,13E-AM-1017-05,10172018
1,11H-AM-1010-03,10102018
2,11D-AM-1010-03,10102018
3,20B-PM-1013-05,10132018
4,36I-PM-1007-01,10072018


In [34]:
# Create the year - month - day columns
date_columns = ['Year', 'Month', 'Day']
for col in date_columns:
    dates_df[col] = pd.Series(dtype=int)

dates_df.head()

Unnamed: 0,Unique Squirrel ID,Date,Year,Month,Day
0,13E-AM-1017-05,10172018,,,
1,11H-AM-1010-03,10102018,,,
2,11D-AM-1010-03,10102018,,,
3,20B-PM-1013-05,10132018,,,
4,36I-PM-1007-01,10072018,,,


In [35]:
# Parse the 'Date' column
for idx, date in enumerate(dates_df['Date']):
    updated = datetime.strptime(str(date), "%m%d%Y")

    dates_df.loc[idx, 'Year'] = updated.year
    dates_df.loc[idx, 'Month'] = updated.month
    dates_df.loc[idx, 'Day'] = updated.day

dates_df.head()

Unnamed: 0,Unique Squirrel ID,Date,Year,Month,Day
0,13E-AM-1017-05,10172018,2018.0,10.0,17.0
1,11H-AM-1010-03,10102018,2018.0,10.0,10.0
2,11D-AM-1010-03,10102018,2018.0,10.0,10.0
3,20B-PM-1013-05,10132018,2018.0,10.0,13.0
4,36I-PM-1007-01,10072018,2018.0,10.0,7.0


In [36]:
# Convert the columns to an integer
dates_df = dates_df.astype({
    'Year': int,
    'Month': int,
    'Day': int
})

dates_df.head()

Unnamed: 0,Unique Squirrel ID,Date,Year,Month,Day
0,13E-AM-1017-05,10172018,2018,10,17
1,11H-AM-1010-03,10102018,2018,10,10
2,11D-AM-1010-03,10102018,2018,10,10
3,20B-PM-1013-05,10132018,2018,10,13
4,36I-PM-1007-01,10072018,2018,10,7


In [37]:
# Drop the 'Date' column
dates_df = dates_df.drop(columns=['Date'])
dates_df.head()

Unnamed: 0,Unique Squirrel ID,Year,Month,Day
0,13E-AM-1017-05,2018,10,17
1,11H-AM-1010-03,2018,10,10
2,11D-AM-1010-03,2018,10,10
3,20B-PM-1013-05,2018,10,13
4,36I-PM-1007-01,2018,10,7


In [38]:
# Merge with s2018_nonull
s2018_nonull = pd.merge(s2018_nonull, dates_df, on="Unique Squirrel ID", how="inner")
s2018_nonull.head()

Unnamed: 0,X,Y,Unique Squirrel ID,Hectare,Shift,Date,Hectare Squirrel Number,Age,Primary Fur Color,Highlight Fur Color,...,Tail flags,Tail twitches,Approaches,Indifferent,Runs from,Other Interactions,Lat/Long,Year,Month,Day
0,-73.970268,40.776213,13E-AM-1017-05,13E,AM,10172018,5,Adult,Gray,Cinnamon,...,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894),2018,10,17
1,-73.968361,40.772591,11H-AM-1010-03,11H,AM,10102018,3,Adult,Cinnamon,White,...,False,True,False,True,False,,POINT (-73.9683613516225 40.7725908847499),2018,10,10
2,-73.97225,40.774288,11D-AM-1010-03,11D,AM,10102018,3,Adult,Gray,Cinnamon,...,False,False,False,True,False,,POINT (-73.9722500196844 40.7742879599026),2018,10,10
3,-73.969506,40.782351,20B-PM-1013-05,20B,PM,10132018,5,Adult,Gray,White,...,False,False,False,True,False,,POINT (-73.9695063535333 40.7823507678183),2018,10,13
4,-73.953217,40.791967,36I-PM-1007-01,36I,PM,10072018,1,Adult,Gray,Cinnamon,...,False,False,False,True,False,,POINT (-73.9532170504865 40.7919669739962),2018,10,7


In [39]:
# Drop the 'Date' column
s2018_nonull = s2018_nonull.drop(columns=['Date'])
s2018_nonull.head()

Unnamed: 0,X,Y,Unique Squirrel ID,Hectare,Shift,Hectare Squirrel Number,Age,Primary Fur Color,Highlight Fur Color,Combination of Primary and Highlight Color,...,Tail flags,Tail twitches,Approaches,Indifferent,Runs from,Other Interactions,Lat/Long,Year,Month,Day
0,-73.970268,40.776213,13E-AM-1017-05,13E,AM,5,Adult,Gray,Cinnamon,Gray+Cinnamon,...,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894),2018,10,17
1,-73.968361,40.772591,11H-AM-1010-03,11H,AM,3,Adult,Cinnamon,White,Cinnamon+White,...,False,True,False,True,False,,POINT (-73.9683613516225 40.7725908847499),2018,10,10
2,-73.97225,40.774288,11D-AM-1010-03,11D,AM,3,Adult,Gray,Cinnamon,Gray+Cinnamon,...,False,False,False,True,False,,POINT (-73.9722500196844 40.7742879599026),2018,10,10
3,-73.969506,40.782351,20B-PM-1013-05,20B,PM,5,Adult,Gray,White,Gray+White,...,False,False,False,True,False,,POINT (-73.9695063535333 40.7823507678183),2018,10,13
4,-73.953217,40.791967,36I-PM-1007-01,36I,PM,1,Adult,Gray,Cinnamon,Gray+Cinnamon,...,False,False,False,True,False,,POINT (-73.9532170504865 40.7919669739962),2018,10,7


## Delete Duplicate Rows

In [40]:
s2018_nonull[s2018_nonull['Unique Squirrel ID'].duplicated() == True]

Unnamed: 0,X,Y,Unique Squirrel ID,Hectare,Shift,Hectare Squirrel Number,Age,Primary Fur Color,Highlight Fur Color,Combination of Primary and Highlight Color,...,Tail flags,Tail twitches,Approaches,Indifferent,Runs from,Other Interactions,Lat/Long,Year,Month,Day
59,-73.957044,40.794851,37E-PM-1006-03,37E,PM,3,Adult,Gray,Cinnamon,Gray+Cinnamon,...,False,False,False,False,True,me,POINT (-73.9570437717691 40.794850940803904),2018,10,6
60,-73.956967,40.794879,37E-PM-1006-03,37E,PM,3,Adult,Gray,Cinnamon,Gray+Cinnamon,...,False,False,False,False,True,me,POINT (-73.9569670900695 40.794879285681304),2018,10,6
61,-73.956967,40.794879,37E-PM-1006-03,37E,PM,3,Adult,Gray,Cinnamon,Gray+Cinnamon,...,False,False,False,False,True,me,POINT (-73.9569670900695 40.794879285681304),2018,10,6
332,-73.974945,40.772066,7D-PM-1010-01,07D,PM,1,Adult,Gray,White,Gray+White,...,False,False,False,False,False,,POINT (-73.9749446629706 40.7720659696067),2018,10,10
333,-73.974852,40.771959,7D-PM-1010-01,07D,PM,1,Adult,Gray,White,Gray+White,...,False,False,False,False,False,,POINT (-73.9748522796455 40.7719586621003),2018,10,10
334,-73.974852,40.771959,7D-PM-1010-01,07D,PM,1,Adult,Gray,White,Gray+White,...,False,False,False,False,False,,POINT (-73.9748522796455 40.7719586621003),2018,10,10
998,-73.957711,40.798406,40B-AM-1019-06,40B,AM,6,Juvenile,Gray,White,Gray+White,...,False,False,False,False,True,,POINT (-73.957710975899 40.7984055614047),2018,10,19
999,-73.958236,40.798775,40B-AM-1019-06,40B,AM,6,Juvenile,Gray,White,Gray+White,...,False,False,False,False,True,,POINT (-73.9582363437135 40.7987752300505),2018,10,19
1000,-73.958236,40.798775,40B-AM-1019-06,40B,AM,6,Juvenile,Gray,White,Gray+White,...,False,False,False,False,True,,POINT (-73.9582363437135 40.7987752300505),2018,10,19
1348,-73.977262,40.769282,4C-PM-1010-05,04C,PM,5,Adult,Gray,Cinnamon,Gray+Cinnamon,...,False,False,False,False,False,,POINT (-73.9772624834791 40.7692821918543),2018,10,10


In [41]:
# Drop the duplicate IDs
s2018_nonull = s2018_nonull[s2018_nonull['Unique Squirrel ID'].duplicated() == False]
s2018_nonull.head()

Unnamed: 0,X,Y,Unique Squirrel ID,Hectare,Shift,Hectare Squirrel Number,Age,Primary Fur Color,Highlight Fur Color,Combination of Primary and Highlight Color,...,Tail flags,Tail twitches,Approaches,Indifferent,Runs from,Other Interactions,Lat/Long,Year,Month,Day
0,-73.970268,40.776213,13E-AM-1017-05,13E,AM,5,Adult,Gray,Cinnamon,Gray+Cinnamon,...,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894),2018,10,17
1,-73.968361,40.772591,11H-AM-1010-03,11H,AM,3,Adult,Cinnamon,White,Cinnamon+White,...,False,True,False,True,False,,POINT (-73.9683613516225 40.7725908847499),2018,10,10
2,-73.97225,40.774288,11D-AM-1010-03,11D,AM,3,Adult,Gray,Cinnamon,Gray+Cinnamon,...,False,False,False,True,False,,POINT (-73.9722500196844 40.7742879599026),2018,10,10
3,-73.969506,40.782351,20B-PM-1013-05,20B,PM,5,Adult,Gray,White,Gray+White,...,False,False,False,True,False,,POINT (-73.9695063535333 40.7823507678183),2018,10,13
4,-73.953217,40.791967,36I-PM-1007-01,36I,PM,1,Adult,Gray,Cinnamon,Gray+Cinnamon,...,False,False,False,True,False,,POINT (-73.9532170504865 40.7919669739962),2018,10,7


## Rename columns

In [42]:
# Rename the columns
cols_df = pd.DataFrame(s2018_nonull.columns, columns=["name"])

# Define the regex pattern
pattern = '\((.*?)\)' # \( and \) to escape brackets, *.? matches any character (non-newline) zero or more times

new_columns = []
for row in cols_df["name"]:
    # Use underscore and cast to lowercase
    new_columns.append(row.replace(" ", "_").lower())

# Update the columns
s2018_nonull.columns = new_columns

s2018_nonull.head()

Unnamed: 0,x,y,unique_squirrel_id,hectare,shift,hectare_squirrel_number,age,primary_fur_color,highlight_fur_color,combination_of_primary_and_highlight_color,...,tail_flags,tail_twitches,approaches,indifferent,runs_from,other_interactions,lat/long,year,month,day
0,-73.970268,40.776213,13E-AM-1017-05,13E,AM,5,Adult,Gray,Cinnamon,Gray+Cinnamon,...,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894),2018,10,17
1,-73.968361,40.772591,11H-AM-1010-03,11H,AM,3,Adult,Cinnamon,White,Cinnamon+White,...,False,True,False,True,False,,POINT (-73.9683613516225 40.7725908847499),2018,10,10
2,-73.97225,40.774288,11D-AM-1010-03,11D,AM,3,Adult,Gray,Cinnamon,Gray+Cinnamon,...,False,False,False,True,False,,POINT (-73.9722500196844 40.7742879599026),2018,10,10
3,-73.969506,40.782351,20B-PM-1013-05,20B,PM,5,Adult,Gray,White,Gray+White,...,False,False,False,True,False,,POINT (-73.9695063535333 40.7823507678183),2018,10,13
4,-73.953217,40.791967,36I-PM-1007-01,36I,PM,1,Adult,Gray,Cinnamon,Gray+Cinnamon,...,False,False,False,True,False,,POINT (-73.9532170504865 40.7919669739962),2018,10,7


In [43]:
# Check the columns
s2018_nonull.columns

Index(['x', 'y', 'unique_squirrel_id', 'hectare', 'shift',
       'hectare_squirrel_number', 'age', 'primary_fur_color',
       'highlight_fur_color', 'combination_of_primary_and_highlight_color',
       'color_notes', 'location', 'above_ground_sighter_measurement',
       'specific_location', 'running', 'chasing', 'climbing', 'eating',
       'foraging', 'other_activities', 'kuks', 'quaas', 'moans', 'tail_flags',
       'tail_twitches', 'approaches', 'indifferent', 'runs_from',
       'other_interactions', 'lat/long', 'year', 'month', 'day'],
      dtype='object')

In [44]:
## Delete unwanted columns but maintain original copy
s2018_drop_columns = s2018_nonull.drop(columns=[
    'hectare',
    'shift',
    'hectare_squirrel_number',
    'combination_of_primary_and_highlight_color',
    'color_notes',
    'location',
    'above_ground_sighter_measurement',
    'specific_location',
    'tail_flags',
    'tail_twitches'
]).reset_index(drop=True)

# Rename columns to match 2020 dataset
s2018_rename_columns = s2018_drop_columns.rename(columns={
    "x": "squirrel_longitude",
    "y": "squirrel_latitude",
    "highlight_fur_colour": "highlights_in_fur_color",
    "unique_squirrel_id": "squirrel_id"
})

s2018_rename_columns.head()

Unnamed: 0,squirrel_longitude,squirrel_latitude,squirrel_id,age,primary_fur_color,highlight_fur_color,running,chasing,climbing,eating,...,quaas,moans,approaches,indifferent,runs_from,other_interactions,lat/long,year,month,day
0,-73.970268,40.776213,13E-AM-1017-05,Adult,Gray,Cinnamon,False,False,False,False,...,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894),2018,10,17
1,-73.968361,40.772591,11H-AM-1010-03,Adult,Cinnamon,White,False,False,False,False,...,False,False,False,True,False,,POINT (-73.9683613516225 40.7725908847499),2018,10,10
2,-73.97225,40.774288,11D-AM-1010-03,Adult,Gray,Cinnamon,False,False,True,False,...,False,False,False,True,False,,POINT (-73.9722500196844 40.7742879599026),2018,10,10
3,-73.969506,40.782351,20B-PM-1013-05,Adult,Gray,White,False,False,False,False,...,False,False,False,True,False,,POINT (-73.9695063535333 40.7823507678183),2018,10,13
4,-73.953217,40.791967,36I-PM-1007-01,Adult,Gray,Cinnamon,False,False,False,False,...,False,False,False,True,False,,POINT (-73.9532170504865 40.7919669739962),2018,10,7


## Combine 'kuks', 'quaas', 'moans' to 'shouting'

In [45]:
# Isolate the "shouting" equivalent columns - kuks, quaas, moans
shouting_df = s2018_rename_columns[['squirrel_id', 'kuks', 'quaas', 'moans']].copy()

shouting_df['shouting'] = pd.Series(dtype=bool)
shouting_df.head()

Unnamed: 0,squirrel_id,kuks,quaas,moans,shouting
0,13E-AM-1017-05,False,False,False,
1,11H-AM-1010-03,False,False,False,
2,11D-AM-1010-03,False,False,False,
3,20B-PM-1013-05,False,False,False,
4,36I-PM-1007-01,False,False,False,


In [46]:
# If any of the shouting_cols are True, set 'shouting' to True
shouting_cols = shouting_df.columns[1:4]

for idx, row in shouting_df.iterrows():
    for col in shouting_cols:
        if row[col] == True:
            shouting_df.loc[idx, 'shouting'] = True
            break
        else:
            shouting_df.loc[idx, 'shouting'] = False

shouting_df['shouting'].value_counts()

False    1855
True       78
Name: shouting, dtype: int64

In [47]:
# Drop the shouting_cols
shouting_df = shouting_df.drop(columns=shouting_cols)
shouting_df.head()

Unnamed: 0,squirrel_id,shouting
0,13E-AM-1017-05,False
1,11H-AM-1010-03,False
2,11D-AM-1010-03,False
3,20B-PM-1013-05,False
4,36I-PM-1007-01,False


In [48]:
# Merge with s2018_rename_columns
s2018_booleans = pd.merge(s2018_rename_columns, shouting_df, on="squirrel_id", how="inner")
s2018_booleans.head()

Unnamed: 0,squirrel_longitude,squirrel_latitude,squirrel_id,age,primary_fur_color,highlight_fur_color,running,chasing,climbing,eating,...,moans,approaches,indifferent,runs_from,other_interactions,lat/long,year,month,day,shouting
0,-73.970268,40.776213,13E-AM-1017-05,Adult,Gray,Cinnamon,False,False,False,False,...,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894),2018,10,17,False
1,-73.968361,40.772591,11H-AM-1010-03,Adult,Cinnamon,White,False,False,False,False,...,False,False,True,False,,POINT (-73.9683613516225 40.7725908847499),2018,10,10,False
2,-73.97225,40.774288,11D-AM-1010-03,Adult,Gray,Cinnamon,False,False,True,False,...,False,False,True,False,,POINT (-73.9722500196844 40.7742879599026),2018,10,10,False
3,-73.969506,40.782351,20B-PM-1013-05,Adult,Gray,White,False,False,False,False,...,False,False,True,False,,POINT (-73.9695063535333 40.7823507678183),2018,10,13,False
4,-73.953217,40.791967,36I-PM-1007-01,Adult,Gray,Cinnamon,False,False,False,False,...,False,False,True,False,,POINT (-73.9532170504865 40.7919669739962),2018,10,7,False


In [49]:
# Drop the shouting_cols from s2018_booleans
s2018_booleans = s2018_booleans.drop(columns=shouting_cols)
s2018_booleans.head()

Unnamed: 0,squirrel_longitude,squirrel_latitude,squirrel_id,age,primary_fur_color,highlight_fur_color,running,chasing,climbing,eating,...,other_activities,approaches,indifferent,runs_from,other_interactions,lat/long,year,month,day,shouting
0,-73.970268,40.776213,13E-AM-1017-05,Adult,Gray,Cinnamon,False,False,False,False,...,,False,False,False,,POINT (-73.9702676472613 40.7762126854894),2018,10,17,False
1,-73.968361,40.772591,11H-AM-1010-03,Adult,Cinnamon,White,False,False,False,False,...,,False,True,False,,POINT (-73.9683613516225 40.7725908847499),2018,10,10,False
2,-73.97225,40.774288,11D-AM-1010-03,Adult,Gray,Cinnamon,False,False,True,False,...,grooming,False,True,False,,POINT (-73.9722500196844 40.7742879599026),2018,10,10,False
3,-73.969506,40.782351,20B-PM-1013-05,Adult,Gray,White,False,False,False,False,...,,False,True,False,,POINT (-73.9695063535333 40.7823507678183),2018,10,13,False
4,-73.953217,40.791967,36I-PM-1007-01,Adult,Gray,Cinnamon,False,False,False,False,...,,False,True,False,,POINT (-73.9532170504865 40.7919669739962),2018,10,7,False


## Investigate 'other_activities' column

In [50]:
# Check the "other activities" column
other_activities = pd.DataFrame(s2018_booleans['other_activities'].value_counts())
other_activities

Unnamed: 0,other_activities
digging,14
sitting,9
playing,8
burying,6
nut in mouth,4
...,...
jumping tree to tree,1
carrying nut away,1
chasing #2 and climbing tree w/ #2,1
standing upright,1


In [51]:
# Isolate the "other activities" column
other_df = s2018_booleans[['squirrel_id', 'other_activities']].copy()

# Only include rows with a non-null value - REGEX DOESN'T WORK IF array has NaN values!
other_df = other_df.dropna(how="any")

# Equivalent columns from 2020 currently missing from 2018
missing_cols = ['sitting', 'digging']

# Create new columns for each activity
for col_name in missing_cols:
    other_df[col_name] = pd.Series(dtype=bool)

other_df.head()

Unnamed: 0,squirrel_id,other_activities,sitting,digging
2,11D-AM-1010-03,grooming,,
11,12I-AM-1013-01,sitting,,
19,33H-AM-1019-02,wrestling with mother,,
25,2B-PM-1013-01,running (with nut),,
29,6I-PM-1013-06,playing with #5,,


In [71]:
# CONVERT THIS TO A DICTIONARY LATER, for simplicity!
# Combine activities
patterns = [
    'sit', 'watch', 'staring', 'posing', 'hanging out', 'chillin', 'dig', 'bury', 'chas', 'climb', 'eat', 'hop', 'foraging', 'running', 'jump']
activity_col = [
    'sitting', 'watching', 'sitting', 'sitting', 'sitting', 'sitting', 'digging', 'digging', 'chasing', 'climbing', 'eating', 'running', 'foraging', 'running', 'climbing']

all_indices = []
for idx_pattern, pattern in enumerate(patterns):
    idx_sit = list(other_df[other_df['other_activities'].str.contains(pattern)].index)
    # all_indices.append(idx_sit)
    
    for idx in idx_sit:
        all_indices.append(idx)
        other_df.loc[idx, activity_col[idx_pattern]] = True

# false_rows = set(other_df.index).difference(set(all_indices))
otherdf_cols = other_df.columns
for idx, row in other_df.iterrows():
    for col in otherdf_cols:
        if (row[col] == True):
            continue
        # elif (row[col] == "NaN"):
        elif isinstance(row[col], float):
            other_df.loc[idx, col] = False

other_df.head()

Unnamed: 0,squirrel_id,other_activities,sitting,digging,watching,chasing,climbing,eating,running,foraging
2,11D-AM-1010-03,grooming,False,False,False,False,False,False,False,False
11,12I-AM-1013-01,sitting,True,False,False,False,False,False,False,False
19,33H-AM-1019-02,wrestling with mother,False,False,False,False,False,False,False,False
25,2B-PM-1013-01,running (with nut),False,False,False,False,False,False,True,False
29,6I-PM-1013-06,playing with #5,False,False,False,False,False,False,False,False


In [72]:
other_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 285 entries, 2 to 1927
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   squirrel_id       285 non-null    object
 1   other_activities  285 non-null    object
 2   sitting           285 non-null    object
 3   digging           285 non-null    object
 4   watching          285 non-null    object
 5   chasing           285 non-null    object
 6   climbing          285 non-null    object
 7   eating            285 non-null    object
 8   running           285 non-null    object
 9   foraging          285 non-null    object
dtypes: object(10)
memory usage: 32.6+ KB


In [73]:
# Loop through the s2018 booleans, if there's a match in the other_df, then compare cols
count = 0

# Create a variable to hold the different column names
s2018_columns = s2018_booleans.columns[7:12] # ['running', 'chasing', 'climbing', 'eating', 'foraging']
other_columns = other_df.columns[2:] # ['sitting', 'digging', 'chasing', 'jumping', 'climbing', 'eating', 'running', 'foraging']

# Append missing columns to s2018_booleans
missing_cols = list(set(other_columns).difference(set(s2018_columns))) # ['sitting', 'digging']
for col in missing_cols:
    s2018_booleans[col] = pd.Series(dtype=bool)

# Loop through both dataframes
for idx, row in s2018_booleans.iterrows():
    for idx_other, row_other in other_df.iterrows():
        
        # If the squirrel_id exists in both dataframes, check the columns
        if (row['squirrel_id'] == row_other['squirrel_id']):
            
            
            for s2018_col in s2018_columns:
                for other_col in other_columns:
                    
                    # Only interested where the s2018_col matches other_col, AND if True
                    if (s2018_col == other_col) and (other_df.loc[idx_other, other_col] == True):

                        # Update existing columns as required
                        s2018_booleans.loc[idx, s2018_col] = True

                    # Columns that are not yet in the s2018_booleans dataframe
                    if (other_col in missing_cols) and (other_df.loc[idx_other, other_col] == True):
                        s2018_booleans.loc[idx, other_col] = True
                    else:
                        s2018_booleans.loc[idx, other_col] = False
            count += 1
print(count)

285


In [74]:
s2018_booleans.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1933 entries, 0 to 1932
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   squirrel_longitude   1933 non-null   float64
 1   squirrel_latitude    1933 non-null   float64
 2   squirrel_id          1933 non-null   object 
 3   age                  1870 non-null   object 
 4   primary_fur_color    1933 non-null   object 
 5   highlight_fur_color  1933 non-null   object 
 6   running              285 non-null    object 
 7   chasing              1933 non-null   bool   
 8   climbing             1933 non-null   bool   
 9   eating               1933 non-null   bool   
 10  foraging             1933 non-null   bool   
 11  other_activities     285 non-null    object 
 12  approaches           1933 non-null   bool   
 13  indifferent          1933 non-null   bool   
 14  runs_from            1933 non-null   bool   
 15  other_interactions   169 non-null    o

In [75]:
# Convert to boolean datatype
bool_columns = ['running', 'shouting', 'sitting', 'digging', 'watching']
s2018_booleans[bool_columns] = s2018_booleans[bool_columns].astype(bool)

s2018_booleans.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1933 entries, 0 to 1932
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   squirrel_longitude   1933 non-null   float64
 1   squirrel_latitude    1933 non-null   float64
 2   squirrel_id          1933 non-null   object 
 3   age                  1870 non-null   object 
 4   primary_fur_color    1933 non-null   object 
 5   highlight_fur_color  1933 non-null   object 
 6   running              1933 non-null   bool   
 7   chasing              1933 non-null   bool   
 8   climbing             1933 non-null   bool   
 9   eating               1933 non-null   bool   
 10  foraging             1933 non-null   bool   
 11  other_activities     285 non-null    object 
 12  approaches           1933 non-null   bool   
 13  indifferent          1933 non-null   bool   
 14  runs_from            1933 non-null   bool   
 15  other_interactions   169 non-null    o

# Merge both datasets

In [76]:
print(s2020_booleans.columns)
s2020_booleans.head()

Index(['park_name', 'park_id', 'squirrel_id', 'primary_fur_color',
       'highlights_in_fur_color', 'other_notes_or_observations',
       'squirrel_latitude', 'squirrel_longitude', 'foraging', 'eating',
       'digging', 'climbing', 'other_activities', 'sitting', 'chasing',
       'shouting', 'running', 'indifferent', 'runs from', 'approaches',
       'watching'],
      dtype='object')


Unnamed: 0,park_name,park_id,squirrel_id,primary_fur_color,highlights_in_fur_color,other_notes_or_observations,squirrel_latitude,squirrel_longitude,foraging,eating,...,climbing,other_activities,sitting,chasing,shouting,running,indifferent,runs from,approaches,watching
0,Fort Tryon Park,1,A-01-01,Gray,White,,40.85941,-73.933936,True,False,...,False,False,False,False,False,False,True,False,False,False
1,Fort Tryon Park,1,A-01-02,Gray,White,Looks skinny,40.859436,-73.933937,True,False,...,False,False,False,False,False,False,True,False,False,False
2,Fort Tryon Park,1,A-01-03,Gray,White,,40.859416,-73.933894,False,True,...,False,False,False,False,False,False,True,False,False,False
3,Fort Tryon Park,1,A-01-04,Gray,White,,40.859418,-73.933895,False,False,...,False,False,False,False,False,True,True,False,False,False
4,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,She left food,40.859493,-73.93359,False,True,...,False,False,False,False,False,True,True,False,False,False


In [77]:
print(s2018_booleans.columns)
s2018_booleans.head()

Index(['squirrel_longitude', 'squirrel_latitude', 'squirrel_id', 'age',
       'primary_fur_color', 'highlight_fur_color', 'running', 'chasing',
       'climbing', 'eating', 'foraging', 'other_activities', 'approaches',
       'indifferent', 'runs_from', 'other_interactions', 'lat/long', 'year',
       'month', 'day', 'shouting', 'sitting', 'digging', 'watching'],
      dtype='object')


Unnamed: 0,squirrel_longitude,squirrel_latitude,squirrel_id,age,primary_fur_color,highlight_fur_color,running,chasing,climbing,eating,...,runs_from,other_interactions,lat/long,year,month,day,shouting,sitting,digging,watching
0,-73.970268,40.776213,13E-AM-1017-05,Adult,Gray,Cinnamon,True,False,False,False,...,False,,POINT (-73.9702676472613 40.7762126854894),2018,10,17,False,True,True,True
1,-73.968361,40.772591,11H-AM-1010-03,Adult,Cinnamon,White,True,False,False,False,...,False,,POINT (-73.9683613516225 40.7725908847499),2018,10,10,False,True,True,True
2,-73.97225,40.774288,11D-AM-1010-03,Adult,Gray,Cinnamon,False,False,False,False,...,False,,POINT (-73.9722500196844 40.7742879599026),2018,10,10,False,False,False,False
3,-73.969506,40.782351,20B-PM-1013-05,Adult,Gray,White,True,False,False,False,...,False,,POINT (-73.9695063535333 40.7823507678183),2018,10,13,False,True,True,True
4,-73.953217,40.791967,36I-PM-1007-01,Adult,Gray,Cinnamon,True,False,False,False,...,False,,POINT (-73.9532170504865 40.7919669739962),2018,10,7,False,True,True,True


In [78]:
# Confirm there is no overlap between the squirrel_id
ids_2020 = set(s2020_booleans['squirrel_id'].unique())
ids_2018 = set(s2018_booleans['squirrel_id'].unique())

ids_2020.intersection(ids_2018) # Since empty set, can combine the two datasets

set()

In [79]:
# Drop unused columns
s2020_clean = s2020_booleans.drop(columns=[
    'park_name', # for now, add back in later
    'park_id', # for now, add back in later
    'other_notes_or_observations',
    'other_activities'])

s2018_clean = s2018_booleans.drop(columns=[
    # 'date',
    'age', # for now, add back in later, once added equivalent 2020 column of NaN?
    'other_activities',
    'other_interactions',
    'lat/long'])

In [80]:
print(len(s2020_clean.columns))
s2020_clean.columns

17


Index(['squirrel_id', 'primary_fur_color', 'highlights_in_fur_color',
       'squirrel_latitude', 'squirrel_longitude', 'foraging', 'eating',
       'digging', 'climbing', 'sitting', 'chasing', 'shouting', 'running',
       'indifferent', 'runs from', 'approaches', 'watching'],
      dtype='object')

In [81]:
print(len(s2018_clean.columns))
s2018_clean.columns

20


Index(['squirrel_longitude', 'squirrel_latitude', 'squirrel_id',
       'primary_fur_color', 'highlight_fur_color', 'running', 'chasing',
       'climbing', 'eating', 'foraging', 'approaches', 'indifferent',
       'runs_from', 'year', 'month', 'day', 'shouting', 'sitting', 'digging',
       'watching'],
      dtype='object')

In [82]:
set(s2020_clean).difference(set(s2018_clean))

{'highlights_in_fur_color', 'runs from'}

## Add year, month, day to 2020 Dataset

In [83]:
# Add the year, month, day column to the 2020 dataset - 01 March 2020
s2020_clean['year'] = 2020
s2020_clean['month'] = 3
s2020_clean['day'] = 1

## Rename and rearrange columns

In [84]:
# Rename columns
s2020_clean = s2020_clean.rename(columns={
    'primary_fur_color': 'primary_colour',
    'highlights_in_fur_color': 'fur_highlights',
    'squirrel_longitude': 'longitude',
    'squirrel_latitude': 'latitude',
    'runs from': 'runs_from'
})

s2018_clean = s2018_clean.rename(columns={
    'squirrel_longitude': 'longitude',
    'squirrel_latitude': 'latitude',
    'primary_fur_color': 'primary_colour',
    'highlight_fur_color': 'fur_highlights'
})

In [85]:
# Rearrange columns
rearranged = [
    'squirrel_id', 'longitude', 'latitude', 'year', 'month', 'day', 'primary_colour', 'fur_highlights',
    'chasing', 'climbing', 'digging', 'eating', 'foraging', 'running', 'shouting', 'sitting',
    'approaches', 'indifferent', 'runs_from', 'watching']

s2020_clean = s2020_clean[rearranged]
s2018_clean = s2018_clean[rearranged]

In [86]:
s2020_clean.head()

Unnamed: 0,squirrel_id,longitude,latitude,year,month,day,primary_colour,fur_highlights,chasing,climbing,digging,eating,foraging,running,shouting,sitting,approaches,indifferent,runs_from,watching
0,A-01-01,-73.933936,40.85941,2020,3,1,Gray,White,False,False,False,False,True,False,False,False,False,True,False,False
1,A-01-02,-73.933937,40.859436,2020,3,1,Gray,White,False,False,False,False,True,False,False,False,False,True,False,False
2,A-01-03,-73.933894,40.859416,2020,3,1,Gray,White,False,False,True,True,False,False,False,False,False,True,False,False
3,A-01-04,-73.933895,40.859418,2020,3,1,Gray,White,False,False,False,False,False,True,False,False,False,True,False,False
4,A-01-05,-73.93359,40.859493,2020,3,1,Gray,Cinnamon,False,False,False,True,False,True,False,False,False,True,False,False


In [87]:
s2018_clean.head()

Unnamed: 0,squirrel_id,longitude,latitude,year,month,day,primary_colour,fur_highlights,chasing,climbing,digging,eating,foraging,running,shouting,sitting,approaches,indifferent,runs_from,watching
0,13E-AM-1017-05,-73.970268,40.776213,2018,10,17,Gray,Cinnamon,False,False,True,False,True,True,False,True,False,False,False,True
1,11H-AM-1010-03,-73.968361,40.772591,2018,10,10,Cinnamon,White,False,False,True,False,True,True,False,True,False,True,False,True
2,11D-AM-1010-03,-73.97225,40.774288,2018,10,10,Gray,Cinnamon,False,False,False,False,False,False,False,False,False,True,False,False
3,20B-PM-1013-05,-73.969506,40.782351,2018,10,13,Gray,White,False,False,True,False,True,True,False,True,False,True,False,True
4,36I-PM-1007-01,-73.953217,40.791967,2018,10,7,Gray,Cinnamon,False,False,True,False,True,True,False,True,False,True,False,True


In [88]:
# Append the datasets
combined_df = pd.concat([s2020_clean, s2018_clean], ignore_index=True)
combined_df.head()

Unnamed: 0,squirrel_id,longitude,latitude,year,month,day,primary_colour,fur_highlights,chasing,climbing,digging,eating,foraging,running,shouting,sitting,approaches,indifferent,runs_from,watching
0,A-01-01,-73.933936,40.85941,2020,3,1,Gray,White,False,False,False,False,True,False,False,False,False,True,False,False
1,A-01-02,-73.933937,40.859436,2020,3,1,Gray,White,False,False,False,False,True,False,False,False,False,True,False,False
2,A-01-03,-73.933894,40.859416,2020,3,1,Gray,White,False,False,True,True,False,False,False,False,False,True,False,False
3,A-01-04,-73.933895,40.859418,2020,3,1,Gray,White,False,False,False,False,False,True,False,False,False,True,False,False
4,A-01-05,-73.93359,40.859493,2020,3,1,Gray,Cinnamon,False,False,False,True,False,True,False,False,False,True,False,False


In [89]:
combined_df.info() # need to convert some columns to bool, and check the values for digging and sitting

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2125 entries, 0 to 2124
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   squirrel_id     2125 non-null   object 
 1   longitude       2125 non-null   float64
 2   latitude        2125 non-null   float64
 3   year            2125 non-null   int64  
 4   month           2125 non-null   int64  
 5   day             2125 non-null   int64  
 6   primary_colour  2125 non-null   object 
 7   fur_highlights  2125 non-null   object 
 8   chasing         2125 non-null   bool   
 9   climbing        2125 non-null   bool   
 10  digging         2125 non-null   bool   
 11  eating          2125 non-null   bool   
 12  foraging        2125 non-null   bool   
 13  running         2125 non-null   bool   
 14  shouting        2125 non-null   bool   
 15  sitting         2125 non-null   bool   
 16  approaches      2125 non-null   bool   
 17  indifferent     2125 non-null   b

In [90]:
# Convert the dtypes result to a DataFrame
dtype_df = pd.DataFrame(combined_df.dtypes, columns=['type'])

# Get columns either: object/VARCHAR or float64/FLOAT(p)
length_cols = dtype_df.loc[(dtype_df['type'] == "float64") | (dtype_df['type'] == "object")].index

for col_name in length_cols:
    # Variables to save the longest character and the word
    max_char = 0
    max_word = ""

    # Loop through each column
    for word in combined_df[col_name]:
        if len(str(word)) > max_char:
            max_char = len(str(word))
            max_word = str(word)

    # Print the longest values
    print(f'{col_name}: {max_char}, {max_word}')

squirrel_id: 14, 13E-AM-1017-05
longitude: 17, -73.9702676472613
latitude: 16, 40.7762126854894
primary_colour: 8, Cinnamon
fur_highlights: 22, Black, Cinnamon, White


## Parse the fur highlights

In [91]:
# Fur highlights needs to be parsed
combined_df['fur_highlights'].value_counts()

Cinnamon                  832
White                     633
Cinnamon, White           282
Gray                      227
Gray, White                59
Black                      38
Black, Cinnamon, White     32
Black, White               10
Black, Cinnamon             9
Gray, Black                 3
Name: fur_highlights, dtype: int64

In [92]:
unique_highlights = combined_df['fur_highlights'].unique()

highlight = []
for row in unique_highlights:
    row_string = row.split(", ")
    for value in row_string:
        highlight.append(value.lower())

highlight = list(set(highlight))
highlight

['black', 'white', 'cinnamon', 'gray']

In [93]:
# Create columns in combined_df for each highlight
for col in highlight:
    combined_df[col] = pd.Series(dtype=bool)

In [94]:
combined_df.head()

Unnamed: 0,squirrel_id,longitude,latitude,year,month,day,primary_colour,fur_highlights,chasing,climbing,...,shouting,sitting,approaches,indifferent,runs_from,watching,black,white,cinnamon,gray
0,A-01-01,-73.933936,40.85941,2020,3,1,Gray,White,False,False,...,False,False,False,True,False,False,,,,
1,A-01-02,-73.933937,40.859436,2020,3,1,Gray,White,False,False,...,False,False,False,True,False,False,,,,
2,A-01-03,-73.933894,40.859416,2020,3,1,Gray,White,False,False,...,False,False,False,True,False,False,,,,
3,A-01-04,-73.933895,40.859418,2020,3,1,Gray,White,False,False,...,False,False,False,True,False,False,,,,
4,A-01-05,-73.93359,40.859493,2020,3,1,Gray,Cinnamon,False,False,...,False,False,False,True,False,False,,,,


In [95]:
for idx, row in combined_df.iterrows():
    highlight_row = row['fur_highlights'].split(", ")
    
    if len(highlight_row) > 1:
        for value in highlight_row:
            for col_name in highlight:
                # print(combined_df.loc[idx, col_name])
                if (combined_df.loc[idx, col_name] == True):
                    continue
                if (value.lower() == col_name):
                    combined_df.loc[idx, col_name] = True
                else:
                    combined_df.loc[idx, col_name] = False
    else:
        highlight_value = highlight_row[0].lower()
        for col_name in highlight:
            if (highlight_value == col_name):
                combined_df.loc[idx, col_name] = True
            else:
                combined_df.loc[idx, col_name] = False

In [96]:
combined_df[['fur_highlights', 'cinnamon', 'black', 'white', 'gray']]

Unnamed: 0,fur_highlights,cinnamon,black,white,gray
0,White,False,False,True,False
1,White,False,False,True,False
2,White,False,False,True,False
3,White,False,False,True,False
4,Cinnamon,True,False,False,False
...,...,...,...,...,...
2120,Gray,False,False,False,True
2121,White,False,False,True,False
2122,"Black, Cinnamon, White",True,True,True,False
2123,"Cinnamon, White",True,False,True,False


In [97]:
# Drop the fur_highlights column
combined_df = combined_df.drop(columns=['fur_highlights'])
combined_df.head()

Unnamed: 0,squirrel_id,longitude,latitude,year,month,day,primary_colour,chasing,climbing,digging,...,shouting,sitting,approaches,indifferent,runs_from,watching,black,white,cinnamon,gray
0,A-01-01,-73.933936,40.85941,2020,3,1,Gray,False,False,False,...,False,False,False,True,False,False,False,True,False,False
1,A-01-02,-73.933937,40.859436,2020,3,1,Gray,False,False,False,...,False,False,False,True,False,False,False,True,False,False
2,A-01-03,-73.933894,40.859416,2020,3,1,Gray,False,False,True,...,False,False,False,True,False,False,False,True,False,False
3,A-01-04,-73.933895,40.859418,2020,3,1,Gray,False,False,False,...,False,False,False,True,False,False,False,True,False,False
4,A-01-05,-73.93359,40.859493,2020,3,1,Gray,False,False,False,...,False,False,False,True,False,False,False,False,True,False


In [98]:
# Convert the highlight columns to boolean
combined_df[highlight] = combined_df[highlight].astype(bool)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2125 entries, 0 to 2124
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   squirrel_id     2125 non-null   object 
 1   longitude       2125 non-null   float64
 2   latitude        2125 non-null   float64
 3   year            2125 non-null   int64  
 4   month           2125 non-null   int64  
 5   day             2125 non-null   int64  
 6   primary_colour  2125 non-null   object 
 7   chasing         2125 non-null   bool   
 8   climbing        2125 non-null   bool   
 9   digging         2125 non-null   bool   
 10  eating          2125 non-null   bool   
 11  foraging        2125 non-null   bool   
 12  running         2125 non-null   bool   
 13  shouting        2125 non-null   bool   
 14  sitting         2125 non-null   bool   
 15  approaches      2125 non-null   bool   
 16  indifferent     2125 non-null   bool   
 17  runs_from       2125 non-null   b

In [99]:
# Check the primary_colour column
combined_df['primary_colour'].value_counts()

Gray        1748
Cinnamon     342
Black         35
Name: primary_colour, dtype: int64

In [100]:
# Convert the dtypes result to a DataFrame
dtype_df = pd.DataFrame(combined_df.dtypes, columns=['type'])

# Get columns either: object/VARCHAR or float64/FLOAT(p)
length_cols = dtype_df.loc[(dtype_df['type'] == "float64") | (dtype_df['type'] == "object")].index

for col_name in length_cols:
    # Variables to save the longest character and the word
    max_char = 0
    max_word = ""

    # Loop through each column
    for word in combined_df[col_name]:
        if len(str(word)) > max_char:
            max_char = len(str(word))
            max_word = str(word)

    # Print the longest values
    print(f'{col_name}: {max_char}, {max_word}')

squirrel_id: 14, 13E-AM-1017-05
longitude: 17, -73.9702676472613
latitude: 16, 40.7762126854894
primary_colour: 8, Cinnamon


## Check for primary key equivalents

In [101]:
combined_df[['latitude', 'longitude']].value_counts()

latitude   longitude 
40.846088  -73.940613    3
40.846332  -73.940369    3
40.841375  -73.934242    2
40.712872  -74.006878    2
40.847190  -73.941308    2
                        ..
40.772659  -73.973787    1
40.772642  -73.973733    1
40.772622  -73.970856    1
40.772620  -73.976725    1
40.860825  -73.932871    1
Length: 2115, dtype: int64

In [102]:
combined_df['squirrel_id'].value_counts()

A-01-01           1
5E-AM-1008-05     1
20G-PM-1013-02    1
10I-PM-1013-01    1
30B-PM-1019-05    1
                 ..
41A-AM-1013-01    1
15F-AM-1007-08    1
36I-PM-1007-06    1
1B-AM-1012-17     1
5E-PM-1012-01     1
Name: squirrel_id, Length: 2125, dtype: int64

In [103]:
combined_df[combined_df['squirrel_id'].duplicated() == True]

Unnamed: 0,squirrel_id,longitude,latitude,year,month,day,primary_colour,chasing,climbing,digging,...,shouting,sitting,approaches,indifferent,runs_from,watching,black,white,cinnamon,gray


In [104]:
print(combined_df['squirrel_id'].nunique(), len(combined_df))

2125 2125


## Create separate CSVs

In [105]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2125 entries, 0 to 2124
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   squirrel_id     2125 non-null   object 
 1   longitude       2125 non-null   float64
 2   latitude        2125 non-null   float64
 3   year            2125 non-null   int64  
 4   month           2125 non-null   int64  
 5   day             2125 non-null   int64  
 6   primary_colour  2125 non-null   object 
 7   chasing         2125 non-null   bool   
 8   climbing        2125 non-null   bool   
 9   digging         2125 non-null   bool   
 10  eating          2125 non-null   bool   
 11  foraging        2125 non-null   bool   
 12  running         2125 non-null   bool   
 13  shouting        2125 non-null   bool   
 14  sitting         2125 non-null   bool   
 15  approaches      2125 non-null   bool   
 16  indifferent     2125 non-null   bool   
 17  runs_from       2125 non-null   b

In [106]:
metadata_df = combined_df[[
    'squirrel_id', 'latitude', 'longitude', 'year', 'month', 'day'
]]
metadata_df.to_csv("resources/clean/metadata.csv", index=False, header=True)

In [107]:
appearance_df = combined_df[[
    'squirrel_id', 'primary_colour',
    'black', 'cinnamon', 'gray', 'white'
]]
appearance_df.to_csv("resources/clean/appearance.csv", index=False, header=True)

In [108]:
activities_df = combined_df[[
    'squirrel_id', 'chasing', 'climbing', 'digging',
    'eating', 'foraging', 'running', 'shouting', 'sitting'
]]
activities_df.to_csv("resources/clean/activities.csv", index=False, header=True)

In [109]:
interactions_df = combined_df[[
    'squirrel_id', 'approaches', 'indifferent',
    'runs_from', 'watching'
]]
interactions_df.to_csv("resources/clean/interactions.csv", index=False, header=True)

## Check Activities Counts

In [111]:
combined_df[combined_df['digging'] == True]

Unnamed: 0,squirrel_id,longitude,latitude,year,month,day,primary_colour,chasing,climbing,digging,...,shouting,sitting,approaches,indifferent,runs_from,watching,black,white,cinnamon,gray
2,A-01-03,-73.933894,40.859416,2020,3,1,Gray,False,False,True,...,False,False,False,True,False,False,False,True,False,False
9,A-01-10,-73.933717,40.859636,2020,3,1,Gray,False,False,True,...,False,False,False,True,False,False,False,True,False,False
10,A-01-11,-73.933738,40.859576,2020,3,1,Gray,False,False,True,...,False,False,False,True,False,False,True,False,False,False
130,C-19-04,-74.005233,40.712893,2020,3,1,Gray,False,False,True,...,False,False,True,False,False,False,False,True,True,False
131,C-19-05,-74.005233,40.712893,2020,3,1,Gray,False,False,True,...,False,False,True,False,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2120,21F-PM-1018-02,-73.964544,40.781160,2018,10,18,Cinnamon,False,False,True,...,False,True,False,False,True,True,False,False,False,True
2121,19A-PM-1013-05,-73.970402,40.782560,2018,10,13,Gray,False,False,True,...,False,True,False,True,False,True,False,True,False,False
2122,22D-PM-1012-07,-73.966587,40.783678,2018,10,12,Gray,False,False,True,...,False,True,False,True,False,True,True,True,True,False
2123,29B-PM-1010-02,-73.963994,40.789915,2018,10,10,Gray,False,False,True,...,False,True,False,True,False,True,False,True,True,False


In [112]:
combined_df['day'].value_counts()

13    306
14    258
7     249
6     210
10    201
1     192
12    175
8     143
17    134
18    115
19     96
20     46
Name: day, dtype: int64