# What are the squirrels in NYC up to?

In [None]:
import pandas as pd
import numpy as np
import datetime
import json

In [None]:
# Load the census export; the bare expression below displays the row count.
squirrel_df = pd.read_csv("Squirrel_Census.csv")
squirrel_df.shape[0]

### Understanding the dataset
What are the columns? What are the data types? Are there null values?


In [None]:
# List every column name in the loaded frame.
squirrel_df.columns

In [None]:
# Per-column dtype and non-null count (answers the "data types / null values" questions above).
squirrel_df.info()

In [None]:
# Distinct values of the "Hectare" column.
squirrel_df["Hectare"].unique()

In [None]:
# Distinct primary fur colours; uncomment one of the lines below to inspect
# the other colour-related columns instead.
squirrel_df["Primary Fur Color"].unique()
# squirrel_df["Highlight Fur Color"].unique()
# squirrel_df["Color notes"].unique()

In [None]:
# Java-style approach: an explicit index loop over the first 150 rows,
# shown for contrast with the vectorised pandas version in the next cell.
for i in range(150):
    field = squirrel_df.iloc[i]["Other Activities"]
    # NaN is not equal to itself, so this comparison skips missing notes.
    if field == field:
        print(field)

In [None]:
# Pandas approach: drop the missing entries in one vectorised call, no loop.
squirrel_df["Other Interactions"].dropna()

In [None]:
# Manual paging through the non-null notes, 20 rows per page; with the offset
# below this shows positions 20-39 of the non-null "Other Interactions" notes.
# (Swap in "Other Activities" to page through that column instead.)
page_size = 20
i = page_size
interaction_notes = squirrel_df["Other Interactions"].dropna()
interaction_notes.iloc[i:i + page_size]

### Data Cleaning

In [None]:
# first, let's clean the notes so that we can separate words and numbers

from cleantext.sklearn import CleanTransformer

# Cleaning options, collected in one place so they are easy to scan and tweak.
cleaner_options = {
    "no_punct": True,
    "lower": True,
    "no_emoji": True,
    "no_line_breaks": True,
    "no_urls": True,
    "normalize_whitespace": True,
    "to_ascii": False,
}
cleaner = CleanTransformer(**cleaner_options)

In [None]:
# For each free-text column, swap hyphens for spaces, blank out missing
# values, and store the cleaned text in a sibling "<column>_clean" column.
text_columns = ["Other Activities", "Other Interactions"]
for raw_name in text_columns:
    dehyphenated = squirrel_df[raw_name].str.replace("-", " ")
    squirrel_df[f"{raw_name}_clean"] = cleaner.transform(dehyphenated.fillna(""))

In [None]:
# 1332
# NOTE(review): the meaning of "1332" above is not explained anywhere in the
# notebook — possibly a row of interest; confirm or remove.
# Spot-check ten cleaned notes. No random_state is passed, so the rows shown
# change on every run.
squirrel_df["Other Activities_clean"].sample(10)

In [None]:
# Always clean dates: start by looking at the raw "Date" column.
squirrel_df["Date"]

In [None]:
def easy_convert_date(orig, year=2018):
    """Convert a compact month/day value into an ``MM/DD/YYYY`` string.

    Args:
        orig: Date whose leading digits are month then day (for example
            ``10142018`` or ``"1014"``). Integers are accepted; a leading
            zero lost by integer storage (``1062018``) is restored.
        year: Year to stamp on the result. Any digits after the day in
            ``orig`` are ignored. Defaults to 2018.

    Returns:
        The date formatted as ``MM/DD/YYYY``.

    Raises:
        ValueError: If the leading digits are not a valid month/day pair.
    """
    digits = str(orig).strip()
    # MMDD and MMDDYYYY both have an even number of digits, so an odd length
    # means the month's leading zero was dropped when the value was stored
    # as a number.
    if len(digits) % 2 == 1:
        digits = "0" + digits
    month = int(digits[0:2])
    day = int(digits[2:4])
    return datetime.date(year, month, day).strftime("%m/%d/%Y")

# Element-wise conversion of the raw "Date" values into MM/DD/YYYY strings.
squirrel_df["date_string"] = squirrel_df["Date"].map(easy_convert_date)

In [None]:
# squirrel_df["date_month"] = squirrel_df["Date"].apply(lambda x: int(str(x)[0:2]))
# squirrel_df["date_day"] = squirrel_df["Date"].apply(lambda x: int(str(x)[2:4]))
# squirrel_df["date_year"] = 2018

In [None]:
# `squirrel_df` has no `date_day` column (the cell that would create it is
# commented out), so derive the day of month from `date_string`, which is
# formatted MM/DD/YYYY: characters 3-4 are the day.
date_day = squirrel_df["date_string"].str[3:5].astype(int)
print(date_day.min())
print(date_day.max())

In [None]:
# Distinct converted dates, in sorted order (string sort of MM/DD/YYYY values).
squirrel_df.date_string.sort_values().unique()

In [None]:
# Text cleaning: look at ten raw (uncleaned) "Other Activities" notes.
# No random_state is passed, so the rows shown change on every run.
squirrel_df["Other Activities"].sample(10)

### EDA

In [None]:
# How many squirrels per hectare?
# What activities are they engaging in?
# What activities do they do together?
# How often are they interacting with other squirrels?

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# How many squirrels per hectare? Count the non-null "X" values in each
# hectare and plot the distribution of those counts. Only one column is
# counted, and no sort is needed because a histogram ignores row order.
squirrels_per_hectare = squirrel_df.groupby("Hectare")["X"].count()
ax = squirrels_per_hectare.plot(kind="hist")
ax.set_xlabel("Squirrels per hectare")
ax.set_ylabel("Number of hectares")
ax.set_title("Distribution of squirrels per hectare");

In [None]:
# Group the flag columns by position in the frame. Position 20 is skipped on
# purpose by these slices. NOTE(review): positional slicing depends on the
# CSV's column order — confirm against the printed groups in the next cell.
column_index = squirrel_df.columns
activities, sounds, tails, interactions = (
    column_index[15:20],
    column_index[21:24],
    column_index[24:26],
    column_index[26:29],
)

In [None]:
# Show which columns landed in each group.
for column_group in (activities, sounds, tails, interactions):
    print(column_group)

In [None]:
# Number of rows flagged True for each activity column, as a small table.
activity_counts = [
    squirrel_df.loc[squirrel_df[name]].shape[0]
    for name in activities
]
df = pd.DataFrame(data={"activity": activities, "counts": activity_counts})
df

In [None]:
# Are any actions frequently done together? Start by gathering every flag
# column (activities, sounds, tail signals, interactions) into one array.
all_cols = np.concatenate((activities, sounds, tails, interactions))

In [None]:
# Bar chart of how many rows are flagged True for each action column.
counts = [
    squirrel_df.loc[squirrel_df[name]].shape[0]
    for name in all_cols
]
df = pd.DataFrame(data={"action": all_cols, "counts": counts})

plot = plt.bar(x=df["action"], height=df["counts"])
# Print each bar's count above it.
plt.bar_label(plot, counts)

plt.xticks(all_cols, rotation=75)
plt.xlabel("Action")
plt.ylabel("Number of Squirrels")
plt.title("Number of Squirrels doing each action")

In [None]:
# Conditional co-occurrence matrix: the entry at (row i, column j) is the
# share of squirrels flagged for action i that are also flagged for action j.
# Rows are normalised by their own action's count, so the matrix is generally
# not symmetric and the diagonal is 1 wherever action i occurs at all.
overlap_lists = []
for col_i in all_cols:
    did_i = squirrel_df[col_i] == True
    # The denominator depends only on the row action, so compute it once.
    n_i = int(did_i.sum())
    row_overlap = []
    for col_j in all_cols:
        n_both = int((did_i & (squirrel_df[col_j] == True)).sum())
        # NaN rather than ZeroDivisionError when no squirrel did action i.
        row_overlap.append(n_both / n_i if n_i else np.nan)
    overlap_lists.append(row_overlap)

overlapdf = pd.DataFrame(overlap_lists, index=all_cols, columns=all_cols)
overlapdf

In [None]:
# Annotated heatmap of the overlap matrix on a 10x8 inch canvas.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(overlapdf, annot=True, linewidth=.5, cmap="crest", ax=ax)

In [None]:
# Same conditional co-occurrence matrix, restricted to a subset of columns.
# The entry at (row i, column j) is the share of squirrels flagged for action
# i that are also flagged for action j.
# cols = np.concatenate([interactions, tails])
cols = np.concatenate([activities, interactions])
overlap_lists = []
# Iterate over `cols` itself: indexing `all_cols` by position here would pull
# in different columns than the ones used as the heatmap's row/column labels.
for col_i in cols:
    did_i = squirrel_df[col_i] == True
    # The denominator depends only on the row action, so compute it once.
    n_i = int(did_i.sum())
    row_overlap = []
    for col_j in cols:
        n_both = int((did_i & (squirrel_df[col_j] == True)).sum())
        # NaN rather than ZeroDivisionError when no squirrel did action i.
        row_overlap.append(n_both / n_i if n_i else np.nan)
    overlap_lists.append(row_overlap)

overlapdf = pd.DataFrame(overlap_lists, index=cols, columns=cols)
sns.heatmap(overlapdf, annot=True,linewidth=.5, cmap="crest")

In [None]:
# How often do squirrels hang out with their friends?
# Heuristic: an "Other Activities" note that contains a digit is counted as a
# hangout. NOTE(review): presumably the digit refers to a number of other
# squirrels — confirm by reading a sample of the matching notes.
activity_notes = squirrel_df["Other Activities"]
# Raw string so "\d" reaches the regex engine as a digit class rather than
# being an invalid Python escape; na=False treats missing notes as non-matches.
num_hangouts = int(activity_notes.str.contains(r"\d", na=False).sum())
num_with_notes = int(activity_notes.notna().sum())
# Scale to percent before rounding so no float artefacts
# (e.g. 28.999999999999996) end up in the printed value.
percent_hangouts = round(100 * num_hangouts / num_with_notes)
print(f"Squirrels hang out with other squirrels {percent_hangouts} % of the time")

Findings: 
- Squirrels spend a lot of time running and foraging
- When they are running away, they often climb trees
- They interact with humans, dogs, and other squirrels
- About 19% of the squirrels with free-text "Other Activities" notes have a note that mentions a number, which we read as chasing/playing with other squirrels