In [11]:
import pandas as pd
import numpy as np

import seaborn as sns

from matplotlib import pyplot as plt

%matplotlib inline

## Loading the Data

### Species

In [2]:
# Read the species table from disk and preview its first five rows.
df_species = pd.read_csv(filepath_or_buffer="species_info.csv")
df_species.head(n=5)

Unnamed: 0,category,scientific_name,common_names,conservation_status
0,Mammal,Clethrionomys gapperi gapperi,Gapper's Red-Backed Vole,
1,Mammal,Bos bison,"American Bison, Bison",
2,Mammal,Bos taurus,"Aurochs, Aurochs, Domestic Cattle (Feral), Dom...",
3,Mammal,Ovis aries,"Domestic Sheep, Mouflon, Red Sheep, Sheep (Feral)",
4,Mammal,Cervus elaphus,Wapiti Or Elk,


In [6]:
# Print a structural summary of the species frame (columns, non-null counts, dtypes).
df_species.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5824 entries, 0 to 5823
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   category             5824 non-null   object
 1   scientific_name      5824 non-null   object
 2   common_names         5824 non-null   object
 3   conservation_status  191 non-null    object
dtypes: object(4)
memory usage: 182.1+ KB


### Observations

In [4]:
# Read the observations table from disk and preview its first five rows.
df_observations = pd.read_csv(filepath_or_buffer="observations.csv")
df_observations.head(n=5)

Unnamed: 0,scientific_name,park_name,observations
0,Vicia benghalensis,Great Smoky Mountains National Park,68
1,Neovison vison,Great Smoky Mountains National Park,77
2,Prunus subcordata,Yosemite National Park,138
3,Abutilon theophrasti,Bryce National Park,84
4,Githopsis specularioides,Great Smoky Mountains National Park,85


In [7]:
# Print a structural summary of the observations frame (columns, non-null counts, dtypes).
df_observations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23296 entries, 0 to 23295
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   scientific_name  23296 non-null  object
 1   park_name        23296 non-null  object
 2   observations     23296 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 546.1+ KB


In [15]:
# Report how many rows each of the two frames holds, one line per frame.
print(
    f"Species: {len(df_species)}",
    f"Observations: {len(df_observations)}",
    sep="\n",
)

Species: 5824
Observations: 23296


## Exploring the Data

### Species

#### Category

In [39]:
# Count distinct scientific names and print the count with a thousands separator.
print(f"Number of Species: {df_species['scientific_name'].nunique():,.0f}")

Number of Species: 5,541


In [21]:
# Show how many distinct categories exist and list them in order of first appearance.
print(
    f"Number of Categories: {df_species['category'].nunique()}",
    f"Categories: {df_species['category'].unique()}",
    sep="\n",
)

Number of Categories: 7
Categories: ['Mammal' 'Bird' 'Reptile' 'Amphibian' 'Fish' 'Vascular Plant'
 'Nonvascular Plant']


In [32]:
# Number of species rows in each category, printed under a heading line.
print(
    "Grouping Species by ...",
    df_species.groupby(by="category").size(),
    sep="\n",
)

Grouping Species by ...
category
Amphibian              80
Bird                  521
Fish                  127
Mammal                214
Nonvascular Plant     333
Reptile                79
Vascular Plant       4470
dtype: int64


#### Conservation Status

In [26]:
# nunique() skips missing values while unique() keeps them, so the list
# printed below can contain one more entry (nan) than the count.
print(
    f"Number of Conservation Status: {df_species['conservation_status'].nunique()}",
    f"Conservation Status: {df_species['conservation_status'].unique()}",
    sep="\n",
)

Number of Conservation Status: 4
Conservation Status: [nan 'Species of Concern' 'Endangered' 'Threatened' 'In Recovery']


In [40]:
# Count the rows whose conservation status is missing.
print(f"nan values: {df_species['conservation_status'].isnull().sum():,.0f}")

nan values: 5,633


In [31]:
# Number of species rows per conservation status; groupby drops rows whose
# status is missing, so only the labelled statuses appear.
print(
    "Grouping Species by ...",
    df_species.groupby(by="conservation_status").size(),
    sep="\n",
)

Grouping Species by ...
conservation_status
Endangered             16
In Recovery             4
Species of Concern    161
Threatened             10
dtype: int64


### Observations

In [33]:
# Show how many distinct parks appear in the observations and list their names.
print(
    f"Number of parks: {df_observations['park_name'].nunique()}",
    f"Name of parks: {df_observations['park_name'].unique()}",
    sep="\n",
)

Number of parks: 4
Name of parks: ['Great Smoky Mountains National Park' 'Yosemite National Park'
 'Bryce National Park' 'Yellowstone National Park']


In [38]:
# Sum the observation counts across every row and print with a thousands separator.
print(f"Number of observations: {df_observations['observations'].sum():,.0f}")

Number of observations: 3,314,739


## Analysing the Data

Filling the `nan` values of `conservation_status` in the `Species` dataset with `No Intervention`

In [42]:
# Label missing conservation_status values as "No Intervention".
# The fill is restricted to this one column: a frame-wide fillna would also
# write the label into any other column that happens to contain missing values.
# Assigning the filled column back (instead of inplace=True) keeps the cell
# safe to re-run and avoids in-place mutation of the whole frame.
df_species["conservation_status"] = df_species["conservation_status"].fillna(
    "No Intervention"
)

# Species count per status, now including the "No Intervention" group.
df_species.groupby("conservation_status").size()

conservation_status
Endangered              16
In Recovery              4
No Intervention       5633
Species of Concern     161
Threatened              10
dtype: int64