In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import string
from collections import Counter
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from scipy.stats import chi2_contingency



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the park observations table and preview its first rows.
observations_csv = '/kaggle/input/biodiversity-in-national-parks-project/observations.csv'
observations = pd.read_csv(observations_csv)
observations.head()

In [None]:
# Load the species table and preview its first rows.
species_info = pd.read_csv('/kaggle/input/biodiversity-in-national-parks-project/species_info.csv')
# A missing conservation_status is replaced by the explicit label
# 'Without Concern', which later cells compare against.
# The filled column is assigned back rather than calling
# `species_info.conservation_status.fillna(..., inplace=True)`: that form
# mutates an intermediate Series, which pandas warns about and which does not
# update the frame once Copy-on-Write is enabled.
species_info['conservation_status'] = species_info['conservation_status'].fillna('Without Concern')
species_info.head()

# **What is the distribution of conservation status for species?**

In [None]:
# Keep only the species whose status is something other than the
# 'Without Concern' filler label.
has_status = species_info['conservation_status'] != 'Without Concern'
needconserv = species_info[has_status]

# Number of scientific_name entries per (conservation_status, category) pair.
conservationCategory = (
    needconserv
    .groupby(['conservation_status', 'category'])['scientific_name']
    .count()
    .reset_index()
)
conservationCategory

In [None]:
# Step histogram of species categories, one outline per conservation status.
plt.figure(figsize=(8, 5))
ax = sns.histplot(
    data=needconserv,
    x='category',
    hue='conservation_status',
    element='step',
    discrete=False,
    common_norm=False,
)
# Tilt the category labels so they do not overlap.
plt.xticks(rotation=30)

Based on the graph, most categories fall under the 'Species of Concern' conservation status. There are also some 'Endangered' and 'Threatened' species, but not many. The 'In Recovery' status contains only two categories: Bird and Mammal.
We can conclude that there are many species whose habitats need to be considered.

# Are certain types of species more likely to be endangered?

# Flag every species whose status is anything other than the 'Without Concern'
# filler label. This assignment had been commented out, which left the
# 'is_protected' column undefined for the groupby below.
species_info['is_protected'] = species_info['conservation_status'] != 'Without Concern'

# Count distinct scientific names per (category, is_protected) pair, then
# spread the two flag values into side-by-side columns.
category_isprotec = (
    species_info
    .groupby(['category', 'is_protected'])['scientific_name']
    .nunique()
    .reset_index()
    .pivot(index='category', columns='is_protected', values='scientific_name')
    .reset_index()
)
# NOTE(review): the positional rename assumes both flag values occur and that
# the False column comes before the True column — confirm on the data.
category_isprotec.columns = ['category', 'not_protected', 'protected']
category_isprotec['percent_protected'] = (
    category_isprotec.protected
    / (category_isprotec.protected + category_isprotec.not_protected)
    * 100
)
category_isprotec

From the table above we can see that Bird, Mammal and Vascular Plant have the largest numbers in the protected column, which suggests that these categories are more likely to be endangered.

# Are the differences between species and their conservation status significant?

This section will run some chi-squared tests to see if different species have statistically significant differences in conservation status rates. In order to run a chi squared test, a contingency table will need to be created

First we will test the difference in conservation status between Bird and Fish, i.e. whether Birds are significantly more likely to be protected than Fish.

Null Hypothesis: category (Bird vs. Fish) and protection status are not related, i.e. the variables are independent.

Alternative Hypothesis: category (Bird vs. Fish) and protection status are related, i.e. the variables are dependent.

The standard P-Value to test statistical significance is 0.05.

In [None]:
# 2x2 contingency table for the Bird vs. Fish comparison.
# NOTE(review): columns are presumably [protected, not protected] — confirm
# against the category_isprotec table.
bird_counts = [75, 413]
fish_counts = [11, 115]
birdfish_cross = [bird_counts, fish_counts]

# Chi-squared test of independence; only the p-value is reported.
chi2, pval, dof, expected = chi2_contingency(birdfish_cross)
print(pval)

Above we can see that the p-value for Bird vs. Fish is 0.076, which is above 0.05. We therefore fail to reject the null hypothesis: there does not seem to be a significant relationship between category and conservation status for this pair.

The next pair to test is Nonvascular Plant and Reptile.

In [None]:
# 2x2 contingency table for the Nonvascular Plant vs. Reptile comparison.
# NOTE(review): columns are presumably [protected, not protected] — confirm
# against the category_isprotec table.
nonvascular_counts = [5, 328]
reptile_counts = [5, 73]
nonvpreptil_cross = [nonvascular_counts, reptile_counts]

# Chi-squared test of independence; the p-value is the cell's displayed result.
chi2, pval2, dof, expected = chi2_contingency(nonvpreptil_cross)
pval2

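This time the p-value is 0.033, which is below the standard threshold of 0.05, so the difference between Nonvascular Plants and Reptiles is statistically significant. Note the direction: reading the first column of the table as the protected count, Reptiles have the higher protected share (5 of 78, about 6.4%) compared with Nonvascular Plants (5 of 333, about 1.5%).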

# Which animal is most prevalent and what is their distribution amongst parks?

In [None]:
def remove_punc(text):
    """Return `text` with every character in `string.punctuation` removed."""
    # A translation table whose third argument lists characters to delete.
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

def most_prev(cat, head=5):
    """Return the most frequent words in the common names of one category.

    Each row of `species_info` whose `category` equals `cat` contributes every
    distinct word of its `common_names` (punctuation stripped, split on
    whitespace) once, so a word repeated inside one row is not counted twice.

    Parameters
    ----------
    cat : value compared against `species_info.category`.
    head : int, default 5
        Number of leading rows to return (passed to `DataFrame.head`).

    Returns
    -------
    pandas.DataFrame
        Columns 'Word' and 'Total', sorted by 'Total' in descending order.
        Words with equal totals keep the order in which they were first seen.
    """
    in_category = species_info[species_info.category == cat]
    # A missing common name cannot be tokenised, so it is skipped.
    name_words = in_category.common_names.dropna().apply(remove_punc).str.split()

    # dict.fromkeys drops repeats within one row while keeping word order.
    # The original built this de-duplicated list but then counted the raw
    # rows by mistake.
    unique_per_row = (dict.fromkeys(words) for words in name_words)
    totals = Counter(chain.from_iterable(unique_per_row))

    # most_common() sorts by count in one pass, replacing the quadratic
    # list.count loop and the arbitrary ordering of a set.
    animal = pd.DataFrame(totals.most_common(), columns=['Word', 'Total'])

    return animal.head(head)

In [None]:
# Top words (default: five rows) in the common names of the 'Vascular Plant' category.
vasc_plant = most_prev(cat='Vascular Plant')
vasc_plant

From this analysis, Sedge appears to be the most prevalent word in the common names of the Vascular Plant category.

In [None]:
# Mark the species whose common names contain the whole word 'Sedge'.
sedge_flag = species_info.common_names.str.contains(r'\bSedge\b', regex=True)
species_info['is_sedge'] = sedge_flag

# Join the observations to the flagged species only. No key is given, so
# merge() uses the columns the two frames have in common.
sedge_species = species_info[species_info.is_sedge]
sedge_observation = observations.merge(sedge_species)
sedge_observation.head()

In [None]:
# Number of sedge rows (non-null common_names) per park.
rows_per_park = sedge_observation.groupby('park_name')['common_names'].count()
park_dist = rows_per_park.reset_index()
park_dist

In [None]:
# Bar chart of the per-park sedge counts held in park_dist.
plt.figure(figsize=(8, 5))
sns.barplot(
    data=park_dist,
    x='park_name',
    y='common_names',
)
# Tilt the park names so they do not overlap.
plt.xticks(rotation=20)

## Conclusions

The project was able to make several data visualizations and inferences about the various species in four of the National Parks that comprised this data set.

This project was also able to answer some of the questions first posed in the beginning:

- What is the distribution of conservation status for species?
    - The vast majority of species were not part of conservation.(5,633 vs 191)
- Are certain types of species more likely to be endangered?
    - Mammals and Birds had the highest percentage of being in protection.
- Are the differences between species and their conservation status significant?
    - While Birds and Fish did not have a significant difference in conservation percentage, Nonvascular Plants and Reptiles exhibited a statistically significant difference.
- Which animal is most prevalent and what is their distribution amongst parks?
    - The study found that, in the Vascular Plant category, Sedge occurred most often, and its distribution amongst parks seems roughly equal.