# LIBRARIES



In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import pycountry 


#### Import ```attacks.csv```

In [2]:
pd.set_option ("display.max_columns", None) # Display all the columns
df = pd.read_csv("data/attacks.csv", encoding="latin1") # Read file

## Hipothesis 1: 
### Are sharks more aggressive according to the continent?
## Hipothesis 2: 
### Are sharks more aggressive according to the season of the year?
## Hipothesis 3: 
### Which sharks attack the most by continent and by season?

![shark](https://media.giphy.com/media/PfHrNe1cSKAjC/giphy.gif)

<div style="background-color: yellow;">
.
</div>

# 01.CLEANING 


In [3]:
df.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

#### 01.Remove the ```columns``` i don't need

In [4]:
df.drop(columns=["Investigator or Source", 
                 "pdf", 
                 "href formula", 
                 "href", 
                 "Case Number.1", 
                 "Case Number.2", 
                 "original order", 
                 "Unnamed: 22",
                 "Unnamed: 23",
                 "Name",
                 "Location",
                 "Case Number",
                 "Area",
                 "Age",
                 "Sex ",
                 "Time",
                 "Fatal (Y/N)",
                 "Type",
                 "Activity",
                 "Injury"], 
        axis=1, inplace=True)
df

Unnamed: 0,Date,Year,Country,Species
0,25-Jun-2018,2018.0,USA,White shark
1,18-Jun-2018,2018.0,USA,
2,09-Jun-2018,2018.0,USA,
3,08-Jun-2018,2018.0,AUSTRALIA,2 m shark
4,04-Jun-2018,2018.0,MEXICO,"Tiger shark, 3m"
...,...,...,...,...
25718,,,,
25719,,,,
25720,,,,
25721,,,,


#### 02. Remove the ```rows``` i don't need

In [5]:
df.dropna(axis = 0, how = 'all', inplace = True)
df

Unnamed: 0,Date,Year,Country,Species
0,25-Jun-2018,2018.0,USA,White shark
1,18-Jun-2018,2018.0,USA,
2,09-Jun-2018,2018.0,USA,
3,08-Jun-2018,2018.0,AUSTRALIA,2 m shark
4,04-Jun-2018,2018.0,MEXICO,"Tiger shark, 3m"
...,...,...,...,...
6297,Before 1903,0.0,AUSTRALIA,
6298,Before 1903,0.0,AUSTRALIA,
6299,1900-1905,0.0,USA,
6300,1883-1889,0.0,PANAMA,


<div style="background-color: yellow;">
.
</div>

# 02.TRANSFORMING

    (1) Continents:
        - Asia
        - Europe
        - North America
        - South America
        - Oceania
        - Africa
        
    (2) Seasons:
        ASIA, EUROPE, NORTH AMERICA
        - Winter: Dec(12), Jan(1), Feb(2)
        - Spring: Mar(3), Apr(4), May(5)
        - Summer: Jun(6), Jul(7), Aug(8)
        - Autumn: Sep(9), Oct(10), Nov(11)
        
        SOUTH AMERICA, AFRICA, OCEANIA
        - Winter: Dec(12), Jan(1), Feb(2)
        - Spring: Mar(3), Apr(4), May(5)
        - Summer: Jun(6), Jul(7), Aug(8)
        - Autumn: Sep(9), Oct(10), Nov(11)
        
    (3) Type Sharks:
        - White Shark: WHITE
        - Bull Shark: BULL
        - Mako Shark: MAKO
        - Tiger Shark: TIGER
        - Blacktip Shark: BLACKTIP


### (1) CONTINENTS

In [6]:
# First we remove the NaN values from the Country column, otherwise will get an error trying to capitalize floats.
df.dropna(subset=['Country'], inplace=True) 

# split(): the original string is split into words
# capitalize(): capitalizes the first letter of each word
# join(): the words are joined using the join() method.
df['Country'] = df['Country'].apply(lambda x: ' '.join([word.capitalize() for word in x.split()]))
df


Unnamed: 0,Date,Year,Country,Species
0,25-Jun-2018,2018.0,Usa,White shark
1,18-Jun-2018,2018.0,Usa,
2,09-Jun-2018,2018.0,Usa,
3,08-Jun-2018,2018.0,Australia,2 m shark
4,04-Jun-2018,2018.0,Mexico,"Tiger shark, 3m"
...,...,...,...,...
6297,Before 1903,0.0,Australia,
6298,Before 1903,0.0,Australia,
6299,1900-1905,0.0,Usa,
6300,1883-1889,0.0,Panama,


#### 02. Now we want to create a ```for loop``` to check if the elements of the ```column "Country"``` are detected by ```pycountry```.

In [7]:
valid_countries = []
for country in df['Country']:
    if pycountry.countries.get(name=country) is not None:
        valid_countries.append(country)

#### 03. Now we want the column ````"Country"``` to have only the values of the list ```valid_countries```.

In [28]:
df = df[df['Country'].isin(valid_countries)] #  using isin you can filter the rows according to whether the values belong to the valid_countries list or not.
df = df.reset_index()

In [29]:
country_to_continent = {
    'Australia': 'Oceania',
    'Mexico': 'North America',
    'Brazil': 'South America',
    'South Africa': 'Africa',
    'Thailand': 'Asia',
    'Costa Rica': 'North America',
    'Maldives': 'Asia',
    'Bahamas': 'North America',
    'New Caledonia': 'Oceania',
    'Ecuador': 'South America',
    'Malaysia': 'Asia',
    'Cuba': 'North America',
    'Mauritius': 'Africa',
    'New Zealand': 'Oceania',
    'Spain': 'Europe',
    'Samoa': 'Oceania',
    'Solomon Islands': 'Oceania',
    'Japan': 'Asia',
    'Egypt': 'Africa',
    'Comoros': 'Africa',
    'French Polynesia': 'Oceania',
    'United Kingdom': 'Europe',
    'United Arab Emirates': 'Asia',
    'Philippines': 'Asia',
    'Indonesia': 'Asia',
    'China': 'Asia',
    'Fiji': 'Oceania',
    'Dominican Republic': 'North America',
    'Cayman Islands': 'North America',
    'Aruba': 'North America',
    'Mozambique': 'Africa',
    'Puerto Rico': 'North America',
    'Italy': 'Europe',
    'Greece': 'Europe',
    'France': 'Europe',
    'Papua New Guinea': 'Oceania',
    'Kiribati': 'Oceania',
    'Israel': 'Asia',
    'Jamaica': 'North America',
    'Guam': 'Oceania',
    'Seychelles': 'Africa',
    'Belize': 'North America',
    'Nigeria': 'Africa',
    'Tonga': 'Oceania',
    'Canada': 'North America',
    'Croatia': 'Europe',
    'Saudi Arabia': 'Asia',
    'Chile': 'South America',
    'Kenya': 'Africa',
    'Malta': 'Europe',
    'Madagascar': 'Africa',
    'Panama': 'North America',
    'Somalia': 'Africa',
    'Norway': 'Europe',
    'Senegal': 'Africa',
    'Yemen': 'Asia',
    'Sierra Leone': 'Africa',
    'Liberia': 'Africa',
    'Vanuatu': 'Oceania',
    'Honduras': 'North America',
    'Sri Lanka': 'Asia',
    'Uruguay': 'South America',
    'India': 'Asia',
    'Marshall Islands': 'Oceania',
    'Hong Kong': 'Asia',
    'El Salvador': 'North America',
    'Angola': 'Africa',
    'Bermuda': 'North America',
    'Montenegro': 'Europe',
    'Tunisia': 'Africa',
    'Namibia': 'Africa',
    'Portugal': 'Europe',
    'Palau': 'Oceania',
    'Grenada': 'North America',
    'Turkey': 'Asia',
    'Singapore': 'Asia',
    'Sudan': 'Africa',
    'American Samoa': 'Oceania',
    'Argentina': 'South America',
    'Guatemala': 'North America',
    'Nicaragua': 'North America',
    'Iraq': 'Asia',
    'Iceland': 'Europe',
    'Barbados': 'North America',
    'Guyana': 'South America',
    'Haiti': 'North America',
    'Ireland': 'Europe',
    'Lebanon': 'Asia',
    'Paraguay': 'South America',
    'Georgia': 'Asia',
    'Guinea': 'Africa',
    'Cook Islands': 'Oceania',
    'Martinique': 'North America',
}

# create a new dictionary that maps each continent to a list of countries in that continent
continent_to_countries = {}

for country, continent in country_to_continent.items():
    if continent not in continent_to_countries:
        continent_to_countries[continent] = []
    continent_to_countries[continent].append(country)
    
#print(continent_to_countries)

In [30]:
df['Continent'] = df['Country'].map(country_to_continent).fillna('Unknown')
df

Unnamed: 0,index,Date,Year,Country,Species,Continent,Month,Season,Type Shark,Species.1
0,4,2018-06-04,2018.0,Mexico,"Tiger shark, 3m",North America,6,Summer,Tiger,"Tiger shark, 3m"
1,6,2018-06-03,2018.0,Brazil,Tiger shark,South America,6,Winter,Tiger,Tiger shark
2,18,2018-04-28,2018.0,Costa Rica,Tiger shark,North America,4,Spring,Tiger,Tiger shark
3,22,2018-04-24,2018.0,Australia,"White shark, 3.5 m",Oceania,4,Autumn,White Shark,"White shark, 3.5 m"
4,23,2018-04-23,2018.0,Maldives,Tiger shark,Asia,4,Spring,Tiger,Tiger shark
...,...,...,...,...,...,...,...,...,...,...
587,5489,1905-08-24,1905.0,Egypt,"Tiger shark, 3.9 m",Africa,8,Winter,Tiger,"Tiger shark, 3.9 m"
588,5545,1901-07-30,1901.0,South Africa,White shark,Africa,7,Winter,White Shark,White shark
589,5814,1880-11-25,1880.0,Australia,Bull shark,Oceania,11,Spring,Bull,Bull shark
590,5921,1868-09-01,1868.0,Italy,White shark,Europe,9,Autumn,White Shark,White shark


###  (2) SEASONS

#### 01. I need to create a ```column``` with the ```months``` of the year based on the column "Date"

In [31]:
# Convert the 'Date' column to datetime format. 
# errors='coerce' is used to convert any invalid value in NaT
df['Date'] = pd.to_datetime(df['Date'], errors='coerce') 

# Delete NaT values from Date
df.dropna(subset=['Date'], inplace=True)

# Create a column 'Month' with the values of the months
df['Month'] = df['Date'].dt.month 

df

Unnamed: 0,index,Date,Year,Country,Species,Continent,Month,Season,Type Shark,Species.1
0,4,2018-06-04,2018.0,Mexico,"Tiger shark, 3m",North America,6,Summer,Tiger,"Tiger shark, 3m"
1,6,2018-06-03,2018.0,Brazil,Tiger shark,South America,6,Winter,Tiger,Tiger shark
2,18,2018-04-28,2018.0,Costa Rica,Tiger shark,North America,4,Spring,Tiger,Tiger shark
3,22,2018-04-24,2018.0,Australia,"White shark, 3.5 m",Oceania,4,Autumn,White Shark,"White shark, 3.5 m"
4,23,2018-04-23,2018.0,Maldives,Tiger shark,Asia,4,Spring,Tiger,Tiger shark
...,...,...,...,...,...,...,...,...,...,...
587,5489,1905-08-24,1905.0,Egypt,"Tiger shark, 3.9 m",Africa,8,Winter,Tiger,"Tiger shark, 3.9 m"
588,5545,1901-07-30,1901.0,South Africa,White shark,Africa,7,Winter,White Shark,White shark
589,5814,1880-11-25,1880.0,Australia,Bull shark,Oceania,11,Spring,Bull,Bull shark
590,5921,1868-09-01,1868.0,Italy,White shark,Europe,9,Autumn,White Shark,White shark


#### 03. I need to create a ```column``` with the ```seasons``` of the year grouping the months in the column "Month"

In [32]:
def get_seasons(continent, month):
    
    if continent == 'North America':
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else:
            return 'Autumn'
        
    elif continent == 'South America':
        if month in [12, 1, 2]:
            return 'Summer'
        elif month in [3, 4, 5]:
            return 'Autumn'
        elif month in [6, 7, 8]:
            return 'Winter'
        else:
            return 'Spring'
        
    elif continent == 'Europe':
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else:
            return 'Autumn'
        
    elif continent == 'Africa':
        if month in [12, 1, 2]:
            return 'Summer'
        elif month in [3, 4, 5]:
            return 'Autumn'
        elif month in [6, 7, 8]:
            return 'Winter'
        else:
            return 'Spring'
        
    elif continent == 'Asia':
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else:
            return 'Autumn'
        
    elif continent == 'Oceania':
        if month in [12, 1, 2]:
            return 'Summer'
        elif month in [3, 4, 5]:
            return 'Autumn'
        elif month in [6, 7, 8]:
            return 'Winter'
        else:
            return 'Spring'


In [33]:
df['Season'] = df.apply(lambda x: get_seasons(x['Continent'], x['Month']), axis=1)
df.reset_index()
df_1 = df
df_1

Unnamed: 0,index,Date,Year,Country,Species,Continent,Month,Season,Type Shark,Species.1
0,4,2018-06-04,2018.0,Mexico,"Tiger shark, 3m",North America,6,Summer,Tiger,"Tiger shark, 3m"
1,6,2018-06-03,2018.0,Brazil,Tiger shark,South America,6,Winter,Tiger,Tiger shark
2,18,2018-04-28,2018.0,Costa Rica,Tiger shark,North America,4,Spring,Tiger,Tiger shark
3,22,2018-04-24,2018.0,Australia,"White shark, 3.5 m",Oceania,4,Autumn,White Shark,"White shark, 3.5 m"
4,23,2018-04-23,2018.0,Maldives,Tiger shark,Asia,4,Spring,Tiger,Tiger shark
...,...,...,...,...,...,...,...,...,...,...
587,5489,1905-08-24,1905.0,Egypt,"Tiger shark, 3.9 m",Africa,8,Winter,Tiger,"Tiger shark, 3.9 m"
588,5545,1901-07-30,1901.0,South Africa,White shark,Africa,7,Winter,White Shark,White shark
589,5814,1880-11-25,1880.0,Australia,Bull shark,Oceania,11,Spring,Bull,Bull shark
590,5921,1868-09-01,1868.0,Italy,White shark,Europe,9,Autumn,White Shark,White shark


### (3) TYPES OF SHARK

In [14]:
list(df['Species '].unique())

['2 m shark',
 'Tiger shark, 3m',
 nan,
 'Tiger shark',
 'Grey reef shark',
 'Shark involvement not confirmed',
 'Questionable',
 '3 m shark',
 'White shark, 3.5 m',
 'White shark, 2.5 m',
 'Juvenile bull shark',
 'Bull shark',
 'White shark',
 'Wobbegong shark',
 '3.5 m shark',
 '1.8 m shark',
 'Blacktip shark',
 'Juvenile white shark,  2.7 to 3.2 m',
 'Bull shark, 2 m',
 'Possibly a wobbegong',
 'Injury believed caused by an eel, not a shark',
 'Galapagos shark?',
 '2m shark',
 'Bull shark, 3 m ',
 'Grey reef shark. 2 m',
 'small shark',
 'Wobbegong shark?',
 'Juvenile nurse shark',
 "Nurse shark. 5'",
 'Tiger shark, female',
 'Wobbegong shark, 1 m',
 'White shark, 4.5 m',
 'Death may have been due to drowning',
 'Porbeagle, 1.5 m',
 'White shark, 3 to 3.5m ',
 'White shark, 3 m',
 'Shark involvement questionable',
 "5' shark",
 'Oceanic whitetip shark, 1.8 to 2 m',
 "Blue shark 6'",
 'Shark involvement prior to death not confirmed',
 'White shark, 4 m',
 'Seven-gill shark',
 "10' sh

#### 01. I need to create a ```column``` with the months of the year based on the column "Date"

In [15]:
df.dropna(subset=['Species '], inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['Species '], inplace=True)


In [16]:
mask_bull = df['Species '].str.contains('bull', case=False)
mask_mako = df['Species '].str.contains('mako', case=False)
mask_tiger = df['Species '].str.contains('tiger', case=False)
mask_blacktip = df['Species '].str.contains('blacktip', case=False)
mask_white = df['Species '].str.contains('white', case=False) & df['Species '].str.contains('shark', case=False)


df.loc[mask_bull, 'Type Shark'] = 'Bull'
df.loc[mask_mako, 'Type Shark'] = 'Mako'
df.loc[mask_tiger, 'Type Shark'] = 'Tiger'
df.loc[mask_blacktip, 'Type Shark'] = 'Blacktip'
df.loc[mask_white, 'Type Shark'] = 'White Shark'

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[mask_bull, 'Type Shark'] = 'Bull'


Unnamed: 0,Date,Year,Country,Species,Continent,Month,Season,Type Shark
3,2018-06-08,2018.0,Australia,2 m shark,Oceania,6,Winter,
4,2018-06-04,2018.0,Mexico,"Tiger shark, 3m",North America,6,Summer,Tiger
6,2018-06-03,2018.0,Brazil,Tiger shark,South America,6,Winter,Tiger
10,2018-05-24,2018.0,Australia,Grey reef shark,Oceania,5,Autumn,
16,2018-05-09,2018.0,Australia,Shark involvement not confirmed,Oceania,5,Autumn,
...,...,...,...,...,...,...,...,...
6009,1853-04-29,1853.0,Greece,234-lb shark,Europe,4,Spring,
6014,1852-12-19,1852.0,Australia,Shark involvement prior to death unconfirmed,Oceania,12,Summer,
6021,1852-02-26,1852.0,South Africa,White sharks,Africa,2,Summer,White Shark
6035,1849-01-27,1849.0,Australia,Shark involvement prior to death unconfirmed,Oceania,1,Summer,


In [17]:
df.dropna(subset=['Type Shark'], inplace=True) 
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['Type Shark'], inplace=True)


Unnamed: 0,Date,Year,Country,Species,Continent,Month,Season,Type Shark
4,2018-06-04,2018.0,Mexico,"Tiger shark, 3m",North America,6,Summer,Tiger
6,2018-06-03,2018.0,Brazil,Tiger shark,South America,6,Winter,Tiger
18,2018-04-28,2018.0,Costa Rica,Tiger shark,North America,4,Spring,Tiger
22,2018-04-24,2018.0,Australia,"White shark, 3.5 m",Oceania,4,Autumn,White Shark
23,2018-04-23,2018.0,Maldives,Tiger shark,Asia,4,Spring,Tiger
...,...,...,...,...,...,...,...,...
5489,1905-08-24,1905.0,Egypt,"Tiger shark, 3.9 m",Africa,8,Winter,Tiger
5545,1901-07-30,1901.0,South Africa,White shark,Africa,7,Winter,White Shark
5814,1880-11-25,1880.0,Australia,Bull shark,Oceania,11,Spring,Bull
5921,1868-09-01,1868.0,Italy,White shark,Europe,9,Autumn,White Shark


In [18]:
df_1

Unnamed: 0,Date,Year,Country,Species,Continent,Month,Season,Type Shark
4,2018-06-04,2018.0,Mexico,"Tiger shark, 3m",North America,6,Summer,Tiger
6,2018-06-03,2018.0,Brazil,Tiger shark,South America,6,Winter,Tiger
18,2018-04-28,2018.0,Costa Rica,Tiger shark,North America,4,Spring,Tiger
22,2018-04-24,2018.0,Australia,"White shark, 3.5 m",Oceania,4,Autumn,White Shark
23,2018-04-23,2018.0,Maldives,Tiger shark,Asia,4,Spring,Tiger
...,...,...,...,...,...,...,...,...
5489,1905-08-24,1905.0,Egypt,"Tiger shark, 3.9 m",Africa,8,Winter,Tiger
5545,1901-07-30,1901.0,South Africa,White shark,Africa,7,Winter,White Shark
5814,1880-11-25,1880.0,Australia,Bull shark,Oceania,11,Spring,Bull
5921,1868-09-01,1868.0,Italy,White shark,Europe,9,Autumn,White Shark


<div style="background-color: yellow;">
.
</div>

# 03.VISUALIZATION

    (1). BARPLOT?

In [19]:
#sns.countplot(x=df_months["Month"], palette="magma");

In [20]:
#sns.countplot(x=df_1["Month"], hue=df_1["Species "], palette="magma");

# DRAFT 


In [21]:
df['Species'] = df['Species '].str.replace(r'.*bull.*', 'BULL', case=False, regex=True) 
df['Species'] = df['Species '].str.replace(r'.*mako.*', 'MAKO', case=False, regex=True)
df['Species'] = df['Species '].str.replace(r'.*white shark.*', 'WHITE SHARK', case=False, regex=True)
df['Species'] = df['Species '].str.replace(r'.*tiger.*', 'TIGER', case=False, regex=True)
df['Species'] = df['Species '].str.replace(r'.*blacktip.*', 'BLACKTIP', case=False, regex=True)
df.sample()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Species'] = df['Species '].str.replace(r'.*bull.*', 'BULL', case=False, regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Species'] = df['Species '].str.replace(r'.*mako.*', 'MAKO', case=False, regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Species'] = df['Species '

Unnamed: 0,Date,Year,Country,Species,Continent,Month,Season,Type Shark,Species.1
2550,1992-06-17,1992.0,Japan,"White shark, identification by K. Nakaya",Asia,6,Summer,White Shark,"White shark, identification by K. Nakaya"


In [22]:
shark_species = ['MAKO', 'TIGER', 'WHITE SHARK', 'BULL', 'BLACKTIP']

df[df['Species '].isin(shark_species)]

Unnamed: 0,Date,Year,Country,Species,Continent,Month,Season,Type Shark,Species.1


In [23]:
list(df.columns)

['Date',
 'Year',
 'Country',
 'Species ',
 'Continent',
 'Month',
 'Season',
 'Type Shark',
 'Species']

In [24]:
df.isnull().sum()

Date          0
Year          0
Country       0
Species       0
Continent     0
Month         0
Season        0
Type Shark    0
Species       0
dtype: int64

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 592 entries, 4 to 6021
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        592 non-null    datetime64[ns]
 1   Year        592 non-null    float64       
 2   Country     592 non-null    object        
 3   Species     592 non-null    object        
 4   Continent   592 non-null    object        
 5   Month       592 non-null    int32         
 6   Season      592 non-null    object        
 7   Type Shark  592 non-null    object        
 8   Species     592 non-null    object        
dtypes: datetime64[ns](1), float64(1), int32(1), object(6)
memory usage: 43.9+ KB


In [26]:
df.Country.unique()

array(['Mexico', 'Brazil', 'Costa Rica', 'Australia', 'Maldives',
       'South Africa', 'Thailand', 'Bahamas', 'New Caledonia', 'Cuba',
       'Egypt', 'Indonesia', 'New Zealand', 'United Arab Emirates',
       'French Polynesia', 'France', 'Jamaica', 'Tonga',
       'Papua New Guinea', 'Seychelles', 'Malaysia', 'Guam', 'Croatia',
       'Fiji', 'Uruguay', 'United Kingdom', 'Italy', 'Mauritius',
       'Hong Kong', 'Japan', 'Chile', 'El Salvador', 'Spain', 'Portugal',
       'Israel', 'Mozambique', 'Palau', 'Greece', 'Bermuda', 'Turkey',
       'American Samoa', 'Marshall Islands', 'Malta', 'Montenegro',
       'Argentina', 'Canada', 'Panama', 'Nicaragua', 'Iraq'], dtype=object)

In [27]:
list(df.Time.unique()) # get the unique values

AttributeError: 'DataFrame' object has no attribute 'Time'

In [None]:
list1= list(df.Country.unique())
list1

In [None]:
country_name = 'MID ATLANTIC OCEAN'
country_obj = pycountry.countries.get(name=country_name)
if country_obj is not None:
    print(f"{country_name} EXISTS in pycountry")
else:
    print(f"{country_name} does not exist in pycountry")

In [None]:
df['Country'] = df['Country'].apply(lambda x: x.title())

# Filtering

In [None]:
df.Year == 

In [None]:
country_to_continent = {
    'Australia': ('Oceania', 'South'),
    'Mexico': ('North America', 'North'),
    'Brazil': ('South America', 'South'),
    'South Africa': ('Africa', 'South'),
    'Thailand': ('Asia', 'North'),
    'Costa Rica': ('North America', 'North'),
    'Maldives': ('Asia', 'North'),
    'Bahamas': ('North America', 'North'),
    'New Caledonia': ('Oceania', 'South'),
    'Ecuador': ('South America', 'South'),
    'Malaysia': ('Asia', 'North'),
    'Cuba': ('North America', 'North'),
    'Mauritius': ('Africa', 'South'),
    'New Zealand': ('Oceania', 'South'),
    'Spain': ('Europe', 'North'),
    'Samoa': ('Oceania', 'South'),
    'Solomon Islands': ('Oceania', 'South'),
    'Japan': ('Asia', 'North'),
    'Egypt': ('Africa', 'North'),
    'Comoros': ('Africa', 'South'),
    'French Polynesia': ('Oceania', 'South'),
    'United Kingdom': ('Europe', 'North'),
    'United Arab Emirates': ('Asia', 'North'),
    'Philippines': ('Asia', 'North'),
    'Indonesia': ('Asia', 'North'),
    'China': ('Asia', 'North'),
    'Fiji': ('Oceania', 'South'),
    'Dominican Republic': ('North America', 'North'),
    'Cayman Islands': ('North America', 'North'),
    'Aruba': ('North America', 'North'),
    'Mozambique': ('Africa', 'South'),
    'Puerto Rico': ('North America', 'North'),
    'Italy': ('Europe', 'North'),
    'Greece': ('Europe', 'North'),
    'France': ('Europe', 'North'),
    'Papua New Guinea': ('Oceania', 'South'),
    'Kiribati': ('Oceania', 'South'),
    'Israel': ('Asia', 'North'),
    'Jamaica': ('North America', 'North'),
    'Guam': ('Oceania', 'North'),
    'Seychelles': ('Africa', 'South'),
    'Belize': ('North America', 'North'),
    'Nigeria': ('Africa', 'North'),
    'Tonga': ('Oceania', 'South'),
    'Canada': ('North America', 'North'),
    'Croatia': ('Europe', 'North'),
    'Saudi Arabia': ('Asia', 'North'),
    'Chile': ('South America', 'South'),
    'Kenya': ('Africa', 'South'),
    'Malta': ('Europe', 'North'),
    'Madagascar': ('Africa', 'South'),
    'Panama': ('North America', 'North'),
    'Somalia': ('Africa', 'North'),
    'Norway': ('Europe', 'North'),
    'Senegal': ('Africa', 'North'),
    'Yemen': ('Asia', 'North'),
    'Sierra Leone': ('Africa', 'North'),
    'Liberia': ('Africa', 'North'),
    'Vanuatu': ('Oceania', 'South'),
    'Honduras': ('North America', 'North'),
    'Sri Lanka': ('Asia', 'North'),
    'Uruguay': ('South America', 'South'),
    'India': ('Asia', 'North'),
    'Marshall Islands': ('Oceania', 'North'),
    'Hong Kong': ('Asia', 'North'),
    'El Salvador': ('North America', 'North'),
    'Angola': ('Africa', 'South'),
    'Bermuda': ('North America', 'North'),
    'Montenegro': ('Europe', 'North'),
    'Tunisia': ('Africa', 'North'),
    'Namibia': ('Africa', 'South'),
    'Portugal': ('Europe', 'North'),
    'Palau': ('Oceania', 'North'),
    'Grenada': ('North America', 'North'),
    'Turkey': ('Asia', 'North'),
    'Singapore': ('Asia', 'North'),
    'Sudan': ('Africa', 'North'),
    'American Samoa': ('Oceania', 'South'),
    'Argentina': ('South America', 'South'),
    'Guatemala': ('North America', 'North'),
    'Nicaragua': ('North America', 'North'),
    'Iraq': ('Asia', 'North'),
    'Iceland': ('Europe', 'North'),
    'Barbados': ('North America', 'North'),
    'Guyana': ('South America', 'South'),
    'Haiti': ('North America', 'North'),
    'Ireland': ('Europe', 'North'),
    'Lebanon': ('Asia', 'North'),
    'Paraguay': ('South America', 'South'),
    'Georgia': ('Asia', 'North'),
    'Guinea': ('Africa', 'North'),
    'Cook Islands': ('Oceania', 'South'),
    'Martinique': ('North America', 'North')
}