# LIBRARIES



In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

#### Import ```attacks.csv```

In [2]:
pd.set_option ("display.max_columns", None) # Display all the columns
df = pd.read_csv("data/attacks.csv", encoding="latin1") # Read file

![shark](https://media.giphy.com/media/PfHrNe1cSKAjC/giphy.gif)

# 01.CLEANING 


In [3]:
df.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [4]:
df.Country.value_counts()

Country
USA                       2229
AUSTRALIA                 1338
SOUTH AFRICA               579
PAPUA NEW GUINEA           134
NEW ZEALAND                128
                          ... 
MALDIVE ISLANDS              1
NICARAGUA                    1
NORTH SEA                    1
RED SEA / INDIAN OCEAN       1
CEYLON (SRI LANKA)           1
Name: count, Length: 212, dtype: int64

#### 01.Remove the ```columns```  we don't need.

In [5]:
df.drop(columns=["Investigator or Source", 
                 "pdf", 
                 "href formula", 
                 "href", 
                 "Case Number.1", 
                 "Case Number.2", 
                 "original order", 
                 "Unnamed: 22",
                 "Unnamed: 23",
                 "Name",
                 "Location",
                 "Case Number",
                 "Area",
                 "Age",
                 "Sex ",
                 "Time",
                 "Fatal (Y/N)",
                 "Type",
                 "Year",
                 "Activity",
                 "Species ",
                 "Injury"], 
        axis=1, inplace=True)

#### 02. Remove the ```rows``` we don't need

In [6]:
df.dropna(axis = 0, how = 'all', inplace = True)
df

Unnamed: 0,Date,Country
0,25-Jun-2018,USA
1,18-Jun-2018,USA
2,09-Jun-2018,USA
3,08-Jun-2018,AUSTRALIA
4,04-Jun-2018,MEXICO
...,...,...
6297,Before 1903,AUSTRALIA
6298,Before 1903,AUSTRALIA
6299,1900-1905,USA
6300,1883-1889,PANAMA


# 02.TRANSFORMING
### (1) CONTINENTS - HEMISPHERE
### (2) SEASON
### (2) TYPE SHARK

    (1) Continents:
        - Asia
        - Europe
        - North America
        - South America
        - Oceania
        - Africa
        
    (2) Seasons:
        ASIA, EUROPE, NORTH AMERICA
        - Winter: Dec(12), Jan(1), Feb(2)
        - Spring: Mar(3), Apr(4), May(5)
        - Summer: Jun(6), Jul(7), Aug(8)
        - Autumn: Sep(9), Oct(10), Nov(11)
        
        SOUTH AMERICA, AFRICA, OCEANIA
        - Winter: Dec(12), Jan(1), Feb(2)
        - Spring: Mar(3), Apr(4), May(5)
        - Summer: Jun(6), Jul(7), Aug(8)
        - Autumn: Sep(9), Oct(10), Nov(11)
        
    (3) Type Sharks:
        - White Shark: WHITE
        - Bull Shark: BULL
        - Mako Shark: MAKO
        - Tiger Shark: TIGER
        - Blacktip Shark: BLACKTIP


### (1) COUNTRY to CONTINENTS to HEMISPHERE
We want to create a column called "Continents" according to the name of the countries in the "Country" column.

#### 01. First the ```formatting``` of the countries will be modified to ```capitalize the first letter of each word```.

In [7]:
# Remove the NaN values from the Country column,
# otherwise will get an error trying to capitalize floats.
df.dropna(subset=['Country'], inplace=True) 

# split(): the original string is split into words
# capitalize(): capitalizes the first letter of each word
# join(): the words are joined again
df['Country'] = df['Country'].apply(lambda x: ' '.join([word.capitalize() for word in x.split()]))
df

Unnamed: 0,Date,Country
0,25-Jun-2018,Usa
1,18-Jun-2018,Usa
2,09-Jun-2018,Usa
3,08-Jun-2018,Australia
4,04-Jun-2018,Mexico
...,...,...
6297,Before 1903,Australia
6298,Before 1903,Australia
6299,1900-1905,Usa
6300,1883-1889,Panama


#### 02. The ```continent``` must now be associated with each ```country```. For this we will create a ```dictionary``` associating each country to its continent.

In [8]:
list_1 = df["Country"].unique()
list_1

array(['Usa', 'Australia', 'Mexico', 'Brazil', 'England', 'South Africa',
       'Thailand', 'Costa Rica', 'Maldives', 'Bahamas', 'New Caledonia',
       'Ecuador', 'Malaysia', 'Libya', 'Cuba', 'Mauritius', 'New Zealand',
       'Spain', 'Samoa', 'Solomon Islands', 'Japan', 'Egypt',
       'St Helena, British Overseas Territory', 'Comoros', 'Reunion',
       'French Polynesia', 'United Kingdom', 'United Arab Emirates',
       'Philippines', 'Indonesia', 'China', 'Columbia', 'Cape Verde',
       'Fiji', 'Dominican Republic', 'Cayman Islands', 'Aruba',
       'Mozambique', 'Puerto Rico', 'Italy', 'Atlantic Ocean', 'Greece',
       'St. Martin', 'France', 'Papua New Guinea', 'Trinidad & Tobago',
       'Kiribati', 'Israel', 'Diego Garcia', 'Taiwan', 'Jamaica',
       'Palestinian Territories', 'Guam', 'Seychelles', 'Belize',
       'Nigeria', 'Tonga', 'Scotland', 'Canada', 'Croatia',
       'Saudi Arabia', 'Chile', 'Antigua', 'Kenya', 'Russia',
       'Turks & Caicos', 'United Arab Emirat

In [9]:
len(df["Country"].unique())

201

In [10]:
country_to_continent = {
    'Usa': 'North America',
    'Australia': 'Oceania',
    'Mexico': 'North America',
    'Brazil': 'South America',
    'South Africa': 'Africa',
    'Thailand': 'Asia',
    'Costa Rica': 'North America',
    'Maldives': 'Asia',
    'Bahamas': 'North America',
    'New Caledonia': 'Oceania',
    'Ecuador': 'South America',
    'Malaysia': 'Asia',
    'Cuba': 'North America',
    'Mauritius': 'Africa',
    'New Zealand': 'Oceania',
    'Spain': 'Europe',
    'Samoa': 'Oceania',
    'Solomon Islands': 'Oceania',
    'Japan': 'Asia',
    'Egypt': 'Africa',
    'Comoros': 'Africa',
    'French Polynesia': 'Oceania',
    'United Kingdom': 'Europe',
    'United Arab Emirates': 'Asia',
    'Philippines': 'Asia',
    'Indonesia': 'Asia',
    'China': 'Asia',
    'Fiji': 'Oceania',
    'Dominican Republic': 'North America',
    'Cayman Islands': 'North America',
    'Aruba': 'North America',
    'Mozambique': 'Africa',
    'Puerto Rico': 'North America',
    'Italy': 'Europe',
    'Greece': 'Europe',
    'France': 'Europe',
    'Papua New Guinea': 'Oceania',
    'Kiribati': 'Oceania',
    'Israel': 'Asia',
    'Jamaica': 'North America',
    'Guam': 'Oceania',
    'Seychelles': 'Africa',
    'Belize': 'North America',
    'Nigeria': 'Africa',
    'Tonga': 'Oceania',
    'Canada': 'North America',
    'Croatia': 'Europe',
    'Saudi Arabia': 'Asia',
    'Chile': 'South America',
    'Kenya': 'Africa',
    'Malta': 'Europe',
    'Madagascar': 'Africa',
    'Panama': 'North America',
    'Somalia': 'Africa',
    'Norway': 'Europe',
    'Senegal': 'Africa',
    'Yemen': 'Asia',
    'Sierra Leone': 'Africa',
    'Liberia': 'Africa',
    'Vanuatu': 'Oceania',
    'Honduras': 'North America',
    'Sri Lanka': 'Asia',
    'Uruguay': 'South America',
    'India': 'Asia',
    'Marshall Islands': 'Oceania',
    'Hong Kong': 'Asia',
    'El Salvador': 'North America',
    'Angola': 'Africa',
    'Bermuda': 'North America',
    'Montenegro': 'Europe',
    'Tunisia': 'Africa',
    'Namibia': 'Africa',
    'Portugal': 'Europe',
    'Palau': 'Oceania',
    'Grenada': 'North America',
    'Turkey': 'Asia',
    'Singapore': 'Asia',
    'Sudan': 'Africa',
    'American Samoa': 'Oceania',
    'Argentina': 'South America',
    'Guatemala': 'North America',
    'Nicaragua': 'North America',
    'Iraq': 'Asia',
    'Iceland': 'Europe',
    'Barbados': 'North America',
    'Guyana': 'South America',
    'Haiti': 'North America',
    'Ireland': 'Europe',
    'Lebanon': 'Asia',
    'Paraguay': 'South America',
    'Georgia': 'Asia',
    'Guinea': 'Africa',
    'Cook Islands': 'Oceania',
    'Martinique': 'North America',
    'Libya': 'Africa',
    'Djibouti': 'Africa',
    'St. Maartin': 'North America',
    'Caribbean Sea': 'North America',
    'Palestinian Territories': 'Asia',
    'Pacific Ocean': 'Oceania',
    'Italy / Croatia': 'Europe',
    'Scotland': 'Europe',
    'British New Guinea': 'Oceania',
    'Red Sea / Indian Ocean': 'Asia',
    'Colombia': 'South America',
    'British Virgin Islands': 'North America',
    'Iran': 'Asia',
    'Nevis': 'North America',
    'St Helena, British Overseas Territory': 'Africa',
    'Venezuela': 'South America',
    'England': 'Europe',
    'Tanzania': 'Africa',
    'Netherlands Antilles': 'North America',
    'Vietnam': 'Asia',
    'West Indies': 'North America',
    'New Guinea': 'Oceania',
    'Columbia': 'South America',
    'Red Sea': 'Asia',
    'Java': 'Asia',
    'Mediterranean Sea': 'Europe',
    'Andaman / Nicobar Islandas': 'Asia',
    'Bangladesh': 'Asia',
    'Russia': 'Europe',
    'Micronesia': 'Oceania',
    'Falkland Islands': 'South America',
    'Maldive Islands': 'Asia',
    'Okinawa': 'Asia',
    'Sudan?': 'Africa',
    'Northern Arabian Sea': 'Asia',
    'Tasman Sea': 'Oceania',
    'Antigua': 'North America',
    'Bay Of Bengal': 'Asia',
    'Admiralty Islands': 'Oceania',
    'Kuwait': 'Asia',
    'San Domingo': 'North America',
    'The Balkans': 'Europe',
    'Indian Ocean': 'Asia',
    'Peru': 'South America',
    'New Britain': 'Oceania',
    'Azores': 'Europe',
    'St. Martin': 'North America',
    'Algeria': 'Africa',
    'Bahrein': 'Asia',
    'Burma': 'Asia',
    'North Atlantic Ocean': 'North America',
    'Federated States Of Micronesia': 'Oceania',
    'Taiwan': 'Asia',
    'Persian Gulf': 'Asia',
    'Tobago': 'South America',
    'Central Pacific': 'Oceania',
    'South Pacific Ocean': 'Oceania',
    'Between Portugal & India': 'Europe',
    'South China Sea': 'Asia',
    'Andaman Islands': 'Asia',
    'Africa': 'Africa',
    'Ceylon': 'Asia',
    'Iran / Iraq': 'Asia',
    'Cape Verde': 'Africa',
    'North Sea': 'Europe',
    'Greenland': 'North America',
    'Gabon': 'Africa',
    'Gulf Of Aden': 'Asia',
    'Western Samoa': 'Oceania',
    'South Atlantic Ocean': 'South America',
    'Equatorial Guinea / Cameroon': 'Africa',
    'Ghana': 'Africa',
    'Ceylon (sri Lanka)': 'Asia',
    'Roatan': 'North America',
    'Reunion': 'Africa',
    'Tuvalu': 'Oceania',
    'Mid-pacifc Ocean': 'Oceania',
    'Turks & Caicos': 'North America',
    'Reunion Island': 'Africa',
    'Johnston Island': 'Oceania',
    'Sweden': 'Europe',
    'Egypt / Israel': 'Asia',
    'Slovenia': 'Europe',
    'North Pacific Ocean': 'North America',
    'Indian Ocean?': 'Asia',
    'Northern Mariana Islands': 'Oceania',
    'Crete': 'Europe',
    'South Korea': 'Asia',
    'Red Sea?': 'Asia',
    'United Arab Emirates (uae)': 'Asia',
    'British West Indies': 'North America',
    'Monaco': 'Europe',
    'Mayotte': 'Africa',
    'Korea': 'Asia',
    'Grand Cayman': 'North America',
    'Trinidad & Tobago': 'South America',
    'Syria': 'Asia',
    'Diego Garcia': 'Asia',
    'Coast Of Africa': 'Africa',
    'Southwest Pacific Ocean': 'Oceania',
    'Cyprus': 'Asia',
    'British Isles': 'Europe',
    'Solomon Islands / Vanuatu': 'Oceania',
    'Curacao': 'North America'

}

In [11]:
list_2 = list(country_to_continent)
list_2

['Usa',
 'Australia',
 'Mexico',
 'Brazil',
 'South Africa',
 'Thailand',
 'Costa Rica',
 'Maldives',
 'Bahamas',
 'New Caledonia',
 'Ecuador',
 'Malaysia',
 'Cuba',
 'Mauritius',
 'New Zealand',
 'Spain',
 'Samoa',
 'Solomon Islands',
 'Japan',
 'Egypt',
 'Comoros',
 'French Polynesia',
 'United Kingdom',
 'United Arab Emirates',
 'Philippines',
 'Indonesia',
 'China',
 'Fiji',
 'Dominican Republic',
 'Cayman Islands',
 'Aruba',
 'Mozambique',
 'Puerto Rico',
 'Italy',
 'Greece',
 'France',
 'Papua New Guinea',
 'Kiribati',
 'Israel',
 'Jamaica',
 'Guam',
 'Seychelles',
 'Belize',
 'Nigeria',
 'Tonga',
 'Canada',
 'Croatia',
 'Saudi Arabia',
 'Chile',
 'Kenya',
 'Malta',
 'Madagascar',
 'Panama',
 'Somalia',
 'Norway',
 'Senegal',
 'Yemen',
 'Sierra Leone',
 'Liberia',
 'Vanuatu',
 'Honduras',
 'Sri Lanka',
 'Uruguay',
 'India',
 'Marshall Islands',
 'Hong Kong',
 'El Salvador',
 'Angola',
 'Bermuda',
 'Montenegro',
 'Tunisia',
 'Namibia',
 'Portugal',
 'Palau',
 'Grenada',
 'Turkey',

#### 03. We will check which values we have left as invalid

In [12]:
non_valid_countries = set(list_1) - set(list_2)
print('Elements not in list 1:', list(non_valid_countries))

Elements not in list 1: ['Mid Atlantic Ocean', 'Ocean', 'Asia?', 'Atlantic Ocean']


#### 04. Now we will create a for ```loop``` to store in a new list the continents according to the country.

In [13]:
# key: country
# value: continent
# list: continents
continent_to_countries = {}

for country, continent in country_to_continent.items():
    if continent in continent_to_countries:
        continent_to_countries[continent].append(country)
    else:
        continent_to_countries[continent] = [country]

#### 05. We have to ```map``` each country with its corresponding continent.

In [14]:
# we add fillna(np.nan) to fill the missing values with NaN
df['Continent'] = df['Country'].map(country_to_continent).fillna(np.nan) 

In [15]:
# We delete NaN values from the column "Country"
df.dropna(subset=['Country'], inplace=True) 

In [16]:
df

Unnamed: 0,Date,Country,Continent
0,25-Jun-2018,Usa,North America
1,18-Jun-2018,Usa,North America
2,09-Jun-2018,Usa,North America
3,08-Jun-2018,Australia,Oceania
4,04-Jun-2018,Mexico,North America
...,...,...,...
6297,Before 1903,Australia,Oceania
6298,Before 1903,Australia,Oceania
6299,1900-1905,Usa,North America
6300,1883-1889,Panama,North America


In [17]:
pd.isnull(df).sum()

Date          0
Country       0
Continent    24
dtype: int64

#### 06. We have to create a ```function``` to assign the ```hemisphere``` corresponding to each continent.

In [18]:
def get_hemisphere(continent):

    if continent == 'North America':
        return 'North'
    elif continent == 'Europe':
        return 'North'
    elif continent == 'Asia':
        return 'North'
    
    elif continent == 'South America':
        return 'South'
    elif continent == 'Oceania':
        return 'South'
    elif continent == 'Africa':
        return 'South'
    else:
        return None        

In [19]:
df['Hemisphere'] = df.apply(lambda x: get_hemisphere(x['Continent']), axis=1)
df

Unnamed: 0,Date,Country,Continent,Hemisphere
0,25-Jun-2018,Usa,North America,North
1,18-Jun-2018,Usa,North America,North
2,09-Jun-2018,Usa,North America,North
3,08-Jun-2018,Australia,Oceania,South
4,04-Jun-2018,Mexico,North America,North
...,...,...,...,...
6297,Before 1903,Australia,Oceania,South
6298,Before 1903,Australia,Oceania,South
6299,1900-1905,Usa,North America,North
6300,1883-1889,Panama,North America,North


###  (2) SEASONS

#### 01. I need to create a ```column``` with the ```months``` of the year based on the column "Date"

In [20]:
# Convert the 'Date' column to datetime format. 
# errors='coerce' is used to convert any invalid value in NaT = missing date
df['Date'] = pd.to_datetime(df['Date'], errors='coerce') 

# Delete NaT values from Date
df.dropna(subset=['Date'], inplace=True)

# Create a column 'Month' with the values of the months
df['Month'] = df['Date'].dt.month 
# df['Year'] = df['Date'].dt.year
df.drop(columns=["Date"], axis=1, inplace=True)
df

Unnamed: 0,Country,Continent,Hemisphere,Month
0,Usa,North America,North,6
1,Usa,North America,North,6
2,Usa,North America,North,6
3,Australia,Oceania,South,6
4,Mexico,North America,North,6
...,...,...,...,...
6140,Jamaica,North America,North,12
6141,Martinique,North America,North,3
6142,Usa,North America,North,8
6151,Usa,North America,North,7


#### 03. I need to create a ```column``` with the ```seasons``` of the year grouping the months. It must be taken into account that the seasons change according to the continent.

In [21]:
# We'll create a function.
def get_seasons(continent, month):
    
    if continent == 'North America':
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else:
            return 'Autumn'
        
    elif continent == 'South America':
        if month in [12, 1, 2]:
            return 'Summer'
        elif month in [3, 4, 5]:
            return 'Autumn'
        elif month in [6, 7, 8]:
            return 'Winter'
        else:
            return 'Spring'
        
    elif continent == 'Europe':
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else:
            return 'Autumn'
        
    elif continent == 'Africa':
        if month in [12, 1, 2]:
            return 'Summer'
        elif month in [3, 4, 5]:
            return 'Autumn'
        elif month in [6, 7, 8]:
            return 'Winter'
        else:
            return 'Spring'
        
    elif continent == 'Asia':
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else:
            return 'Autumn'
        
    elif continent == 'Oceania':
        if month in [12, 1, 2]:
            return 'Summer'
        elif month in [3, 4, 5]:
            return 'Autumn'
        elif month in [6, 7, 8]:
            return 'Winter'
        else:
            return 'Spring'
    else:
        return None


#### 04. Now we have to create a new column ```Season``` storing the values from the function ```Get_Season```.

In [22]:
df['Season'] = df.apply(lambda x: get_seasons(x['Continent'], x['Month']), axis=1)
df.reset_index()

Unnamed: 0,index,Country,Continent,Hemisphere,Month,Season
0,0,Usa,North America,North,6,Summer
1,1,Usa,North America,North,6,Summer
2,2,Usa,North America,North,6,Summer
3,3,Australia,Oceania,South,6,Winter
4,4,Mexico,North America,North,6,Summer
...,...,...,...,...,...,...
4743,6140,Jamaica,North America,North,12,Winter
4744,6141,Martinique,North America,North,3,Spring
4745,6142,Usa,North America,North,8,Summer
4746,6151,Usa,North America,North,7,Summer


In [23]:
df = df.dropna(subset=["Continent"])

In [24]:
pd.isnull(df).sum()

Country       0
Continent     0
Hemisphere    0
Month         0
Season        0
dtype: int64

In [25]:
df

Unnamed: 0,Country,Continent,Hemisphere,Month,Season
0,Usa,North America,North,6,Summer
1,Usa,North America,North,6,Summer
2,Usa,North America,North,6,Summer
3,Australia,Oceania,South,6,Winter
4,Mexico,North America,North,6,Summer
...,...,...,...,...,...
6140,Jamaica,North America,North,12,Winter
6141,Martinique,North America,North,3,Spring
6142,Usa,North America,North,8,Summer
6151,Usa,North America,North,7,Summer


In [26]:
#df.to_csv("data/continent_season.csv", index=False)

### (3) TYPES OF SHARK

In [27]:
list(df['Species '].unique())

KeyError: 'Species '

#### 01. I need to create a ```column``` with the months of the year based on the column "Date"

In [None]:
df.dropna(subset=['Species '], inplace=True)

In [None]:
mask_bull = df['Species '].str.contains('bull', case=False)
mask_mako = df['Species '].str.contains('mako', case=False)
mask_tiger = df['Species '].str.contains('tiger', case=False)
mask_blacktip = df['Species '].str.contains('blacktip', case=False)
mask_white = df['Species '].str.contains('white', case=False) & df['Species '].str.contains('shark', case=False)


df.loc[mask_bull, 'Type Shark'] = 'Bull'
df.loc[mask_mako, 'Type Shark'] = 'Mako'
df.loc[mask_tiger, 'Type Shark'] = 'Tiger'
df.loc[mask_blacktip, 'Type Shark'] = 'Blacktip'
df.loc[mask_white, 'Type Shark'] = 'White Shark'

df

In [None]:
df = df.dropna(subset=["Type Shark"])

In [None]:
df = df.drop('Species ', axis=1)
df

In [None]:
pd.isnull(df).sum()

In [None]:
#df.to_csv("data/continent_season_type-shark.csv", index=False)

<div style="background-color: yellow;">
.
</div>

# DRAFT 


df['Species'] = df['Species '].str.replace(r'.*bull.*', 'BULL', case=False, regex=True) 
df['Species'] = df['Species '].str.replace(r'.*mako.*', 'MAKO', case=False, regex=True)
df['Species'] = df['Species '].str.replace(r'.*white shark.*', 'WHITE SHARK', case=False, regex=True)
df['Species'] = df['Species '].str.replace(r'.*tiger.*', 'TIGER', case=False, regex=True)
df['Species'] = df['Species '].str.replace(r'.*blacktip.*', 'BLACKTIP', case=False, regex=True)
df.sample()

shark_species = ['MAKO', 'TIGER', 'WHITE SHARK', 'BULL', 'BLACKTIP']

df[df['Species '].isin(shark_species)]

```pd.to_datetime```