In [18]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [19]:
# load in datasets
def read_ocean_csv(filename):
    """Read one ocean dataset CSV (path relative to the working directory) as latin1 text."""
    return pd.read_csv(filename, encoding='latin1')


# One raw frame per source file; the names below are used by the later cells.
arctic_greenland = read_ocean_csv('Arctic_Ocean__Greenland_sea_dataset.csv')
indian = read_ocean_csv('Indian_Ocean_Dataset.csv')
mediterranean = read_ocean_csv('Mediterranean_Sea_Dataset.csv')
north_atlantic = read_ocean_csv('North_Atlantic_Dataset.csv')
north_pacific = read_ocean_csv('North_Pacific_Ocean_Dataset.csv')
south_atlantic = read_ocean_csv('South_Atlantic_Dataset.csv')
south_pacific = read_ocean_csv('South_Pacific_Dataset.csv')
southern = read_ocean_csv('Southern_Ocean_dataset.csv')

In [20]:
# Stack the north and south tables so each ocean is a single frame.
# pd.concat keeps the original row labels, so labels can repeat across the two halves.
atlantic_parts = [north_atlantic, south_atlantic]
pacific_parts = [north_pacific, south_pacific]

atlantic = pd.concat(atlantic_parts)
pacific = pd.concat(pacific_parts)

In [21]:
# The six frames used by the later cells (atlantic and pacific are the merged north/south tables).
all_oceans = [arctic_greenland, indian, mediterranean, atlantic, pacific, southern]

# Total number of rows across all frames, before any filtering or cleaning.
rows = sum(len(frame) for frame in all_oceans)

print(rows)

11127


In [22]:
# Build the working table `world` from all ocean frames:
#   1. stack the frames,
#   2. keep only the columns needed for the analysis,
#   3. parse Date into datetimes,
#   4. sort by ocean, then by date (rows are ordered, not grouped/aggregated),
#   5. drop every row that has a null in any kept column,
#   6. renumber the rows 0..n-1.
world = (
    pd.concat(all_oceans)
    .filter(['Oceans', 'Microplastics Measurement (density)', 'Unit',
             'Concentration Class', 'Latitude', 'Longitude', 'Date', 'Water Sample Depth (m)'])
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by=['Oceans', 'Date'])
    .dropna()
    .reset_index(drop=True)
)

In [23]:
# Remove rows that are exact duplicates across every column, keeping the first occurrence.
# The index is not renumbered here, so row labels may have gaps afterwards.
world = world.drop_duplicates(subset=None, keep='first')
world

Unnamed: 0,Oceans,Microplastics Measurement (density),Unit,Concentration Class,Latitude,Longitude,Date,Water Sample Depth (m)
0,Arctic Ocean,0.00300,pieces/m3,Low,77.3834,13.8334,2015-05-28,0.45
1,Arctic Ocean,0.00900,pieces/m3,Medium,69.0302,-16.6200,2015-06-12,0.45
2,Arctic Ocean,0.00500,pieces/m3,Medium,65.6830,-18.0861,2015-06-19,0.45
3,Arctic Ocean,0.00700,pieces/m3,Medium,65.7056,-21.6652,2015-06-19,0.45
4,Arctic Ocean,0.00800,pieces/m3,Medium,66.4346,-163.2724,2015-06-26,0.45
...,...,...,...,...,...,...,...,...
10256,Southern Ocean,0.00931,pieces/m3,Medium,-65.9729,158.9910,2017-02-02,0.00
10257,Southern Ocean,0.00000,pieces/m3,Very Low,-67.2879,163.5447,2017-02-03,0.00
10258,Southern Ocean,0.00000,pieces/m3,Very Low,-67.0973,167.3309,2017-02-04,0.00
10259,Southern Ocean,0.01100,pieces/m3,Medium,-64.5400,-61.9986,2017-02-07,0.45


In [24]:
# Transform Date from datetime to fractional days since the Unix epoch:
# Timestamp.timestamp() gives seconds, which are divided by the seconds in one day.
SECONDS_PER_DAY = 3600 * 24

# Guard so the cell can be re-run safely: once Date is numeric the conversion has
# already happened, and calling .timestamp() on floats would raise AttributeError.
if not pd.api.types.is_numeric_dtype(world['Date']):
    world['Date'] = world['Date'].apply(lambda ts: ts.timestamp() / SECONDS_PER_DAY)
world

Unnamed: 0,Oceans,Microplastics Measurement (density),Unit,Concentration Class,Latitude,Longitude,Date,Water Sample Depth (m)
0,Arctic Ocean,0.00300,pieces/m3,Low,77.3834,13.8334,16583.0,0.45
1,Arctic Ocean,0.00900,pieces/m3,Medium,69.0302,-16.6200,16598.0,0.45
2,Arctic Ocean,0.00500,pieces/m3,Medium,65.6830,-18.0861,16605.0,0.45
3,Arctic Ocean,0.00700,pieces/m3,Medium,65.7056,-21.6652,16605.0,0.45
4,Arctic Ocean,0.00800,pieces/m3,Medium,66.4346,-163.2724,16612.0,0.45
...,...,...,...,...,...,...,...,...
10256,Southern Ocean,0.00931,pieces/m3,Medium,-65.9729,158.9910,17199.0,0.00
10257,Southern Ocean,0.00000,pieces/m3,Very Low,-67.2879,163.5447,17200.0,0.00
10258,Southern Ocean,0.00000,pieces/m3,Very Low,-67.0973,167.3309,17201.0,0.00
10259,Southern Ocean,0.01100,pieces/m3,Medium,-64.5400,-61.9986,17204.0,0.45


In [25]:
# Summary statistics (min / max / median) of the measured microplastics density.
density = world['Microplastics Measurement (density)']
min_density, max_density, median_density = density.min(), density.max(), density.median()

print(f' Minimum density: {min_density}, Maximum density: {max_density}, Median density: {median_density}')

 Minimum density: 0.0, Maximum density: 57665.0, Median density: 0.016091


In [27]:
# finding the duplicates (exploratory code; everything below is commented out and does not run)
# result = world[ world['Oceans'] == 'Arctic Ocean']
# result = world [ world['Latitude'] == 80.1349]
# result = result.drop(['SubRegions', 'Regions', 'Country', 'State', 'Transect Number', 'x', 'y',
#                      'NCEI Accession Link', 'Collecting Time (min)', 'Volunteers Number', 'Ocean Bottom Depth (m)',
#                      'Sediment Sample Depth (m)', 'Sampling Point on Beach', 'Short Reference', 'Long Reference',
#                      'DOI'], axis=1)


In [28]:
# Write the cleaned table to disk; index=True stores the row labels as the first CSV column.
output_path = 'world_oceans.csv'
world.to_csv(output_path, index=True)

In [118]:
# Checks the NaN values in the analysis columns (Don't RUN)
# Diagnostic only: it builds its own frame (`world1`) and does not modify `world`.
nan_check_columns = ['Oceans', 'Microplastics Measurement (density)', 'Unit',
                     'Concentration Class', 'Latitude', 'Longitude', 'Date']
world1 = pd.concat(all_oceans)
world1 = world1.filter(nan_check_columns)

# Rows that have a NaN in at least one of the checked columns.
nan_rows = world1[world1.isna().any(axis=1)]

# Only these two columns are kept for the per-ocean count. The other labels that used
# to be requested here ('Beach Location', 'State', 'Country', 'Sampling Method') were
# already removed by the filter above, and DataFrame.filter silently ignores labels
# that are not present, so listing them had no effect.
rows_with_nan = nan_rows.filter(['Microplastics Measurement (density)', 'Oceans'])

# Number of NaN-containing rows per ocean (rows whose 'Oceans' is itself NaN are not counted).
value_counts = rows_with_nan['Oceans'].value_counts()
print(value_counts)

# Report every checked column that contains a NaN. This is computed on `nan_rows`
# (all checked columns) rather than on the two-column `rows_with_nan`, which could
# never reveal NaNs in Unit, Concentration Class, Latitude, Longitude or Date.
columns_with_nan = nan_rows.columns[nan_rows.isna().any()]
print(columns_with_nan)

Pacific Ocean     366
Indian Ocean        5
Atlantic Ocean      3
Name: Oceans, dtype: int64
Index(['Microplastics Measurement (density)', 'Oceans'], dtype='object')
