In [1]:
# jupyter notebook for capstone project

# Capstone Project - Predict Fishing Habits

## Overview of Process - CRISP-DM:
1. Business Understanding
2. Data Understanding
3. Data Preparation
4. Modeling
5. Evaluation
6. Deployment

# 1. Business Understanding

# 2. Data Understanding

In [39]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# load datasets
drifting_longlines = pd.read_csv('datasets/drifting_longlines.csv')
fixed_gear = pd.read_csv('datasets/fixed_gear.csv')
pole_and_line = pd.read_csv('datasets/pole_and_line.csv')
purse_seines = pd.read_csv('datasets/purse_seines.csv')
trawlers = pd.read_csv('datasets/trawlers.csv')
trollers = pd.read_csv('datasets/trollers.csv')
unknown = pd.read_csv('datasets/unknown.csv')

In [4]:
fishing_vessels = ['drifting_longlines', 
                   'fixed_gear',
                   'pole_and_line',
                   'purse_seines',
                   'trawlers',
                   'trollers',
                   'unknown']

## Drifting Longlines

In [40]:
# display top 5 rows
drifting_longlines.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,vessel_type
0,12639560000000.0,1327137000.0,232994.28125,311748.65625,8.2,230.5,14.865583,-26.853662,-1.0,dalhousie_longliner,drifting_longlines
1,12639560000000.0,1327137000.0,233994.265625,312410.34375,7.3,238.399994,14.86387,-26.8568,-1.0,dalhousie_longliner,drifting_longlines
2,12639560000000.0,1327137000.0,233994.265625,312410.34375,6.8,238.899994,14.861551,-26.860649,-1.0,dalhousie_longliner,drifting_longlines
3,12639560000000.0,1327143000.0,233994.265625,315417.375,6.9,251.800003,14.822686,-26.865898,-1.0,dalhousie_longliner,drifting_longlines
4,12639560000000.0,1327143000.0,233996.390625,316172.5625,6.1,231.100006,14.821825,-26.867579,-1.0,dalhousie_longliner,drifting_longlines


In [41]:
# display info
drifting_longlines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13968727 entries, 0 to 13968726
Data columns (total 11 columns):
 #   Column               Dtype  
---  ------               -----  
 0   mmsi                 float64
 1   timestamp            float64
 2   distance_from_shore  float64
 3   distance_from_port   float64
 4   speed                float64
 5   course               float64
 6   lat                  float64
 7   lon                  float64
 8   is_fishing           float64
 9   source               object 
 10  vessel_type          object 
dtypes: float64(9), object(2)
memory usage: 1.1+ GB


In [42]:
# display summary stats for continuous columns
drifting_longlines.describe()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing
count,13968730.0,13968730.0,13968730.0,13968730.0,13968630.0,13968630.0,13968730.0,13968730.0,13968730.0
mean,129385000000000.0,1434290000.0,584531.1,789750.5,5.464779,181.4876,-8.997629,3.758693,-0.9743015
std,78873570000000.0,39842750.0,542006.8,691543.8,4.043567,105.0503,24.39311,109.5971,0.2119947
min,5601266000000.0,1325376000.0,0.0,0.0,0.0,0.0,-75.19017,-180.0,-1.0
25%,62603840000000.0,1410706000.0,101909.2,213020.6,2.1,90.7,-26.0155,-88.08668,-1.0
50%,118485900000000.0,1447302000.0,457639.3,637524.9,5.5,181.1,-14.97954,-1.716495,-1.0
75%,198075800000000.0,1466506000.0,960366.4,1210432.0,8.5,271.1,4.48579,100.9811,-1.0
max,281205800000000.0,1480032000.0,4430996.0,7181037.0,102.3,511.0,83.33266,179.9938,1.0


In [8]:
# add vessel_type column to assist with concatenation
drifting_longlines['vessel_type'] = 'drifting_longlines'

In [55]:
# number of unique vessels
drifting_longline_ids = drifting_longlines['mmsi'].unique()
print(f'There are {len(drifting_longline_ids)} unique anonymized vessel IDs')

There are 110 unique anonymized vessel IDs


## Fixed Gear

In [9]:
fixed_gear.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
0,7572519000000.0,1347664000.0,0.0,36054.625,0.0,0.0,42.798748,-8.944992,-1.0,gfw
1,7572519000000.0,1348056000.0,0.0,36054.625,0.0,0.0,42.798717,-8.945075,-1.0,gfw
2,7572519000000.0,1350409000.0,0.0,90970.296875,0.0,198.199997,43.106419,-9.215466,-1.0,gfw
3,7572519000000.0,1350410000.0,0.0,90970.296875,0.0,186.899994,43.106434,-9.215431,-1.0,gfw
4,7572519000000.0,1350411000.0,0.0,90970.296875,0.0,190.5,43.10643,-9.215442,-1.0,gfw


In [10]:
fixed_gear.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1559137 entries, 0 to 1559136
Data columns (total 10 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   mmsi                 1559137 non-null  float64
 1   timestamp            1559137 non-null  float64
 2   distance_from_shore  1559137 non-null  float64
 3   distance_from_port   1559137 non-null  float64
 4   speed                1559137 non-null  float64
 5   course               1559137 non-null  float64
 6   lat                  1559137 non-null  float64
 7   lon                  1559137 non-null  float64
 8   is_fishing           1559137 non-null  float64
 9   source               1559137 non-null  object 
dtypes: float64(9), object(1)
memory usage: 119.0+ MB


In [11]:
fixed_gear.describe()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing
count,1559137.0,1559137.0,1559137.0,1559137.0,1559137.0,1559137.0,1559137.0,1559137.0,1559137.0
mean,153075200000000.0,1421486000.0,37618.78,59898.48,2.227195,187.7938,50.95086,1.274018,-0.96591
std,89763830000000.0,37828300.0,109018.8,126972.9,3.41279,117.7506,5.894565,8.512244,0.2173124
min,7572519000000.0,1325625000.0,0.0,0.0,0.0,0.0,-83.2646,-179.2441,-1.0
25%,88780180000000.0,1387594000.0,0.0,5656.715,0.0,77.4,47.45566,-3.909275,-1.0
50%,130528900000000.0,1427254000.0,0.0,26906.59,0.1,205.5,50.50242,-2.333808,-1.0
75%,261683000000000.0,1455255000.0,34131.26,55143.91,3.8,287.0,56.02126,8.220293,-1.0
max,280291300000000.0,1480032000.0,3099833.0,11816760.0,102.3,511.0,84.79108,170.9277,1.0


In [12]:
# add vessel_type column to assist with concatenation
fixed_gear['vessel_type'] = 'fixed_gear'

In [56]:
# number of unique vessels
fixed_gear_ids = fixed_gear['mmsi'].unique()
print(f'There are {len(fixed_gear_ids)} unique anonymized vessel IDs')

There are 36 unique anonymized vessel IDs


## Pole and Line

In [13]:
pole_and_line.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
0,18483460000000.0,1340882000.0,0.0,2236.013184,0.0,0.0,28.967354,-13.537797,-1.0,gfw
1,18483460000000.0,1340884000.0,0.0,2236.013184,0.0,125.199997,28.967373,-13.537838,-1.0,gfw
2,18483460000000.0,1340885000.0,0.0,2236.013184,0.0,0.0,28.967354,-13.537838,-1.0,gfw
3,18483460000000.0,1340888000.0,0.0,2236.013184,0.0,0.0,28.967354,-13.537838,-1.0,gfw
4,18483460000000.0,1340925000.0,1999.950928,2828.357666,8.7,203.100006,28.929653,-13.543955,-1.0,gfw


In [14]:
pole_and_line.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161315 entries, 0 to 161314
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   mmsi                 161315 non-null  float64
 1   timestamp            161315 non-null  float64
 2   distance_from_shore  161315 non-null  float64
 3   distance_from_port   161315 non-null  float64
 4   speed                161315 non-null  float64
 5   course               161315 non-null  float64
 6   lat                  161315 non-null  float64
 7   lon                  161315 non-null  float64
 8   is_fishing           161315 non-null  float64
 9   source               161315 non-null  object 
dtypes: float64(9), object(1)
memory usage: 12.3+ MB


In [15]:
pole_and_line.describe()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing
count,161315.0,161315.0,161315.0,161315.0,161315.0,161315.0,161315.0,161315.0,161315.0
mean,76598410000000.0,1414174000.0,48944.34,73830.4,2.111584,132.245911,33.456753,-4.570087,-0.967918
std,57405770000000.0,43006470.0,203649.9,233360.4,3.696588,117.424918,5.665633,53.524248,0.205442
min,18483460000000.0,1327882000.0,0.0,0.0,0.0,0.0,-41.853848,-70.921013,-1.0
25%,18483460000000.0,1368384000.0,0.0,2236.013,0.0,3.3,28.96594,-28.527719,-1.0
50%,87031420000000.0,1423536000.0,0.0,14421.85,0.0,115.199997,33.029419,-16.734444,-1.0
75%,87031420000000.0,1456109000.0,22802.95,53243.41,1.8,228.699997,38.531128,-13.539565,-1.0
max,214572700000000.0,1480031000.0,2110362.0,3005100.0,102.300003,360.0,77.078987,177.63298,1.0


In [16]:
# add vessel_type column to assist with concatenation
pole_and_line['vessel_type'] = 'pole_and_line'

In [57]:
# number of unique vessels
pole_and_line_ids = pole_and_line['mmsi'].unique()
print(f'There are {len(pole_and_line_ids)} unique anonymized vessel IDs')

There are 6 unique anonymized vessel IDs


## Purse Seines

In [17]:
purse_seines.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
0,9924005000000.0,1379601000.0,0.0,1414.178833,0.0,298.5,8.8615,-79.668427,-1.0,false_positives
1,9924005000000.0,1379602000.0,0.0,1414.178833,0.0,298.5,8.861506,-79.668442,-1.0,false_positives
2,9924005000000.0,1379604000.0,0.0,1414.178833,0.1,128.399994,8.861511,-79.668488,-1.0,false_positives
3,9924005000000.0,1379605000.0,0.0,1414.178833,0.1,111.199997,8.861511,-79.66848,-1.0,false_positives
4,9924005000000.0,1379608000.0,0.0,1414.178833,0.0,41.700001,8.861502,-79.668503,-1.0,false_positives


In [18]:
purse_seines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1545323 entries, 0 to 1545322
Data columns (total 10 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   mmsi                 1545323 non-null  float64
 1   timestamp            1545323 non-null  float64
 2   distance_from_shore  1545323 non-null  float64
 3   distance_from_port   1545323 non-null  float64
 4   speed                1545316 non-null  float64
 5   course               1545316 non-null  float64
 6   lat                  1545323 non-null  float64
 7   lon                  1545323 non-null  float64
 8   is_fishing           1545323 non-null  float64
 9   source               1545323 non-null  object 
dtypes: float64(9), object(1)
memory usage: 117.9+ MB


In [19]:
purse_seines.describe()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing
count,1545323.0,1545323.0,1545323.0,1545323.0,1545316.0,1545316.0,1545323.0,1545323.0,1545323.0
mean,87880810000000.0,1431543000.0,230119.9,342074.5,5.119294,190.5941,13.90564,14.72753,-0.983325
std,65223890000000.0,36458650.0,384146.3,507096.8,5.593512,104.2562,18.91573,111.6258,0.1418592
min,9924005000000.0,1325378000.0,0.0,0.0,0.0,0.0,-69.22275,-180.0,-1.0
25%,38322970000000.0,1409017000.0,0.0,8062.06,0.1,98.2,-0.4091158,-79.66909,-1.0
50%,59665170000000.0,1438458000.0,40495.92,96930.59,1.3,203.4,5.338995,-1.8976,-1.0
75%,158317000000000.0,1462147000.0,286660.0,493442.1,11.2,279.1,35.10037,141.0436,-1.0
max,267966700000000.0,1480032000.0,2315626.0,6728604.0,102.3,511.0,78.33025,179.9934,1.0


In [20]:
# add vessel_type column to assist with concatenation
purse_seines['vessel_type'] = 'purse_seines'

In [58]:
# number of unique vessels
purse_seines_ids = purse_seines['mmsi'].unique()
print(f'There are {len(purse_seines_ids)} unique anonymized vessel IDs')

There are 28 unique anonymized vessel IDs


## Trawlers


In [21]:
trawlers.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
0,1252340000000.0,1325376000.0,0.0,0.0,0.0,153.0,52.458649,4.5812,-1.0,gfw
1,1252340000000.0,1325378000.0,0.0,0.0,0.0,153.0,52.458668,4.581167,-1.0,gfw
2,1252340000000.0,1325379000.0,0.0,0.0,0.0,153.0,52.458633,4.581183,-1.0,gfw
3,1252340000000.0,1325380000.0,0.0,0.0,0.0,153.0,52.458649,4.581234,-1.0,gfw
4,1252340000000.0,1325381000.0,0.0,0.0,0.0,153.0,52.458649,4.581183,-1.0,gfw


In [22]:
trawlers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4369101 entries, 0 to 4369100
Data columns (total 10 columns):
 #   Column               Dtype  
---  ------               -----  
 0   mmsi                 float64
 1   timestamp            float64
 2   distance_from_shore  float64
 3   distance_from_port   float64
 4   speed                float64
 5   course               float64
 6   lat                  float64
 7   lon                  float64
 8   is_fishing           float64
 9   source               object 
dtypes: float64(9), object(1)
memory usage: 333.3+ MB


In [23]:
trawlers.describe()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing
count,4369101.0,4369101.0,4369101.0,4369101.0,4369023.0,4369023.0,4369101.0,4369101.0,4369101.0
mean,157895200000000.0,1426220000.0,78198.02,149664.8,2.972401,174.4043,32.66757,4.927771,-0.9449078
std,94947790000000.0,38764720.0,204074.7,327953.2,4.105081,115.4701,38.3724,81.27552,0.2844518
min,1252340000000.0,1325376000.0,0.0,0.0,0.0,0.0,-84.98024,-179.9996,-1.0
25%,77261930000000.0,1397205000.0,0.0,2236.013,0.0,64.8,36.23107,-8.288531,-1.0
50%,175387400000000.0,1434811000.0,4242.537,36054.62,1.5,187.0,52.09994,4.584417,-1.0
75%,240226000000000.0,1458922000.0,55783.93,96080.94,4.5,276.0,54.63578,14.32063,-1.0
max,277515300000000.0,1480032000.0,3257453.0,12452200.0,102.3,511.0,84.61642,179.9934,1.0


In [24]:
# add vessel_type column to assist with concatenation
trawlers['vessel_type'] = 'trawlers'

In [59]:
# number of unique vessels
trawlers_ids = trawlers['mmsi'].unique()
print(f'There are {len(trawlers_ids)} unique anonymized vessel IDs')

There are 49 unique anonymized vessel IDs


## Trollers

In [25]:
trollers.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
0,76527010000000.0,1337836000.0,0.0,3162.200195,0.0,0.0,51.887592,4.356583,-1.0,gfw
1,76527010000000.0,1338199000.0,0.0,4999.877441,0.0,0.0,51.24213,4.403008,-1.0,gfw
2,76527010000000.0,1343752000.0,0.0,66308.25,8.6,292.200012,51.960873,5.196125,-1.0,gfw
3,76527010000000.0,1350795000.0,0.0,15296.682617,0.0,0.0,51.231094,4.526647,-1.0,gfw
4,76527010000000.0,1351808000.0,0.0,15296.682617,0.0,0.0,51.231098,4.526487,-1.0,gfw


In [26]:
trollers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166243 entries, 0 to 166242
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   mmsi                 166243 non-null  float64
 1   timestamp            166243 non-null  float64
 2   distance_from_shore  166243 non-null  float64
 3   distance_from_port   166243 non-null  float64
 4   speed                166243 non-null  float64
 5   course               166243 non-null  float64
 6   lat                  166243 non-null  float64
 7   lon                  166243 non-null  float64
 8   is_fishing           166243 non-null  float64
 9   source               166243 non-null  object 
dtypes: float64(9), object(1)
memory usage: 12.7+ MB


In [27]:
trollers.describe()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing
count,166243.0,166243.0,166243.0,166243.0,166243.0,166243.0,166243.0,166243.0,166243.0
mean,148691700000000.0,1426558000.0,5116.678753,15086.69,1.343616,147.816762,51.137754,1.448651,-0.934469
std,60432650000000.0,40446950.0,13921.012902,20499.72,2.719976,124.286331,7.369866,45.366115,0.312635
min,76527010000000.0,1325625000.0,0.0,0.0,0.0,0.0,19.234579,-125.083504,-1.0
25%,112940900000000.0,1405839000.0,0.0,1414.179,0.0,0.0,41.640491,12.241529,-1.0
50%,112940900000000.0,1436347000.0,999.975464,6708.039,0.0,174.399994,55.475067,12.274098,-1.0
75%,167072400000000.0,1461682000.0,999.975464,17492.43,0.4,252.5,57.109627,15.57439,-1.0
max,274063800000000.0,1480032000.0,97742.171875,1441175.0,102.300003,360.0,57.973133,41.346321,1.0


In [28]:
# add vessel_type column to assist with concatenation
trollers['vessel_type'] = 'trollers'

In [60]:
# number of unique vessels
trollers_ids = trollers['mmsi'].unique()
print(f'There are {len(trollers_ids)} unique anonymized vessel IDs')

There are 5 unique anonymized vessel IDs


## Unknown

In [29]:
unknown.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
0,183307100000000.0,1343786000.0,314242.1875,538727.9375,13.1,62.700001,2.230797,157.382812,-1.0,crowd_sourced
1,183307100000000.0,1343786000.0,314242.1875,538727.9375,13.8,65.199997,2.232352,157.386047,-1.0,crowd_sourced
2,183307100000000.0,1343792000.0,343947.9375,513526.09375,13.0,61.700001,2.410787,157.745605,-1.0,crowd_sourced
3,183307100000000.0,1343799000.0,369211.75,491134.5625,13.4,63.799999,2.591992,158.094574,-1.0,crowd_sourced
4,183307100000000.0,1343805000.0,362496.25,472878.4375,12.6,66.0,2.759518,158.458908,-1.0,crowd_sourced


In [30]:
unknown.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6811552 entries, 0 to 6811551
Data columns (total 10 columns):
 #   Column               Dtype  
---  ------               -----  
 0   mmsi                 float64
 1   timestamp            float64
 2   distance_from_shore  float64
 3   distance_from_port   float64
 4   speed                float64
 5   course               float64
 6   lat                  float64
 7   lon                  float64
 8   is_fishing           float64
 9   source               object 
dtypes: float64(9), object(1)
memory usage: 519.7+ MB


In [31]:
unknown.describe()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing
count,6811552.0,6811552.0,6811552.0,6811552.0,6811533.0,6811533.0,6811552.0,6811552.0,6811552.0
mean,134133900000000.0,1436159000.0,324438.6,478629.9,3.791863,183.5413,20.10856,16.66546,-0.9835547
std,77133380000000.0,35847280.0,512175.8,701424.7,5.270977,114.7332,38.95049,82.96402,0.1596271
min,1272260000000.0,1325376000.0,0.0,0.0,0.0,0.0,-83.67238,-179.9992,-1.0
25%,80798080000000.0,1415268000.0,0.0,17719.61,0.0,81.5,-20.14818,-24.99953,-1.0
50%,131404900000000.0,1446398000.0,38012.22,97619.32,2.1,193.5,35.69063,6.2783,-1.0
75%,185191400000000.0,1464825000.0,518944.8,738854.7,7.2,282.5,56.00087,57.50281,-1.0
max,276728900000000.0,1480032000.0,3509276.0,10959990.0,102.3,511.0,84.68929,179.993,1.0


In [32]:
# add vessel_type column to assist with concatenation
unknown['vessel_type'] = 'unknown'

In [61]:
# number of unique vessels
unknown_ids = unknown['mmsi'].unique()
print(f'There are {len(unknown_ids)} unique anonymized vessel IDs')

There are 120 unique anonymized vessel IDs


## All Fishing Vessels

In [94]:
# create consolidated dataset
boats_df = pd.concat([drifting_longlines,
                           fixed_gear,
                           pole_and_line,
                           purse_seines,
                           trawlers,
                           trollers,
                           unknown], axis=0)

In [95]:
boats_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28581398 entries, 0 to 6811551
Data columns (total 11 columns):
 #   Column               Dtype  
---  ------               -----  
 0   mmsi                 float64
 1   timestamp            float64
 2   distance_from_shore  float64
 3   distance_from_port   float64
 4   speed                float64
 5   course               float64
 6   lat                  float64
 7   lon                  float64
 8   is_fishing           float64
 9   source               object 
 10  vessel_type          object 
dtypes: float64(9), object(2)
memory usage: 2.6+ GB


In [96]:
boats_df['vessel_type'].value_counts()

drifting_longlines    13968727
unknown                6811552
trawlers               4369101
fixed_gear             1559137
purse_seines           1545323
trollers                166243
pole_and_line           161315
Name: vessel_type, dtype: int64

In [97]:
# number of unique vessels
all_ids = boats_df['mmsi'].unique()
print(f'There are {len(all_ids)} unique anonymized vessel IDs')

There are 354 unique anonymized vessel IDs


In [98]:
# print unique timestamp
time_stamps = boats_df['timestamp'].unique()
print(f'There are {len(time_stamps)} unique time stamps')

There are 22293667 unique time stamps


In [99]:
# breakdown of target variable
boats_df['is_fishing'].value_counts()

-1.000000    28027543
 0.000000      295979
 1.000000      247498
 0.666667        4806
 0.333333        4096
 0.750000         752
 0.250000         670
 0.800000          33
 0.166667          12
 0.400000           9
Name: is_fishing, dtype: int64

In [103]:
from datetime import datetime as dt

In [104]:
unix_timestamp = boats_df['timestamp'].iloc[2]
unix_timestamp

1327136734.0

In [105]:
converted_timestamp = dt.utcfromtimestamp(unix_timestamp).strftime('%Y-%m-%d %H:%M:%S')
converted_timestamp

'2012-01-21 09:05:34'

In [106]:
# create converted column
boats_df['updated_timestamp'] = boats_df['timestamp'].apply(lambda x: dt.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))

In [107]:
boats_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28581398 entries, 0 to 6811551
Data columns (total 12 columns):
 #   Column               Dtype  
---  ------               -----  
 0   mmsi                 float64
 1   timestamp            float64
 2   distance_from_shore  float64
 3   distance_from_port   float64
 4   speed                float64
 5   course               float64
 6   lat                  float64
 7   lon                  float64
 8   is_fishing           float64
 9   source               object 
 10  vessel_type          object 
 11  updated_timestamp    object 
dtypes: float64(9), object(3)
memory usage: 2.8+ GB
