# Fish Welfare Project
## Part 2.5: Fish Count Database, Numbers

* Author: Angelina Li
* Date: 2019/11/02
* Description: This notebook attempts to clean data from the fish count database.

## Notebook tasks
1. Import in fish count data.
2. See if the two farmed datasets are really the same, and import in counts from each.

**1. Import in fish count data.**

In [1]:
import numpy as np
import os
import pandas as pd
import re
import random

from collections import Counter

In [2]:
MAIN_DIR = ".."
DATA_DIR = os.path.join(MAIN_DIR, "data")
COUNT_INPUT_DIR = os.path.join(DATA_DIR, "fish_count", "input")
COUNT_OUTPUT_DIR = os.path.join(DATA_DIR, "fish_count", "output")

In [3]:
CT_DECAPOD_FP = os.path.join(COUNT_INPUT_DIR, "Farmed-decapods-2015.xlsx")
CT_FISH_FP = os.path.join(COUNT_INPUT_DIR, "Farmed-fishes-2015.xlsx")
CT_WILD_FP = os.path.join(COUNT_INPUT_DIR, "fishcount_estimated_wild_fish_2007-2016.xlsx")

In [4]:
# grab all of the datas
deca_df = pd.read_excel(CT_DECAPOD_FP, sheet_name="Decapods", header=8)
print(len(deca_df))
deca_df.head(3)

1852


Unnamed: 0,Country,FAO Species Category,Scientific name,Decapod species?,Crustacean species?,Class,Order,Family,Multi-species?,Year,Production (t),EMW id,Estimated mean weight (lower),Estimated mean weight (upper),mean weight (lower),mean weight (upprr),Numbers (lower) millions,Numbers (upper) millions
0,Afghanistan,Cyprinids nei,Cyprinidae,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,1000.0,,0.0,0.0,,,,
1,Afghanistan,Rainbow trout,Oncorhynchus mykiss,N,N,Actinopterygii,SALMONIFORMES,Salmonidae,,2015.0,150.0,,0.0,0.0,,,,
2,Albania,Bighead carp,Hypophthalmichthys nobilis,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,16.0,,0.0,0.0,,,,


In [5]:
fish_df = pd.read_excel(CT_FISH_FP, sheet_name="Fish species", header=6)
print(len(fish_df))
fish_df.head(3)

1853


Unnamed: 0,Country,FAO Species Category,Scientific name,Fish species?,Class,Order,Family,Multi-species?,Year,Production (t),EMW id,Estimated mean weight (lower),Estimated mean weight (upper),mean weight (lower),mean weight (upper),Numbers (lower) millions,Numbers (upper) millions
0,Afghanistan,Rainbow trout,Oncorhynchus mykiss,Y,Actinopterygii,SALMONIFORMES,Salmonidae,N,2015.0,150.0,155.0,210.0,5000.0,210.0,5000.0,0.03,0.714286
1,Afghanistan,Cyprinids nei,Cyprinidae,Y,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,1000.0,,0.0,0.0,322.064283,1081.212063,0.924888,3.10497
2,Albania,Bighead carp,Hypophthalmichthys nobilis,Y,Actinopterygii,CYPRINIFORMES,Cyprinidae,N,2015.0,16.0,29.0,500.0,1500.0,500.0,1500.0,0.010667,0.032


In [6]:
wild_df = pd.read_excel(CT_WILD_FP, sheet_name="Sheet1", header=17)
print(len(wild_df))
wild_df.head(3)

12045


Unnamed: 0,Country,FAO Species Category,Scientific name,Fish species?,Class,Multi-species?,Year,Production (t),EMW id,Estimated mean weight EMW (lower) g,Estimated mean weight EMW (upper) g,Global Generic estimated mean weight for class GEMW (lower) g,Global Generic estimated mean weight for class GEMW (upper) g,Mean weight used (lower) g,Mean weight used (upper) g,Estimated numbers (lower) millions,Estimated numbers (upper) millions
0,Afghanistan,Freshwater fishes nei,,Y,Includes species from > 1 class,,2007-2016,1000.0,,0.0,0.0,37.8921,96.5228,37.8921,96.5228,10.360251,26.390735
1,Albania,"Angelsharks, sand devils nei",Squatinidae,Y,Elasmobranchii (sharks and rays),Y,2007-2016,16.0,23.0,1683.72,19793.8,5950.39,10539.4,1683.72,19793.8,0.000808,0.009503
2,Albania,Atlantic bluefin tuna,Thunnus thynnus,Y,Actinopterygii (ray-finned fishes),N,2007-2016,18.0,51.0,262000.0,262000.0,37.7549,96.1746,262000.0,262000.0,6.9e-05,6.9e-05


**2. Clean all the column names**

In [7]:
deca_df.columns

Index(['Country', 'FAO Species Category', 'Scientific name',
       'Decapod species?', 'Crustacean species?', 'Class', 'Order', 'Family',
       'Multi-species?', 'Year', 'Production (t)', 'EMW id',
       'Estimated mean weight (lower)', 'Estimated mean weight (upper)',
       'mean weight (lower)', 'mean weight (upprr)',
       'Numbers (lower) millions', 'Numbers (upper) millions'],
      dtype='object')

In [8]:
fish_df.columns

Index(['Country', 'FAO Species Category', 'Scientific name', 'Fish species?',
       'Class', 'Order', 'Family', 'Multi-species?', 'Year', 'Production (t)',
       'EMW id', 'Estimated mean weight (lower)',
       'Estimated mean weight (upper)', 'mean weight (lower)',
       'mean weight (upper)', 'Numbers (lower) millions',
       'Numbers (upper) millions'],
      dtype='object')

In [9]:
def get_column_name(name):
    lowered = name.lower().strip()
    stripped = re.sub("[^a-z]", " ", lowered).strip()
    snaked = re.sub(" +", "_", stripped)
    return snaked

deca_df.columns = list(map(get_column_name, deca_df.columns))
fish_df.columns = list(map(get_column_name, fish_df.columns))

deca_df = deca_df.rename(columns=dict(mean_weight_upprr="mean_weight_upper"))

print(deca_df.columns)
print(fish_df.columns)

Index(['country', 'fao_species_category', 'scientific_name', 'decapod_species',
       'crustacean_species', 'class', 'order', 'family', 'multi_species',
       'year', 'production_t', 'emw_id', 'estimated_mean_weight_lower',
       'estimated_mean_weight_upper', 'mean_weight_lower', 'mean_weight_upper',
       'numbers_lower_millions', 'numbers_upper_millions'],
      dtype='object')
Index(['country', 'fao_species_category', 'scientific_name', 'fish_species',
       'class', 'order', 'family', 'multi_species', 'year', 'production_t',
       'emw_id', 'estimated_mean_weight_lower', 'estimated_mean_weight_upper',
       'mean_weight_lower', 'mean_weight_upper', 'numbers_lower_millions',
       'numbers_upper_millions'],
      dtype='object')


In [10]:
deca_df.head()

Unnamed: 0,country,fao_species_category,scientific_name,decapod_species,crustacean_species,class,order,family,multi_species,year,production_t,emw_id,estimated_mean_weight_lower,estimated_mean_weight_upper,mean_weight_lower,mean_weight_upper,numbers_lower_millions,numbers_upper_millions
0,Afghanistan,Cyprinids nei,Cyprinidae,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,1000.0,,0.0,0.0,,,,
1,Afghanistan,Rainbow trout,Oncorhynchus mykiss,N,N,Actinopterygii,SALMONIFORMES,Salmonidae,,2015.0,150.0,,0.0,0.0,,,,
2,Albania,Bighead carp,Hypophthalmichthys nobilis,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,16.0,,0.0,0.0,,,,
3,Albania,Common carp,Cyprinus carpio,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,26.8,,0.0,0.0,,,,
4,Albania,Crucian carp,Carassius carassius,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,12.0,,0.0,0.0,,,,


In [11]:
fish_df.head()

Unnamed: 0,country,fao_species_category,scientific_name,fish_species,class,order,family,multi_species,year,production_t,emw_id,estimated_mean_weight_lower,estimated_mean_weight_upper,mean_weight_lower,mean_weight_upper,numbers_lower_millions,numbers_upper_millions
0,Afghanistan,Rainbow trout,Oncorhynchus mykiss,Y,Actinopterygii,SALMONIFORMES,Salmonidae,N,2015.0,150.0,155.0,210.0,5000.0,210.0,5000.0,0.03,0.714286
1,Afghanistan,Cyprinids nei,Cyprinidae,Y,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,1000.0,,0.0,0.0,322.064283,1081.212063,0.924888,3.10497
2,Albania,Bighead carp,Hypophthalmichthys nobilis,Y,Actinopterygii,CYPRINIFORMES,Cyprinidae,N,2015.0,16.0,29.0,500.0,1500.0,500.0,1500.0,0.010667,0.032
3,Albania,Common carp,Cyprinus carpio,Y,Actinopterygii,CYPRINIFORMES,Cyprinidae,N,2015.0,26.8,57.0,500.0,2500.0,500.0,2500.0,0.01072,0.0536
4,Albania,Crucian carp,Carassius carassius,Y,Actinopterygii,CYPRINIFORMES,Cyprinidae,N,2015.0,12.0,62.0,150.0,400.0,150.0,400.0,0.03,0.08


**3. Join the datasets**

In [12]:
def get_row_index(row):
    return "{} {}".format(row["country"], row["fao_species_category"])

def get_reindexed_df(df):
    df["new_index"] = df.apply(get_row_index, axis=1)
    return df.set_index( "new_index" )

deca_df = get_reindexed_df(deca_df)
fish_df = get_reindexed_df(fish_df)

In [13]:
def var_missing(df, varname):
    return (df[varname].isna()) | (df[varname] == 0)

def get_num_na(df):
    return len(df[ var_missing(df, "mean_weight_lower") ])

print(get_num_na(deca_df))
print(get_num_na(fish_df))

1670
517


In [14]:
def replace_row(row, replace_varname):
    replace_df = fish_df
    replace_class_val = "Actinopterygii"
    if row["class"] == replace_class_val:
        replace_row = replace_df.ix[row.name]
        return replace_row[replace_varname]
    return row[replace_varname]

farm_df = deca_df.copy()
fish_class_val = "Actinopterygii"
vars_replace = ["mean_weight_lower", "mean_weight_upper", "numbers_lower_millions", "numbers_upper_millions"]
for var in vars_replace:
    farm_df[var] = farm_df.apply(lambda row: replace_row(row, var), axis=1)
farm_df.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """


Unnamed: 0_level_0,country,fao_species_category,scientific_name,decapod_species,crustacean_species,class,order,family,multi_species,year,production_t,emw_id,estimated_mean_weight_lower,estimated_mean_weight_upper,mean_weight_lower,mean_weight_upper,numbers_lower_millions,numbers_upper_millions
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Afghanistan Cyprinids nei,Afghanistan,Cyprinids nei,Cyprinidae,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,1000.0,,0.0,0.0,322.064283,1081.212063,0.924888,3.10497
Afghanistan Rainbow trout,Afghanistan,Rainbow trout,Oncorhynchus mykiss,N,N,Actinopterygii,SALMONIFORMES,Salmonidae,,2015.0,150.0,,0.0,0.0,210.0,5000.0,0.03,0.714286
Albania Bighead carp,Albania,Bighead carp,Hypophthalmichthys nobilis,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,16.0,,0.0,0.0,500.0,1500.0,0.010667,0.032
Albania Common carp,Albania,Common carp,Cyprinus carpio,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,26.8,,0.0,0.0,500.0,2500.0,0.01072,0.0536
Albania Crucian carp,Albania,Crucian carp,Carassius carassius,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,12.0,,0.0,0.0,150.0,400.0,0.03,0.08


In [15]:
set(farm_df["class"].tolist())

{'Actinopterygii',
 'Decapods - BRACHYURA',
 'Decapods - NATANTIA',
 'Decapods - REPTANTIA',
 nan,
 'unknown'}

In [16]:
non_missing_classes = ['Actinopterygii', 'Decapods - BRACHYURA', 'Decapods - NATANTIA', 'Decapods - REPTANTIA']
farm_df[ var_missing(farm_df, "mean_weight_lower") & farm_df["class"].isin(non_missing_classes) ]

Unnamed: 0_level_0,country,fao_species_category,scientific_name,decapod_species,crustacean_species,class,order,family,multi_species,year,production_t,emw_id,estimated_mean_weight_lower,estimated_mean_weight_upper,mean_weight_lower,mean_weight_upper,numbers_lower_millions,numbers_upper_millions
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1


In [17]:
farm_df[ (farm_df["class"] == "Actinopterygii") & var_missing(farm_df, "emw_id") ]

Unnamed: 0_level_0,country,fao_species_category,scientific_name,decapod_species,crustacean_species,class,order,family,multi_species,year,production_t,emw_id,estimated_mean_weight_lower,estimated_mean_weight_upper,mean_weight_lower,mean_weight_upper,numbers_lower_millions,numbers_upper_millions
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Afghanistan Cyprinids nei,Afghanistan,Cyprinids nei,Cyprinidae,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,1000.00,,0.0,0.0,322.064283,1081.212063,0.924888,3.104970
Afghanistan Rainbow trout,Afghanistan,Rainbow trout,Oncorhynchus mykiss,N,N,Actinopterygii,SALMONIFORMES,Salmonidae,,2015.0,150.00,,0.0,0.0,210.000000,5000.000000,0.030000,0.714286
Albania Bighead carp,Albania,Bighead carp,Hypophthalmichthys nobilis,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,16.00,,0.0,0.0,500.000000,1500.000000,0.010667,0.032000
Albania Common carp,Albania,Common carp,Cyprinus carpio,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,26.80,,0.0,0.0,500.000000,2500.000000,0.010720,0.053600
Albania Crucian carp,Albania,Crucian carp,Carassius carassius,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,12.00,,0.0,0.0,150.000000,400.000000,0.030000,0.080000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zambia Three spotted tilapia,Zambia,Three spotted tilapia,Oreochromis andersonii,N,N,Actinopterygii,PERCOIDEI,Cichlidae,,2015.0,3112.00,,0.0,0.0,322.064283,1081.212063,2.878251,9.662667
Zanzibar Milkfish,Zanzibar,Milkfish,Chanos chanos,N,N,Actinopterygii,GONORYNCHIFORMES,Chanidae,,2015.0,3.94,,0.0,0.0,250.000000,500.000000,0.007880,0.015760
Zimbabwe North African catfish,Zimbabwe,North African catfish,Clarias gariepinus,N,N,Actinopterygii,SILURIFORMES,Clariidae,,2015.0,10.00,,0.0,0.0,500.000000,1500.000000,0.006667,0.020000
Zimbabwe Rainbow trout,Zimbabwe,Rainbow trout,Oncorhynchus mykiss,N,N,Actinopterygii,SALMONIFORMES,Salmonidae,,2015.0,80.00,,0.0,0.0,210.000000,5000.000000,0.016000,0.380952


In [18]:
len(farm_df)

1852

In [19]:
set(deca_df.index).difference(set(fish_df.index))

set()

In [20]:
set(fish_df.index).difference(set(deca_df.index))

set()

In [21]:
print( "len(deca_df.index)", len(deca_df.index) )
print( "len(set(deca_df.index))", len(set(deca_df.index)) )
print( "len(fish_df.index)", len(fish_df.index) )
print( "len(set(fish_df.index))", len(set(fish_df.index)) )

len(deca_df.index) 1852
len(set(deca_df.index)) 1852
len(fish_df.index) 1853
len(set(fish_df.index)) 1852


In [22]:
# ^ what?
Counter(fish_df.index.tolist()).most_common(5)
# lol okay then don't even worry about it.

[('nan nan', 2),
 ('Afghanistan Rainbow trout', 1),
 ('Afghanistan Cyprinids nei', 1),
 ('Albania Bighead carp', 1),
 ('Albania Common carp', 1)]

**4. Plug in missing values**
* For any row with missing values currently, flag the row and sub in the standard weight information
* Do some basic sanity checks

In [28]:
# import in fish count spreadsheet 4
CT_SPREAD4_FP = os.path.join(COUNT_INPUT_DIR, "Spreadsheet4_emws_global.xls")
sheet4_df = pd.read_excel(CT_SPREAD4_FP, sheet_name="Results", header=17, keep_default_na=False, skipfooter=4)
sheet4_df

Unnamed: 0,Totals,Class,Production (t),Estimated numbers (lower) (millions),Estimated numbers (upper) (millions),Global generic mean weight (lower) (g),Global generic mean weight (upper) (g)
0,single species categories with an EMW,Actinopterygii,32406300.0,30536.9,101528.0,319.187,1061.22
1,multi species categories with an EMW,Actinopterygii,1890820.0,1601.2,4949.6,,
2,All species categories with an EMW,Actinopterygii,34297100.0,32138.1,106477.0,,
3,,,,,,,
4,species categories without an EMW,unknown,1736030.0,1635.88,5438.9,319.187,1061.22
5,species categories without an EMW,Actinopterygii,3141150.0,2959.95,9841.11,319.187,1061.22
6,species categories without an EMW,all,4877180.0,4595.84,15280.0,319.187,1061.22


In [31]:
generic_mean_weight_lower = sheet4_df["Global generic mean weight (lower) (g)"][0]
generic_mean_weight_upper = sheet4_df["Global generic mean weight (upper) (g)"][0]
print(generic_mean_weight_lower, generic_mean_weight_upper)

319.1869336339932 1061.2168850946118


In [33]:
var_missing(farm_df, "mean_weight_lower") & farm_df["class"].isin(non_missing_classes)

farm_df["using_mean_value"] = var_missing(farm_df, "mean_weight_lower")
farm_df.head()

Unnamed: 0_level_0,country,fao_species_category,scientific_name,decapod_species,crustacean_species,class,order,family,multi_species,year,production_t,emw_id,estimated_mean_weight_lower,estimated_mean_weight_upper,mean_weight_lower,mean_weight_upper,numbers_lower_millions,numbers_upper_millions,using_mean_value
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Afghanistan Cyprinids nei,Afghanistan,Cyprinids nei,Cyprinidae,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,1000.0,,0.0,0.0,322.064283,1081.212063,0.924888,3.10497,False
Afghanistan Rainbow trout,Afghanistan,Rainbow trout,Oncorhynchus mykiss,N,N,Actinopterygii,SALMONIFORMES,Salmonidae,,2015.0,150.0,,0.0,0.0,210.0,5000.0,0.03,0.714286,False
Albania Bighead carp,Albania,Bighead carp,Hypophthalmichthys nobilis,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,16.0,,0.0,0.0,500.0,1500.0,0.010667,0.032,False
Albania Common carp,Albania,Common carp,Cyprinus carpio,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,26.8,,0.0,0.0,500.0,2500.0,0.01072,0.0536,False
Albania Crucian carp,Albania,Crucian carp,Carassius carassius,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,12.0,,0.0,0.0,150.0,400.0,0.03,0.08,False


In [34]:
farm_df["mean_weight_lower"] = farm_df["mean_weight_lower"].fillna(generic_mean_weight_lower)
farm_df["mean_weight_upper"] = farm_df["mean_weight_upper"].fillna(generic_mean_weight_upper)
farm_df[ farm_df["using_mean_value"] ].head()

Unnamed: 0_level_0,country,fao_species_category,scientific_name,decapod_species,crustacean_species,class,order,family,multi_species,year,production_t,emw_id,estimated_mean_weight_lower,estimated_mean_weight_upper,mean_weight_lower,mean_weight_upper,numbers_lower_millions,numbers_upper_millions,using_mean_value
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Albania Mediterranean mussel,Albania,Mediterranean mussel,Mytilus galloprovincialis,N,N,,BIVALVIA,Mytilidae,,2015.0,295.0,,0.0,0.0,319.186934,1061.216885,,,True
Algeria Freshwater fishes nei,Algeria,Freshwater fishes nei,Osteichthyes,N,N,unknown,PISCES MISCELLANEA,,,2015.0,57.03,,0.0,0.0,319.186934,1061.216885,,,True
Algeria Mediterranean mussel,Algeria,Mediterranean mussel,Mytilus galloprovincialis,N,N,,BIVALVIA,Mytilidae,,2015.0,2.76,,0.0,0.0,319.186934,1061.216885,,,True
Argentina American bull frog,Argentina,American bull frog,Rana catesbeiana,N,N,,ANURA,Ranidae,,2015.0,25.0,,0.0,0.0,319.186934,1061.216885,,,True
Argentina Blue mussel,Argentina,Blue mussel,Mytilus edulis,N,N,,BIVALVIA,Mytilidae,,2015.0,6.0,,0.0,0.0,319.186934,1061.216885,,,True


In [36]:
# Sanity check: Do the numbers for generic mean weights line up with the estimates for each species?
farm_df[ (~farm_df["using_mean_value"]) & ( (farm_df["production_t"] / farm_df["mean_weight_lower"]) != farm_df["numbers_upper_millions"]) ]

Unnamed: 0_level_0,country,fao_species_category,scientific_name,decapod_species,crustacean_species,class,order,family,multi_species,year,production_t,emw_id,estimated_mean_weight_lower,estimated_mean_weight_upper,mean_weight_lower,mean_weight_upper,numbers_lower_millions,numbers_upper_millions,using_mean_value
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1


In [38]:
farm_df[ (~farm_df["using_mean_value"]) & ( (farm_df["production_t"] / farm_df["mean_weight_upper"]) != farm_df["numbers_lower_millions"]) ]

Unnamed: 0_level_0,country,fao_species_category,scientific_name,decapod_species,crustacean_species,class,order,family,multi_species,year,production_t,emw_id,estimated_mean_weight_lower,estimated_mean_weight_upper,mean_weight_lower,mean_weight_upper,numbers_lower_millions,numbers_upper_millions,using_mean_value
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1


In [39]:
# no anomalies above! Let's replace all values to account for missing vals.
farm_df["numbers_upper_millions"] = farm_df["production_t"] / farm_df["mean_weight_lower"]
farm_df["numbers_lower_millions"] = farm_df["production_t"] / farm_df["mean_weight_upper"]

farm_df.head()

Unnamed: 0_level_0,country,fao_species_category,scientific_name,decapod_species,crustacean_species,class,order,family,multi_species,year,production_t,emw_id,estimated_mean_weight_lower,estimated_mean_weight_upper,mean_weight_lower,mean_weight_upper,numbers_lower_millions,numbers_upper_millions,using_mean_value
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Afghanistan Cyprinids nei,Afghanistan,Cyprinids nei,Cyprinidae,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,1000.0,,0.0,0.0,322.064283,1081.212063,0.924888,3.10497,False
Afghanistan Rainbow trout,Afghanistan,Rainbow trout,Oncorhynchus mykiss,N,N,Actinopterygii,SALMONIFORMES,Salmonidae,,2015.0,150.0,,0.0,0.0,210.0,5000.0,0.03,0.714286,False
Albania Bighead carp,Albania,Bighead carp,Hypophthalmichthys nobilis,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,16.0,,0.0,0.0,500.0,1500.0,0.010667,0.032,False
Albania Common carp,Albania,Common carp,Cyprinus carpio,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,26.8,,0.0,0.0,500.0,2500.0,0.01072,0.0536,False
Albania Crucian carp,Albania,Crucian carp,Carassius carassius,N,N,Actinopterygii,CYPRINIFORMES,Cyprinidae,,2015.0,12.0,,0.0,0.0,150.0,400.0,0.03,0.08,False


In [40]:
farm_df[ farm_df["using_mean_value"] ].head()

Unnamed: 0_level_0,country,fao_species_category,scientific_name,decapod_species,crustacean_species,class,order,family,multi_species,year,production_t,emw_id,estimated_mean_weight_lower,estimated_mean_weight_upper,mean_weight_lower,mean_weight_upper,numbers_lower_millions,numbers_upper_millions,using_mean_value
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Albania Mediterranean mussel,Albania,Mediterranean mussel,Mytilus galloprovincialis,N,N,,BIVALVIA,Mytilidae,,2015.0,295.0,,0.0,0.0,319.186934,1061.216885,0.277983,0.924223,True
Algeria Freshwater fishes nei,Algeria,Freshwater fishes nei,Osteichthyes,N,N,unknown,PISCES MISCELLANEA,,,2015.0,57.03,,0.0,0.0,319.186934,1061.216885,0.05374,0.178673,True
Algeria Mediterranean mussel,Algeria,Mediterranean mussel,Mytilus galloprovincialis,N,N,,BIVALVIA,Mytilidae,,2015.0,2.76,,0.0,0.0,319.186934,1061.216885,0.002601,0.008647,True
Argentina American bull frog,Argentina,American bull frog,Rana catesbeiana,N,N,,ANURA,Ranidae,,2015.0,25.0,,0.0,0.0,319.186934,1061.216885,0.023558,0.078324,True
Argentina Blue mussel,Argentina,Blue mussel,Mytilus edulis,N,N,,BIVALVIA,Mytilidae,,2015.0,6.0,,0.0,0.0,319.186934,1061.216885,0.005654,0.018798,True


In [41]:
# save your work.
farm_df.to_excel(os.path.join(COUNT_OUTPUT_DIR, "farmed_fish_data.xlsx"))