In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from datetime import datetime

## Load data

In [3]:
# Source CSVs, relative to the notebook's working directory.
animal_report_path = "../data/top_20/facility_data/animal_report.csv"
bbox_path = "../data/all_farms/animal_facilities_bbox.csv"

# Read each file into its own DataFrame with pandas' default parsing options.
animal_report = pd.read_csv(animal_report_path)
bbox = pd.read_csv(bbox_path)

In [43]:
# Identifier, date and facility/CAFO descriptor columns to inspect.
id_date_columns = [
    'reg_measure_id',
    'effective_date',
    'expiration_review_date',
    'termination_date',
    'adoption_date',
    'most_recent_amendment_date',
    'rescission_date',
    'facility_id',
    'facility_name',
    'cafo_type',
    'cafo_population',
]

# Narrow the full report down to just those columns.
animal_report_id_date = animal_report[id_date_columns]

In [30]:
# (rows, columns) of the id/date subset of animal_report.
animal_report_id_date.shape

(4801, 11)

In [31]:
# Number of distinct values in each column of the subset.
animal_report_id_date.nunique()

reg_measure_id                4801
effective_date                 870
expiration_review_date         534
termination_date               972
adoption_date                  386
most_recent_amendment_date      29
rescission_date                 57
facility_id                   3147
facility_name                 3041
cafo_type                        5
cafo_population               1103
dtype: int64

In [32]:
# Count of missing values per column of the subset.
animal_report_id_date.isna().sum()

reg_measure_id                   0
effective_date                 569
expiration_review_date        1491
termination_date              2540
adoption_date                 3799
most_recent_amendment_date    4723
rescission_date               4720
facility_id                     47
facility_name                   48
cafo_type                      734
cafo_population                834
dtype: int64

In [5]:
# Preview the first rows of the bounding-box table.
bbox.head()

Unnamed: 0.1,Unnamed: 0,min_lat,max_lat,min_lon,max_lon,idx
0,0,37.653315,37.671309,-120.789975,-120.767389,3768
1,1,35.646975,35.664969,-119.515025,-119.49302,3958
2,2,35.621059,35.639053,-119.355694,-119.333697,3998
3,3,37.451065,37.469059,-120.873234,-120.85071,4014
4,4,37.660805,37.678799,-120.707626,-120.685038,4016


In [8]:
# Print (rows, columns) for the bounding-box table, then for the full report.
for frame in (bbox, animal_report):
    print(frame.shape)

(1429, 6)
(4801, 47)


In [33]:
# Number of distinct `idx` values in bbox, to compare against its row count.
bbox["idx"].nunique()

1429

Each row of `bbox` has a unique `idx`.

### Linking bbox and animal_report:

In [44]:
# Columns pulled from animal_report for each bounding box.
joined_report_cols = [
    'latitude_decimal_degrees',
    'longitude_decimal_degrees',
    'effective_date',
    'facility_id',
]

# bbox's `idx` column is matched against animal_report's row index.
# how="inner" is pandas' default, spelled out here because it silently drops
# bbox rows without a match; validate="one_to_one" makes the merge raise if
# either side has duplicate keys instead of silently duplicating rows.
joined = bbox.merge(
    animal_report[joined_report_cols],
    how="inner",
    left_on="idx",
    right_index=True,
    validate="one_to_one",
)

# Fail loudly if any bounding box found no matching report row.
assert len(joined) == len(bbox), "some bbox rows have no matching animal_report row"
joined.shape

(1429, 10)

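This checks out: the join keeps all 1429 bounding boxes, and in the rows shown below each facility's coordinates fall inside its bounding box: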

In [38]:
# Preview the merged bbox + report columns.
joined.head()

Unnamed: 0.1,Unnamed: 0,min_lat,max_lat,min_lon,max_lon,idx,latitude_decimal_degrees,longitude_decimal_degrees
0,0,37.653315,37.671309,-120.789975,-120.767389,3768,37.66231,-120.7787
1,1,35.646975,35.664969,-119.515025,-119.49302,3958,35.65597,-119.50404
2,2,35.621059,35.639053,-119.355694,-119.333697,3998,35.630054,-119.344713
3,3,37.451065,37.469059,-120.873234,-120.85071,4014,37.46006,-120.86199
4,4,37.660805,37.678799,-120.707626,-120.685038,4016,37.6698,-120.69635


Of the date columns, `effective_date` seems to be the one to use: it has the fewest missing values in `animal_report` (569 of 4801).

In [45]:
# Count of missing values per column of the merged table.
joined.isna().sum()

Unnamed: 0                   0
min_lat                      0
max_lat                      0
min_lon                      0
max_lon                      0
idx                          0
latitude_decimal_degrees     0
longitude_decimal_degrees    0
effective_date               0
facility_id                  0
dtype: int64

## Does animal report have a temporal component?

In [46]:
# Array of the facility ids that appear in the merged bbox table.
facility_ids = joined["facility_id"].values

In [57]:
# Columns needed to look for change over time within a facility.
temporal_cols = ['facility_id', 'effective_date', 'cafo_type', 'cafo_population']

# Reports whose facility id appears in the merged bbox table.
in_joined = animal_report['facility_id'].isin(facility_ids)

# Select matching rows directly with a boolean mask rather than NaN-filling
# the non-matching rows via .where() and relying on dropna to remove them;
# this keeps the original dtypes. Rows missing a facility id or an
# effective date are still dropped.
animal_report2 = (
    animal_report
    .loc[in_joined, temporal_cols]
    .dropna(subset=['facility_id', 'effective_date'])
)

In [58]:
# (rows, columns) of the filtered report.
animal_report2.shape

(1878, 4)

In [65]:
# Per-facility summary statistics:
#   effective_date  -> number of reports, earliest and latest date
#   cafo_population -> mean and standard deviation across reports
facility_agg_spec = {
    'effective_date': ['count', 'min', 'max'],
    'cafo_population': ['mean', 'std'],
}
facilities = animal_report2.groupby('facility_id').agg(facility_agg_spec)
facilities.head()

Unnamed: 0_level_0,effective_date,effective_date,effective_date,cafo_population,cafo_population
Unnamed: 0_level_1,count,min,max,mean,std
facility_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
201050.0,2,1990-03-09,1990-03-09,1400.0,0.0
201891.0,3,1993-05-21,2007-06-29,486.0,0.0
202572.0,2,1994-08-05,2007-06-29,1090.0,0.0
203907.0,1,2007-06-29,2007-06-29,1420.0,
203909.0,1,2007-06-29,2007-06-29,486.0,


In [66]:
# Number of facilities that have more than one report with an effective date.
(facilities[('effective_date', 'count')] > 1).sum()

295

In [67]:
# Number of facilities whose cafo_population differs between reports
# (a NaN standard deviation compares as False and is not counted).
(facilities[('cafo_population', 'std')] > 0).sum()

0

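We don't have temporal data: 295 facilities have more than one report, but none of them shows any variation in `cafo_population` across its reports.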