# Honeybee Data

Reference article: https://data-for-good.pubpub.org/pub/rg3364dl/release/2

Things to understand:
Apiary: An apiary is a location where beehives of honey bees are kept. Can come in many sizes and can be urban or rural. An apiary is a place where beehives are kept.

HCC: Healthy Colony Checklist

Brood: Eggs, larvae, and pupae of honeybees.


In [2]:
# imports
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as pltib
import seaborn as sns
%matplotlib inline

In [3]:
# apiary dataset
apiary = pd.read_csv('Apiary_Information.csv')
apiary

Unnamed: 0,ApiaryID,Apiary,City,State
0,1,BBCC,Durham,NC
1,2,BBCC-RTP,Durham,NC
2,3,BBTS,Clayton,NC
3,4,Beesboro,Durham,NC
4,5,Fresno,Durham,NC
5,6,Juniper Level,Durham,NC
6,7,Lakeview,Salt Lake City,UT
7,8,Leesville,Durham,NC
8,9,Mike C,Salt Lake City,UT
9,10,Roestenburg,Salt Lake City,UT


In [4]:
# hcc dataset 
hcc = pd.read_csv('HCC_Inspections.csv')
hcc

Unnamed: 0,InpsectionID,HiveID,InsptDate,Brood,Bees,Queen,Food,Stressors,Space,Percent_Met,Healthy
0,1,1,2016-06-15,1.0,1.0,1.0,1.0,0.0,1.0,83,No
1,2,1,2016-07-22,1.0,1.0,1.0,1.0,0.0,1.0,83,No
2,3,1,2016-08-01,1.0,1.0,1.0,1.0,1.0,1.0,100,Yes
3,4,1,2016-08-08,1.0,1.0,1.0,1.0,1.0,1.0,100,Yes
4,5,1,2016-08-15,1.0,1.0,1.0,1.0,0.0,1.0,83,No
...,...,...,...,...,...,...,...,...,...,...,...
2399,2400,188,2018-09-24,1.0,1.0,0.0,1.0,1.0,1.0,83,No
2400,2401,188,2018-10-01,1.0,1.0,0.0,1.0,1.0,1.0,83,No
2401,2402,188,2018-10-01,1.0,1.0,0.0,1.0,1.0,1.0,83,No
2402,2403,188,2018-10-08,1.0,1.0,0.0,1.0,1.0,1.0,83,No


In [7]:
# hive info dataset
hive = pd.read_csv('Hive_Information.csv')
hive

Unnamed: 0,HiveID,Hive_Tag,ApiaryID
0,1,H003-P2,1
1,2,H007-NA,1
2,3,H1,1
3,4,HT101,1
4,5,HT102,1
...,...,...,...
183,184,H04-IQBA,11
184,185,2,12
185,186,Hive #4,12
186,187,2,13


In [8]:
# hourly weather dataset
weather_hour = pd.read_csv('Hourly_Weather.csv')
weather_hour

Unnamed: 0,WeatherID,ObsID,StationID,Temperature,Humidity,Dew_Point,Wind_Direction,Wind_Speed,Wind_Gust,Pressure,Precip,Condition,Sunrise,Sunset,Daylight_Hours
0,1,1,1,54,45,33,VAR,7,0,29.54,0.0,Fair,7:07:00,19:35:00,12:28:00
1,2,2,1,52,47,32,NNW,3,0,29.56,0.0,Fair,7:07:00,19:35:00,12:28:00
2,3,3,1,51,48,32,CALM,0,0,29.57,0.0,Fair,7:07:00,19:35:00,12:28:00
3,4,4,1,48,56,33,CALM,0,0,29.57,0.0,Fair,7:07:00,19:35:00,12:28:00
4,5,5,1,46,63,34,CALM,0,0,29.58,0.0,Fair,7:07:00,19:35:00,12:28:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3667,3668,1772,3,79,25,40,N,5,0,25.60,0.0,Mostly Cloudy,6:13:00,20:37:00,14:24:00
3668,3669,1773,3,75,29,41,NNW,17,0,25.63,0.0,Partly Cloudy,6:13:00,20:37:00,14:24:00
3669,3670,1774,3,70,41,45,NNW,9,0,25.63,0.0,Partly Cloudy,6:13:00,20:37:00,14:24:00
3670,3671,1775,3,64,56,48,WSW,7,0,25.65,0.0,Partly Cloudy,6:13:00,20:37:00,14:24:00


In [9]:
# weather observations dataset
weather_obs = pd.read_csv('Weather_Observations.csv')
weather_obs

Unnamed: 0,ObsID,Date,Obs_Time,Obs_Hour
0,1,3/29/2016,0:51,0
1,2,3/29/2016,1:51,1
2,3,3/29/2016,2:51,2
3,4,3/29/2016,3:51,3
4,5,3/29/2016,4:51,4
...,...,...,...,...
1771,1772,5/13/2019,19:54,19
1772,1773,5/13/2019,20:54,20
1773,1774,5/13/2019,21:54,21
1774,1775,5/13/2019,22:54,22


In [10]:
# weather stations dataset
weather_station = pd.read_csv('Weather_Stations.csv')
weather_station

Unnamed: 0,StationID,Station_City,Station
0,1,Clayton,KRDU
1,2,Durham,KRDU
2,3,Salt Lake City,KSLC


Research Question: What are the internal and external factors that affect honey production? 
Do healthy hives produce more honey?

Something to note: The scale_data dataset was not included. Therefore, other literature review will be used to see typical honey hive weight to predict honey production.

# Exploring

Looking at weather data by State to get a better understanding of location.

In [11]:
# join hourly weather with weather stations
weather_hourly_station = weather_hour.merge(weather_station, on = 'StationID', how = 'left')


# join with apiary
weather_with_state = weather_hourly_station.merge(
    apiary[['City', 'State']],
    left_on = 'Station_City',
    right_on = 'City',
    how = 'left')
weather_with_state

Unnamed: 0,WeatherID,ObsID,StationID,Temperature,Humidity,Dew_Point,Wind_Direction,Wind_Speed,Wind_Gust,Pressure,Precip,Condition,Sunrise,Sunset,Daylight_Hours,Station_City,Station,City,State
0,1,1,1,54,45,33,VAR,7,0,29.54,0.0,Fair,7:07:00,19:35:00,12:28:00,Clayton,KRDU,Clayton,NC
1,1,1,1,54,45,33,VAR,7,0,29.54,0.0,Fair,7:07:00,19:35:00,12:28:00,Clayton,KRDU,Clayton,NC
2,2,2,1,52,47,32,NNW,3,0,29.56,0.0,Fair,7:07:00,19:35:00,12:28:00,Clayton,KRDU,Clayton,NC
3,2,2,1,52,47,32,NNW,3,0,29.56,0.0,Fair,7:07:00,19:35:00,12:28:00,Clayton,KRDU,Clayton,NC
4,3,3,1,51,48,32,CALM,0,0,29.57,0.0,Fair,7:07:00,19:35:00,12:28:00,Clayton,KRDU,Clayton,NC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14755,3672,1776,3,67,51,48,CALM,0,0,25.66,0.0,Mostly Cloudy,6:13:00,20:37:00,14:24:00,Salt Lake City,KSLC,Salt Lake City,UT
14756,3672,1776,3,67,51,48,CALM,0,0,25.66,0.0,Mostly Cloudy,6:13:00,20:37:00,14:24:00,Salt Lake City,KSLC,Salt Lake City,UT
14757,3672,1776,3,67,51,48,CALM,0,0,25.66,0.0,Mostly Cloudy,6:13:00,20:37:00,14:24:00,Salt Lake City,KSLC,Salt Lake City,UT
14758,3672,1776,3,67,51,48,CALM,0,0,25.66,0.0,Mostly Cloudy,6:13:00,20:37:00,14:24:00,Salt Lake City,KSLC,Salt Lake City,UT


In [12]:
weather_with_state['State'].unique()

array(['NC', 'UT'], dtype=object)

Now that I have this dataframe with weather information by State, I can look into the differences between Utah and North Carolina.

In [13]:
# boxplot of temperature by State

# not plotting so doing some more digging
weather_with_state['Temperature']

weather_with_state.dtypes

weather_with_state.describe(include='all')


# Temperature is currently object. Need to convert to numeric
#weather_with_state.boxplot(by = 'State', column = ['Temperature'], grid = False)

Unnamed: 0,WeatherID,ObsID,StationID,Temperature,Humidity,Dew_Point,Wind_Direction,Wind_Speed,Wind_Gust,Pressure,Precip,Condition,Sunrise,Sunset,Daylight_Hours,Station_City,Station,City,State
count,14760.0,14760.0,14760.0,14696.0,14696.0,14696.0,14656,14696.0,14696.0,14696.0,14696.0,14696,14760,14760,14760,14760,14760,14760,14760
unique,,,,63.0,80.0,58.0,18,20.0,20.0,122.0,6.0,19,52,58,62,3,2,3,2
top,,,,76.0,93.0,72.0,CALM,0.0,0.0,29.6,0.0,Fair,6:35:00,20:06:00,13:31:00,Durham,KRDU,Durham,NC
freq,,,,648.0,800.0,752.0,2878,2926.0,13508.0,536.0,14576.0,4912,960,768,768,10800,14400,10800,14400
mean,2284.304878,846.060976,1.780488,,,,,,,,,,,,,,,,
std,949.67358,508.082482,0.469172,,,,,,,,,,,,,,,,
min,1.0,1.0,1.0,,,,,,,,,,,,,,,,
25%,1815.75,387.0,2.0,,,,,,,,,,,,,,,,
50%,2430.5,827.0,2.0,,,,,,,,,,,,,,,,
75%,3045.25,1288.0,2.0,,,,,,,,,,,,,,,,
