In [2]:
import arviz as az
import numpy as np
import pymc as pm
import pandas as pd

%load_ext lab_black
%load_ext watermark

# Ants

An example of Poisson regression.

Data can be found [here](https://raw.githubusercontent.com/areding/6420-pymc/main/data/ants.csv).

Adapted from [unit 7: ants.odc](https://raw.githubusercontent.com/areding/6420-pymc/main/original_examples/Codes4Unit7/ants.odc).

## Associated lecture video: Unit 7 Lesson 15

In [3]:
%%html
<iframe width="560" height="315" src="https://www.youtube.com/embed?v=xomK4tcePmc&list=PLv0FeK5oXK4l-RdT6DWJj0_upJOG2WKNO&index=77" frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>

## Problem statement

The data discussed in Gotelli and Ellison (2002) provide the ant species richness (number of ant species) found in 64-square-meter sampling grids in 22 forests (coded as 1) and 22 bogs (coded as 2) surrounding the forests in  Connecticut, Massachusetts, and Vermont. The sites span 3Ê• of latitude in New England. There are 44 observations on four variables (columns in data set): 

- Ants: number of species, 
- Habitat: forests (1) and bogs (2), 
- Latitude
- Elevation: in meters above sea level.

(a) Using Poisson regression, model the number of ant species (Ants) with covariates Habitat and Elevation.  
(b) For a sampling grid unit located in a forest at the elevation of 100 m how many species the model from (a) predicts? For the model coefficients and the prediction report 95% credible  sets.

In [5]:
data = pd.read_csv("../data/ants.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   ants       44 non-null     int64
 1   habitat    44 non-null     int64
 2   elevation  44 non-null     int64
dtypes: int64(3)
memory usage: 1.2 KB


In [7]:
with pm.Model() as m:
    ant_species = pm.Data("ant_species", data["ants"].to_numpy(), mutable=False)
    habitat = pm.Data("habitat", data["habitat"].to_numpy(), mutable=True)
    elevation = pm.Data("elevation", data["elevation"].to_numpy(), mutable=True)

    beta0 = pm.Normal("beta0_intercept", mu=0, tau=0.0001)
    beta1 = pm.Normal("beta1_habitat", mu=0, tau=0.0001)
    beta2 = pm.Normal("beta2_elevation", mu=0, tau=0.0001)

    μ = pm.math.exp(beta0 + beta1 * habitat + beta2 * elevation)

    y = pm.Poisson("y", mu=μ, observed=ant_species)

    trace = pm.sample(
        5000,
        chains=4,
        tune=2000,
        cores=4,
        init="adapt_diag",
        random_seed=1,
        return_inferencedata=True,
    )

Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta0_intercept, beta1_habitat, beta2_elevation]


  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
Sampling 4 chains for 2_000 tune and 5_000 draw iterations (8_000 + 20_000 draws total) took 11 seconds.


In [8]:
az.summary(trace)

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta0_intercept,3.174,0.188,2.822,3.532,0.002,0.002,6934.0,7313.0,1.0
beta1_habitat,-0.64,0.12,-0.862,-0.412,0.001,0.001,7425.0,7886.0,1.0
beta2_elevation,-0.001,0.0,-0.002,-0.001,0.0,0.0,8855.0,8231.0,1.0


In [9]:
# prediction
with m:
    pm.set_data({"habitat": [1], "elevation": [100]})
    ppc = pm.sample_posterior_predictive(trace)

az.summary(ppc)



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
y[0],10.866,3.417,5.0,17.0,0.026,0.018,17760.0,18913.0,1.0


In [10]:
%watermark --iversions -v

Python implementation: CPython
Python version       : 3.10.4
IPython version      : 8.3.0

arviz : 0.12.1
pandas: 1.4.2
numpy : 1.22.3
pymc  : 4.0.0b5

