In [1]:
# Importing libraries we are going to use
import pandas as pd
import numpy as np

## Normalization
from sklearn.preprocessing import StandardScaler

## Encoding
from sklearn.preprocessing import LabelEncoder

## Models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Poisson Regression Module
from sklearn.linear_model import PoissonRegressor
## Metrics
from sklearn import metrics
from sklearn.metrics import roc_curve

## Warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('outbreaks.csv',sep=',')
data

Unnamed: 0,Year,Month,State,Location,Food,Ingredient,Species,Serotype/Genotype,Status,Illnesses,Hospitalizations,Fatalities
0,1998,January,California,Restaurant,,,,,,20,0.0,0.0
1,1998,January,California,,Custard,,,,,112,0.0,0.0
2,1998,January,California,Restaurant,,,,,,35,0.0,0.0
3,1998,January,California,Restaurant,"Fish, Ahi",,Scombroid toxin,,Confirmed,4,0.0,0.0
4,1998,January,California,Private Home/Residence,"Lasagna, Unspecified; Eggs, Other",,Salmonella enterica,Enteritidis,Confirmed,26,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
19114,2015,December,Wisconsin,Restaurant,,,Norovirus genogroup II,GII_14,Confirmed,4,0.0,0.0
19115,2015,December,Wisconsin,Private Home/Residence,Salsa,,Norovirus genogroup II,GII_2,Confirmed,16,0.0,0.0
19116,2015,December,Wisconsin,Nursing Home/Assisted Living Facility,,,Norovirus genogroup II,GII_17 Kawasaki (2014),Confirmed,43,3.0,0.0
19117,2015,December,Wisconsin,Fast Food Restaurant,,,Norovirus genogroup II,GII_2,Confirmed,5,0.0,0.0


This a dataset from ***Kaggle*** public database. 

**Question to answer:** ***Is there a relationship between the food borne diseases in the U.S.?***

## Data preprocessing

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19119 entries, 0 to 19118
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               19119 non-null  int64  
 1   Month              19119 non-null  object 
 2   State              19119 non-null  object 
 3   Location           16953 non-null  object 
 4   Food               10156 non-null  object 
 5   Ingredient         1876 non-null   object 
 6   Species            12500 non-null  object 
 7   Serotype/Genotype  3907 non-null   object 
 8   Status             12500 non-null  object 
 9   Illnesses          19119 non-null  int64  
 10  Hospitalizations   15494 non-null  float64
 11  Fatalities         15518 non-null  float64
dtypes: float64(2), int64(2), object(8)
memory usage: 1.8+ MB


In [4]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,19119.0,2005.562373,5.158404,1998.0,2001.0,2005.0,2010.0,2015.0
Illnesses,19119.0,19.537162,49.452417,2.0,3.0,8.0,19.0,1939.0
Hospitalizations,15494.0,0.947528,5.312772,0.0,0.0,0.0,1.0,308.0
Fatalities,15518.0,0.021717,0.386976,0.0,0.0,0.0,0.0,33.0


+ The first thing that we can found here is that the years are between 2005 and 2015.
+ Fatalities from meat consumption don't exceed 33 deaths.

In [5]:
data.isnull().sum()

Year                     0
Month                    0
State                    0
Location              2166
Food                  8963
Ingredient           17243
Species               6619
Serotype/Genotype    15212
Status                6619
Illnesses                0
Hospitalizations      3625
Fatalities            3601
dtype: int64

As we can see above, there are columns that have more than the **30 %** of rows with missing data. So we have to procceed with allocating some columns by deleting them and
other ones replacing deleting the rows where are missing values. 

In [6]:

data = data.drop(['Serotype/Genotype','Ingredient'], axis=1)

In [7]:
data = data.dropna()

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5202 entries, 3 to 19115
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Year              5202 non-null   int64  
 1   Month             5202 non-null   object 
 2   State             5202 non-null   object 
 3   Location          5202 non-null   object 
 4   Food              5202 non-null   object 
 5   Species           5202 non-null   object 
 6   Status            5202 non-null   object 
 7   Illnesses         5202 non-null   int64  
 8   Hospitalizations  5202 non-null   float64
 9   Fatalities        5202 non-null   float64
dtypes: float64(2), int64(2), object(6)
memory usage: 447.0+ KB


In [10]:
## There are not null values anymores
data.isnull().sum()

Year                0
Month               0
State               0
Location            0
Food                0
Species             0
Status              0
Illnesses           0
Hospitalizations    0
Fatalities          0
dtype: int64

In [11]:
data.head()

Unnamed: 0,Year,Month,State,Location,Food,Species,Status,Illnesses,Hospitalizations,Fatalities
3,1998,January,California,Restaurant,"Fish, Ahi",Scombroid toxin,Confirmed,4,0.0,0.0
4,1998,January,California,Private Home/Residence,"Lasagna, Unspecified; Eggs, Other",Salmonella enterica,Confirmed,26,3.0,0.0
7,1998,January,California,Restaurant,"Stuffing, Unspecified; Sandwich, Turkey",Salmonella enterica,Confirmed,4,3.0,0.0
15,1998,January,Florida,Restaurant,"Ethnic Style, Unspecified",Clostridium perfringens,Suspected,3,0.0,0.0
17,1998,January,Florida,Restaurant,"Ground Beef, Cheeseburger",Staphylococcus aureus,Suspected,2,0.0,0.0


## Data Preparation

Let's create dummy variables, to use a **regression equation** to represent multiple groups.

In [12]:
cat_vars = ['Year','State','Location','Food','Species','Status','Month','Hospitalizations','Fatalities']
for var in cat_vars:
    cat_list ='var'+'_'+var
    cat_list = pd.get_dummies(data[var], prefix=var)
    data1 = data.join(cat_list)
    data = data1
    
data_vars = data.columns.values.tolist()
to_keep = [i for i in data_vars if i not in cat_vars]

data_final = data[to_keep]
data_final.columns.values


array(['Illnesses', 'Year_1998', 'Year_1999', ..., 'Fatalities_5.0',
       'Fatalities_7.0', 'Fatalities_33.0'], dtype=object)

In [13]:
## Segregating by Features (X1, X2,...,Xk) y Label (y)
X = data_final.loc[:, data_final.columns != 'Illnesses']
y = data_final.loc[:, data_final.columns == 'Illnesses']

In [14]:
## Split into train and validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## regression model

In [15]:
## Create linear regression object
model = PoissonRegressor()

## Train the model using the training sets
model.fit(X_train, y_train)

## Predictions
y_pred = model.predict(X_test)

# Predicted dataframe
pd.DataFrame(y_pred)


0.1473414477898783


Unnamed: 0,0
0,15.374143
1,15.389971
2,40.529329
3,16.431656
4,31.352060
...,...
1556,19.745381
1557,19.360296
1558,32.602802
1559,10.633788


In [18]:
# Real values
pd.DataFrame(y_test)

Unnamed: 0,Illnesses
2039,16
9445,18
9601,54
17788,15
4097,40
...,...
14508,64
6267,23
14477,32
2560,4


In [16]:
from sklearn import linear_model
ridge = linear_model.Ridge(alpha=10)
ridge.fit(X_train, y_train)

In [17]:
training_ridge = ridge.score(X_train, y_train)
print("training accuracy with ridge ",training_ridge )

training accuracy with ridge  0.41350272598843496


Looking the acurracy of the training, we can see that probably this is not a regression problem but  a **clasiffication**. So we should implement it to see how the precision improves.