In [2]:
# Importing libraries we are going to use
import pandas as pd
import numpy as np

#Split the data into training and testing set
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

#Measuring accuracy on Testing Data
from sklearn import metrics

In [3]:
data = pd.read_csv('outbreaks.csv',sep=',')
data

Unnamed: 0,Year,Month,State,Location,Food,Ingredient,Species,Serotype/Genotype,Status,Illnesses,Hospitalizations,Fatalities
0,1998,January,California,Restaurant,,,,,,20,0.0,0.0
1,1998,January,California,,Custard,,,,,112,0.0,0.0
2,1998,January,California,Restaurant,,,,,,35,0.0,0.0
3,1998,January,California,Restaurant,"Fish, Ahi",,Scombroid toxin,,Confirmed,4,0.0,0.0
4,1998,January,California,Private Home/Residence,"Lasagna, Unspecified; Eggs, Other",,Salmonella enterica,Enteritidis,Confirmed,26,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
19114,2015,December,Wisconsin,Restaurant,,,Norovirus genogroup II,GII_14,Confirmed,4,0.0,0.0
19115,2015,December,Wisconsin,Private Home/Residence,Salsa,,Norovirus genogroup II,GII_2,Confirmed,16,0.0,0.0
19116,2015,December,Wisconsin,Nursing Home/Assisted Living Facility,,,Norovirus genogroup II,GII_17 Kawasaki (2014),Confirmed,43,3.0,0.0
19117,2015,December,Wisconsin,Fast Food Restaurant,,,Norovirus genogroup II,GII_2,Confirmed,5,0.0,0.0


## Data preprocessing

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19119 entries, 0 to 19118
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               19119 non-null  int64  
 1   Month              19119 non-null  object 
 2   State              19119 non-null  object 
 3   Location           16953 non-null  object 
 4   Food               10156 non-null  object 
 5   Ingredient         1876 non-null   object 
 6   Species            12500 non-null  object 
 7   Serotype/Genotype  3907 non-null   object 
 8   Status             12500 non-null  object 
 9   Illnesses          19119 non-null  int64  
 10  Hospitalizations   15494 non-null  float64
 11  Fatalities         15518 non-null  float64
dtypes: float64(2), int64(2), object(8)
memory usage: 1.8+ MB


In [5]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,19119.0,2005.562373,5.158404,1998.0,2001.0,2005.0,2010.0,2015.0
Illnesses,19119.0,19.537162,49.452417,2.0,3.0,8.0,19.0,1939.0
Hospitalizations,15494.0,0.947528,5.312772,0.0,0.0,0.0,1.0,308.0
Fatalities,15518.0,0.021717,0.386976,0.0,0.0,0.0,0.0,33.0


In [6]:
data.isnull().sum()

Year                     0
Month                    0
State                    0
Location              2166
Food                  8963
Ingredient           17243
Species               6619
Serotype/Genotype    15212
Status                6619
Illnesses                0
Hospitalizations      3625
Fatalities            3601
dtype: int64

In [7]:

data = data.drop(['Serotype/Genotype','Ingredient'], axis=1)

In [8]:
data = data.dropna()

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5202 entries, 3 to 19115
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Year              5202 non-null   int64  
 1   Month             5202 non-null   object 
 2   State             5202 non-null   object 
 3   Location          5202 non-null   object 
 4   Food              5202 non-null   object 
 5   Species           5202 non-null   object 
 6   Status            5202 non-null   object 
 7   Illnesses         5202 non-null   int64  
 8   Hospitalizations  5202 non-null   float64
 9   Fatalities        5202 non-null   float64
dtypes: float64(2), int64(2), object(6)
memory usage: 447.0+ KB


In [10]:
## There are not null values anymores
data.isnull().sum()

Year                0
Month               0
State               0
Location            0
Food                0
Species             0
Status              0
Illnesses           0
Hospitalizations    0
Fatalities          0
dtype: int64

In [11]:
data.head()

Unnamed: 0,Year,Month,State,Location,Food,Species,Status,Illnesses,Hospitalizations,Fatalities
3,1998,January,California,Restaurant,"Fish, Ahi",Scombroid toxin,Confirmed,4,0.0,0.0
4,1998,January,California,Private Home/Residence,"Lasagna, Unspecified; Eggs, Other",Salmonella enterica,Confirmed,26,3.0,0.0
7,1998,January,California,Restaurant,"Stuffing, Unspecified; Sandwich, Turkey",Salmonella enterica,Confirmed,4,3.0,0.0
15,1998,January,Florida,Restaurant,"Ethnic Style, Unspecified",Clostridium perfringens,Suspected,3,0.0,0.0
17,1998,January,Florida,Restaurant,"Ground Beef, Cheeseburger",Staphylococcus aureus,Suspected,2,0.0,0.0


## Data Preparation

In [12]:
cat_vars = ['Year','State','Location','Illnesses','Species','Status','Month','Hospitalizations','Fatalities']
for var in cat_vars:
    cat_list ='var'+'_'+var
    cat_list = pd.get_dummies(data[var], prefix=var)
    data1 = data.join(cat_list)
    data = data1
    
data_vars = data.columns.values.tolist()
to_keep = [i for i in data_vars if i not in cat_vars]

data_final = data[to_keep]
data_final.columns.values

array(['Food', 'Year_1998', 'Year_1999', 'Year_2000', 'Year_2001',
       'Year_2002', 'Year_2003', 'Year_2004', 'Year_2005', 'Year_2006',
       'Year_2007', 'Year_2008', 'Year_2009', 'Year_2010', 'Year_2011',
       'Year_2012', 'Year_2013', 'Year_2014', 'Year_2015',
       'State_Alabama', 'State_Alaska', 'State_Arizona', 'State_Arkansas',
       'State_California', 'State_Colorado', 'State_Connecticut',
       'State_Delaware', 'State_Florida', 'State_Georgia', 'State_Guam',
       'State_Hawaii', 'State_Idaho', 'State_Illinois', 'State_Indiana',
       'State_Iowa', 'State_Kansas', 'State_Kentucky', 'State_Louisiana',
       'State_Maine', 'State_Maryland', 'State_Massachusetts',
       'State_Michigan', 'State_Minnesota', 'State_Mississippi',
       'State_Missouri', 'State_Montana', 'State_Multistate',
       'State_Nebraska', 'State_Nevada', 'State_New Hampshire',
       'State_New Jersey', 'State_New Mexico', 'State_New York',
       'State_North Carolina', 'State_North Dakota

In [13]:
## Segregating by Features (X1, X2,...,Xk) y Label (y)
X = data_final.loc[:, data_final.columns != 'Food']
y = data_final.loc[:, data_final.columns == 'Food']

In [14]:
## Split into train and validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
clf = KNeighborsClassifier(n_neighbors=2)

#Printing all the parameters of KNN
print(clf)

KNeighborsClassifier(n_neighbors=2)


In [23]:
#Creating the model on Training Data
KNN=clf.fit(X_train,y_train)
y_pred=KNN.predict(X_test)

  return self._fit(X, y)


In [24]:
#Measuring accuracy on Testing Data
KNN.score(X_train, y_train)
pd.DataFrame(y_pred)

Unnamed: 0,0
0,"Chicken, Strips"
1,"Chicken, Tandori"
2,Pizza
3,"Enchilada, Chicken"
4,"Crab, Cooked"
...,...
1036,"Fish, Baracuda"
1037,"Fish, Escolar"
1038,"Fish, Grouper"
1039,"Chicken, Unspecified"


This is showing the relation between the ilnesses and the food.

In [25]:
pd.DataFrame(y_test)

Unnamed: 0,Food
9950,Creme-Filled Doughnut
352,"Chicken, Unspecified; Cake, Unspecified"
17987,"Tamale, Unspecified"
8570,"Sandwich, Unspecified"
16247,Mexican Spicy Meat Stew
...,...
7101,"Fish, Amberjack"
4456,Cantaloupe
17980,"Fish, Almaco Jack"
214,"Hot Dog, Unspecified"
