# **Setup**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
import io

# **Download Data**

In [None]:
# Download the dataset from Google Drive (by file ID) with the gdown CLI.
!gdown 1Qy4oNXeweh7UUD87jZrBF4b1fWOAqXf0

Downloading...
From: https://drive.google.com/uc?id=1Qy4oNXeweh7UUD87jZrBF4b1fWOAqXf0
To: /content/FoodData.csv
  0% 0.00/11.4k [00:00<?, ?B/s]100% 11.4k/11.4k [00:00<00:00, 30.1MB/s]


In [None]:
# Path of the CSV fetched by the download cell.
data_filepath = '/content/FoodData.csv'

# Read the file and keep only the rows that have a value in every column.
raw_data = pd.read_csv(data_filepath).dropna()

In [None]:
# Persist the NaN-free table (index included) as UTF-8 CSV.
clear_data_path = 'clear_data.csv'
raw_data.to_csv(clear_data_path, encoding='utf-8')

# **Pre Processing**

## -Data Visualisasi

In [None]:
# --- Tabular overview of the raw (NaN-free) data --------------------------
print('==============================')
# head(-1) returns every row except the last one.
print(raw_data.head(-1))
print(raw_data.describe())
print(raw_data['Allergy'].value_counts().head())
print('==============================')
print(raw_data['Class'].unique())
print(raw_data['Type'].unique())
print(raw_data['Group'].unique())
print(raw_data['Allergy'].unique())
print('==============================')

# --- Class -> Type -> Group hierarchy --------------------------------------
ClassData = raw_data['Class'].unique()
TypeData = raw_data['Type'].unique()
GroupData = raw_data['Group'].unique()
for class_val in ClassData:
    print(class_val)
    class_rows = raw_data[raw_data['Class'] == class_val]
    for type_val in class_rows['Type'].unique():
        print('     ', type_val)
        # Narrow down *within the current class*. Filtering the whole table by
        # Type alone would also list groups that belong to other classes which
        # happen to share the same Type value.
        type_rows = class_rows[class_rows['Type'] == type_val]
        for group_val in type_rows['Group'].unique():
            print('          ', group_val)
print('==============================')

## -Dokumentasi Data

In [None]:
# Dump every food name into a plain-text file, each name followed by ", ".
food_entries = [food + ', ' for food in raw_data['Food']]

# Open in write mode so re-running the cell rewrites the file instead of
# appending a duplicate copy of the list; the context manager guarantees the
# handle is closed even if the write fails.
with open("raw food.txt", "w", encoding="utf-8") as food_file:
    food_file.write(' '.join(food_entries))

## -Processing

In [None]:
def _encode_column(column):
    """Return the integer codes pd.factorize assigns to the values of `column`."""
    codes, _uniques = pd.factorize(column)
    return codes


# Encode every column independently: each distinct value becomes an integer
# code numbered in order of first appearance within that column.
clear_data = raw_data.apply(_encode_column)
clear_data

Unnamed: 0,Class,Type,Group,Food,Allergy
0,0,0,0,0,0
1,0,1,1,1,1
2,0,1,2,2,2
3,0,2,3,3,3
4,0,2,4,4,4
...,...,...,...,...,...
179,0,3,7,156,7
180,1,5,12,157,13
181,0,3,8,158,8
182,0,2,26,159,29


## -Clear Data Visualisasi

In [None]:
# --- Summary of the integer-encoded table ----------------------------------
print('==============================')
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
clear_data.info()
print(clear_data.describe())
print('==============================')
print(clear_data['Class'].unique())
print(clear_data['Type'].unique())
print(clear_data['Group'].unique())
print('==============================')

# --- Correlation heatmap of the encoded columns ----------------------------
# Note: this changes the pandas float display format for the rest of the session.
pd.options.display.float_format = '{:,.2f}'.format
plt.figure(figsize=(16,10))
sns.heatmap(clear_data.corr(), annot=True)
plt.show()
print('==============================')

# **Train Model** 

## -ScikitLearn

### --Setup

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

### --Train

In [None]:
# Feature matrix: the encoded Class / Type / Group columns.
feature_columns = ['Class', 'Type', 'Group']
X = clear_data[feature_columns]

# Target vector: the encoded allergy label.
target_column = 'Allergy'
y = clear_data[target_column]

In [None]:
# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the tree as well: scikit-learn permutes the candidate features at every
# split, so an unseeded tree can differ (and score differently) between runs
# even though the train/test split is fixed.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Score the fitted tree on the held-out rows.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7575757575757576


### --Test

In [None]:
# Build the sample with the same column names the classifier was fitted on.
# Passing a bare list makes scikit-learn warn that the input has no valid
# feature names and silently relies on positional column order.
new_food = pd.DataFrame([[0, 0, 0]], columns=['Class', 'Type', 'Group'])
predicted_label_encoded = clf.predict(new_food)[0]

# Map the predicted integer label back to its original value.
# pd.factorize numbers values in order of first appearance, which is the same
# order Series.unique() returns them in, so the code indexes directly into it.
predicted_label = raw_data['Allergy'].unique()[predicted_label_encoded]
print(predicted_label)

Seed Allergy




# **Export Model**

In [None]:
import pickle

# Serialize the fitted classifier to disk in binary pickle format.
model_path = 'model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Show the textual representation of the trained estimator.
print(clf)

DecisionTreeClassifier()


In [None]:
import json
import numpy as np
from sklearn.base import BaseEstimator

# Assumes a trained scikit-learn estimator is already available as `clf`
def remove_circular_refs(ob, _seen=None):
    """Return a copy of `ob` in which circular references are replaced by None.

    Dicts, lists, tuples, sets and frozensets are rebuilt recursively (keeping
    their type); any other object is returned as-is. An object counts as
    circular only when it is encountered again *inside itself*, so the same
    object appearing twice side by side is preserved.

    Parameters:
        ob: the object to sanitize.
        _seen: internal set with the ids of the containers currently being
            traversed; callers normally leave it unset.
    """
    active_ids = set() if _seen is None else _seen
    marker = id(ob)

    # Already on the current traversal path -> this is a cycle, cut it here.
    if marker in active_ids:
        return None

    active_ids.add(marker)
    if isinstance(ob, dict):
        cleaned = {}
        for key, value in ob.items():
            clean_key = remove_circular_refs(key, active_ids)
            cleaned[clean_key] = remove_circular_refs(value, active_ids)
    elif isinstance(ob, (list, tuple, set, frozenset)):
        cleaned = type(ob)(remove_circular_refs(item, active_ids) for item in ob)
    else:
        cleaned = ob

    # Leave the path again: only *nested* re-occurrences count as circular.
    active_ids.remove(marker)
    return cleaned
# Convert non-serializable objects to serializable format
def convert_np_arrays(obj):
    """Convert NumPy arrays and NumPy scalars to plain Python objects.

    Parameters:
        obj: any value.

    Returns:
        A (nested) list for an ``np.ndarray``, the equivalent built-in Python
        value for a NumPy scalar (e.g. ``np.int64`` -> ``int``), and `obj`
        itself, unchanged, for everything else.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # NumPy scalars such as np.int64 are not JSON serializable either; when
    # this function is used as the `default` hook of json.dump, returning them
    # unchanged makes json raise "Circular reference detected".
    if isinstance(obj, np.generic):
        return obj.item()
    return obj

# Convert the model to a JSON-serializable dictionary and save it.
#
# The estimator's __dict__ cannot be passed to json.dump directly: it holds the
# fitted `tree_` object and NumPy scalars, neither of which json can encode.
# Both are converted explicitly below.

# Array-valued attributes of the fitted tree that are exported.
TREE_ARRAY_FIELDS = (
    'children_left',
    'children_right',
    'feature',
    'threshold',
    'impurity',
    'n_node_samples',
    'weighted_n_node_samples',
    'value',
)


def _to_jsonable(value):
    """Recursively turn NumPy arrays/scalars inside `value` into plain Python objects."""
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, dict):
        return {key: _to_jsonable(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_jsonable(item) for item in value]
    return value


def _tree_to_dict(tree):
    """Describe a fitted tree as a dict of its node count, depth and node arrays."""
    tree_state = {'node_count': tree.node_count, 'max_depth': tree.max_depth}
    for field in TREE_ARRAY_FIELDS:
        tree_state[field] = getattr(tree, field)
    return tree_state


# Work on a shallow copy so the live estimator is never modified.
model_state = dict(clf.__dict__)
if 'tree_' in model_state:
    model_state['tree_'] = _tree_to_dict(model_state['tree_'])

model_dict = {
    'class_name': clf.__class__.__name__,
    'model_params': _to_jsonable(clf.get_params()),
    'model_state': _to_jsonable(model_state),
}

# Save the model dictionary to a JSON file. No `default` hook is passed, so any
# value that is still not serializable is reported by json with a TypeError
# naming the offending type.
with open('model.json', 'w') as f:
    json.dump(model_dict, f)


ValueError: ignored