In [1]:
import pandas as pd
import numpy as np
import streamlit
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
df = pd.read_csv('DSP_13_eng.csv', sep=';')

In [3]:
df.head()

Unnamed: 0,symptoms,age,disease,height,medicines,health
0,1,,0,170.0,2,0
1,1,65.0,0,,3,0
2,1,55.0,0,182.0,4,0
3,1,44.0,0,187.0,3,0
4,1,77.0,0,165.0,2,0


In [4]:
df.describe()

Unnamed: 0,symptoms,age,disease,height,medicines,health
count,60.0,55.0,60.0,55.0,60.0,60.0
mean,2.533333,44.163636,1.416667,183.345455,2.5,0.5
std,1.294927,18.491002,1.618659,12.561284,1.033342,0.504219
min,1.0,11.0,0.0,159.0,1.0,0.0
25%,2.0,32.0,0.0,169.0,2.0,0.0
50%,2.0,43.0,1.0,187.0,2.5,0.5
75%,4.0,55.5,2.0,194.0,3.0,1.0
max,5.0,77.0,5.0,200.0,4.0,1.0


In [5]:
df.isna().any()

symptoms     False
age           True
disease      False
height        True
medicines    False
health       False
dtype: bool

In [6]:
df = df.rename(columns={'symptoms ':'symptoms', 'height ':'height'})

In [7]:
columns = ['symptoms', 'age', 'disease', 'height', 'medicines']

In [8]:
df = df.dropna()

In [9]:
df.describe()

Unnamed: 0,symptoms,age,disease,height,medicines,health
count,50.0,50.0,50.0,50.0,50.0,50.0
mean,2.66,44.26,1.54,183.8,2.64,0.54
std,1.318781,18.287734,1.692842,12.404739,1.025392,0.503457
min,1.0,11.0,0.0,159.0,1.0,0.0
25%,2.0,32.0,0.0,171.5,2.0,0.0
50%,2.0,42.5,1.0,187.0,3.0,1.0
75%,4.0,55.0,2.0,194.0,3.0,1.0
max,5.0,77.0,5.0,200.0,4.0,1.0


In [10]:
df.max()

symptoms       5.0
age           77.0
disease        5.0
height       200.0
medicines      4.0
health         1.0
dtype: float64

In [11]:
df.min()

symptoms       1.0
age           11.0
disease        0.0
height       159.0
medicines      1.0
health         0.0
dtype: float64

In [12]:
X = df[columns]
y = df['health']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [14]:
rf_5 = RandomForestClassifier(max_depth = 5)

In [15]:
rf_5.fit(X_train, y_train)

In [16]:
y_pred = rf_5.predict(X_test)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.3
Mean Squared Error: 0.3
Root Mean Squared Error: 0.5477225575051661


In [17]:
rf_8 = RandomForestClassifier(max_depth = 8)

In [18]:
rf_8.fit(X_train, y_train)

In [19]:
y_pred = rf_8.predict(X_test)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.2
Mean Squared Error: 0.2
Root Mean Squared Error: 0.4472135954999579


In [20]:
pickle.dump(rf_5, open('rf_5_model.pkl', 'wb'))

In [21]:
pickle.dump(rf_8, open('rf_8_model.pkl', 'wb'))