# Covid Symptom Identification

Project developed by:

- Ana Teresa Cruz (up201806460)
- André Nascimento (up201806461)
- António Bezerra (up201806854)


In [None]:
import pandas as pd

# Load the records from the CSV file into a DataFrame.
dataset = pd.read_csv('large_data.csv')

# Preview the first rows (shown as the cell output).
dataset.head()

In [None]:
# Summary statistics of the dataset (shown as the cell output).
dataset.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb

# Correlate only the numeric/boolean columns. The 'TYPE' label column holds
# strings ('ALLERGY', 'COLD', ...), and recent pandas versions raise on
# non-numeric columns instead of silently dropping them.
dataset_corr = dataset.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(30,30))
# Annotated heatmap of the pairwise correlations (shown as the cell output).
sb.heatmap(dataset_corr, annot=True)

### All data hist

In [None]:
# One histogram per column; the bin edges put values near 0 and values near 1
# in the two outer bins, with everything in between in the middle bin.
dataset.hist(bins=[-0.05, 0.05, 0.95, 1.05], range=(0,1), figsize=(22, 16))

### Allergy data hist

In [None]:
# Keep only the rows labelled ALLERGY and plot their per-column histograms.
allergy_data = dataset[dataset['TYPE'] == 'ALLERGY']
allergy_data.hist(
    bins=[-0.05, 0.05, 0.95, 1.05],
    range=(0, 1),
    figsize=(22, 16),
)

### Cold data hist

In [None]:
# Keep only the rows labelled COLD and plot their per-column histograms.
cold_data = dataset[dataset['TYPE'] == 'COLD']
cold_data.hist(
    bins=[-0.05, 0.05, 0.95, 1.05],
    range=(0, 1),
    figsize=(22, 16),
)

### Covid data hist

In [None]:
# Keep only the rows labelled COVID and plot their per-column histograms.
covid_data = dataset[dataset['TYPE'] == 'COVID']
covid_data.hist(
    bins=[-0.05, 0.05, 0.95, 1.05],
    range=(0, 1),
    figsize=(22, 16),
)

### Flu data hist

In [None]:
# Keep only the rows labelled FLU and plot their per-column histograms.
flu_data = dataset[dataset['TYPE'] == 'FLU']
flu_data.hist(
    bins=[-0.05, 0.05, 0.95, 1.05],
    range=(0, 1),
    figsize=(22, 16),
)

## Data preprocessing

In [None]:
# Store the class label as a pandas categorical.
dataset['TYPE'] = dataset['TYPE'].astype('category')

# Every column except the label is a model input.
col_names = [column for column in dataset.columns if column != 'TYPE']

# Feature matrix and label vector for the original (unsampled) dataset.
labels = dataset['TYPE'].values
inputs = dataset[col_names].values

#### Undersampling data

In [None]:
# all types with 1024 samples
# ALLERGY, COVID and FLU are down-sampled (without replacement, fixed seed) to
# 1024 rows each. COLD is taken whole — presumably it has exactly 1024 rows;
# TODO confirm against the data.
us_dataset = pd.concat([allergy_data.sample(n=1024, random_state=1),
                                  cold_data,
                                  covid_data.sample(n=1024, random_state=1),
                                  flu_data.sample(n=1024, random_state=1)])

# Feature matrix and label vector for the undersampled dataset.
us_inputs = us_dataset[col_names].values
us_labels = us_dataset['TYPE'].values

#### Oversampling data

In [None]:
# all types with 25000 samples
# ALLERGY, COLD and COVID are re-sampled with replacement (fixed seed) to 25000
# rows each. FLU is taken whole — presumably it has about 25000 rows; TODO confirm.
# NOTE(review): rows are duplicated here before the train/test split further
# down, so copies of the same row can land in both the training and the test
# set. Consider oversampling only the training split.
os_dataset = pd.concat([allergy_data.sample(n=25000, replace=True, random_state=1),
                                  cold_data.sample(n=25000, replace=True, random_state=1),
                                  covid_data.sample(n=25000, replace=True, random_state=1),
                                  flu_data])

# Feature matrix and label vector for the oversampled dataset.
os_inputs = os_dataset[col_names].values
os_labels = os_dataset['TYPE'].values

#### Train and Test split data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of each dataset variant for testing, stratified by class label
# and with a fixed seed so the splits are reproducible.

# Original dataset.
train_in, test_in, train_classes, test_classes = train_test_split(
    inputs, labels, test_size=0.25, random_state=1, stratify=labels)

# Undersampled dataset.
us_train_in, us_test_in, us_train_classes, us_test_classes = train_test_split(
    us_inputs, us_labels, test_size=0.25, random_state=1, stratify=us_labels)

# Oversampled dataset.
os_train_in, os_test_in, os_train_classes, os_test_classes = train_test_split(
    os_inputs, os_labels, test_size=0.25, random_state=1, stratify=os_labels)

#### Scaler

In [None]:
from sklearn.preprocessing import StandardScaler  

# Standardise each dataset variant with statistics learned from its own
# training split only. The single scaler object is re-fitted per variant, so
# after this cell it holds the statistics of the oversampled training split.
scaler = StandardScaler()

# Original dataset.
train_in = scaler.fit_transform(train_in)
test_in = scaler.transform(test_in)

# Undersampled dataset.
us_train_in = scaler.fit_transform(us_train_in)
us_test_in = scaler.transform(us_test_in)

# Oversampled dataset.
os_train_in = scaler.fit_transform(os_train_in)
os_test_in = scaler.transform(os_test_in)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

## Decision Tree Classifier

#### Original dataset

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyper-parameters on the original split.
dtc = DecisionTreeClassifier().fit(train_in, train_classes)
dtc_prediction = dtc.predict(test_in)

# Score the predictions against the held-out labels.
dtc_classification_report = classification_report(test_classes, dtc_prediction)
dtc_confusion_matrix = confusion_matrix(test_classes, dtc_prediction)

print(f"--- Original dataset ---\nConfusion matrix:\n{dtc_confusion_matrix}\n\nClassification report:\n{dtc_classification_report}\n")

#### Undersampled dataset

In [None]:
# Re-fit the same decision tree object on the undersampled split and predict.
dtc_prediction = dtc.fit(us_train_in, us_train_classes).predict(us_test_in)

# Score the predictions against the held-out labels.
dtc_us_classification_report = classification_report(us_test_classes, dtc_prediction)
dtc_us_confusion_matrix = confusion_matrix(us_test_classes, dtc_prediction)

print(f"--- Undersampled dataset ---\nConfusion matrix:\n{dtc_us_confusion_matrix}\n\nClassification report:\n{dtc_us_classification_report}\n")

#### Oversampled dataset

In [None]:
# Re-fit the same decision tree object on the oversampled split and predict.
dtc_prediction = dtc.fit(os_train_in, os_train_classes).predict(os_test_in)

# Score the predictions against the held-out labels.
dtc_os_classification_report = classification_report(os_test_classes, dtc_prediction)
dtc_os_confusion_matrix = confusion_matrix(os_test_classes, dtc_prediction)

print(f"--- Oversampled dataset ---\nConfusion matrix:\n{dtc_os_confusion_matrix}\n\nClassification report:\n{dtc_os_classification_report}\n")

### Parameter Tuning (GridSearch)

In [None]:
from sklearn.model_selection import GridSearchCV

#### Original dataset

In [None]:
# Hyper-parameter values to try for the decision tree.
parameter_grid = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[11, 13, 15, 17],
    max_features=[14, 15, 16, 17],
)

# Grid search with cv=10 over every combination in the grid.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=parameter_grid,
    cv=10,
    verbose=3,
    n_jobs=-1,
)

# Tune on the original training split and report the winning configuration.
grid_search.fit(train_in, train_classes)
print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

In [None]:
# Evaluate the best estimator from the most recent grid-search fit on the
# original test split.
dtc = grid_search.best_estimator_
dtc_prediction = dtc.predict(test_in)


best_dtc_confusion_matrix = confusion_matrix(test_classes, dtc_prediction)
best_dtc_classification_report = classification_report(test_classes, dtc_prediction)


print(f"--- Improved original dataset ---\nConfusion matrix:\n{best_dtc_confusion_matrix}\n\nClassification report:\n{best_dtc_classification_report}\n") 

#### Undersampled dataset

In [None]:
# Re-run the existing grid search object on the undersampled training split.
grid_search.fit(us_train_in, us_train_classes)
print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

In [None]:
# Evaluate the best estimator from the most recent grid-search fit on the
# undersampled test split.
dtc = grid_search.best_estimator_
dtc_prediction = dtc.predict(us_test_in)


best_us_dtc_confusion_matrix = confusion_matrix(us_test_classes, dtc_prediction)
best_us_dtc_classification_report = classification_report(us_test_classes, dtc_prediction)


print(f"--- Improved undersampled dataset ---\nConfusion matrix:\n{best_us_dtc_confusion_matrix}\n\nClassification report:\n{best_us_dtc_classification_report}\n") 

#### Oversampled dataset

In [None]:
# Re-run the existing grid search object on the oversampled training split.
grid_search.fit(os_train_in, os_train_classes)
print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

In [None]:
# Evaluate the best estimator from the most recent grid-search fit on the
# oversampled test split.
dtc = grid_search.best_estimator_
dtc_prediction = dtc.predict(os_test_in)


best_os_dtc_confusion_matrix = confusion_matrix(os_test_classes, dtc_prediction)
best_os_dtc_classification_report = classification_report(os_test_classes, dtc_prediction)


print(f"--- Improved oversampled dataset ---\nConfusion matrix:\n{best_os_dtc_confusion_matrix}\n\nClassification report:\n{best_os_dtc_classification_report}\n") 

## Random Forest  !TODO!

In [None]:
from sklearn.ensemble import RandomForestClassifier

def train_evaluate_RF(train_in, test_in, train_class, test_class):
    """Fit a random forest on the training data and print its test metrics.

    Args:
        train_in: Feature matrix used to fit the classifier.
        test_in: Feature matrix the fitted classifier predicts on.
        train_class: Target labels matching ``train_in``.
        test_class: True labels matching ``test_in``.

    Returns:
        None. The confusion matrix and the classification report are printed.
    """
    # Imported locally so the function does not depend on notebook cell order.
    # The original referenced an undefined `skmetric` alias, which raised
    # NameError on every call.
    from sklearn.metrics import classification_report, confusion_matrix

    # 10 entropy-criterion trees; the fixed seed makes repeated runs build the
    # same forest.
    rf_classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)
    rf_classifier.fit(train_in, train_class)

    # Predict on the test inputs and report the metrics.
    prediction = rf_classifier.predict(test_in)
    print(confusion_matrix(test_class, prediction))
    print(classification_report(test_class, prediction))

## K-Nearest Neighbors

#### Original dataset

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours classifier with default hyper-parameters on
# the original split.
knn = KNeighborsClassifier().fit(train_in, train_classes)
knn_prediction = knn.predict(test_in)

# Score the predictions against the held-out labels.
knn_classification_report = classification_report(test_classes, knn_prediction)
knn_confusion_matrix = confusion_matrix(test_classes, knn_prediction)

print(f"--- Original dataset ---\nConfusion matrix:\n{knn_confusion_matrix}\n\nClassification report:\n{knn_classification_report}\n")

#### Undersampled dataset

In [None]:
# Re-fit the same KNN object on the undersampled split and predict.
knn_prediction = knn.fit(us_train_in, us_train_classes).predict(us_test_in)

# Score the predictions against the held-out labels.
knn_us_classification_report = classification_report(us_test_classes, knn_prediction)
knn_us_confusion_matrix = confusion_matrix(us_test_classes, knn_prediction)

print(f"--- Undersampled dataset ---\nConfusion matrix:\n{knn_us_confusion_matrix}\n\nClassification report:\n{knn_us_classification_report}\n")

#### Oversampled dataset

In [None]:
# Re-fit the same KNN object on the oversampled split and predict.
knn_prediction = knn.fit(os_train_in, os_train_classes).predict(os_test_in)

# Score the predictions against the held-out labels.
knn_os_classification_report = classification_report(os_test_classes, knn_prediction)
knn_os_confusion_matrix = confusion_matrix(os_test_classes, knn_prediction)

print(f"--- Oversampled dataset ---\nConfusion matrix:\n{knn_os_confusion_matrix}\n\nClassification report:\n{knn_os_classification_report}\n")

### Parameter Tuning

#### Original dataset

In [None]:
# Hyper-parameter values to try for the KNN classifier.
parameter_grid = dict(
    n_neighbors=[5, 8, 10, 12, 15],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

# Grid search with cv=10 over every combination in the grid.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=parameter_grid,
    cv=10,
    verbose=3,
    n_jobs=-1,
)

# Tune on the original training split and report the winning configuration.
grid_search.fit(train_in, train_classes)
print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

In [None]:
# Evaluate the best estimator from the most recent grid-search fit on the
# original test split.
knn = grid_search.best_estimator_
knn_prediction = knn.predict(test_in)


best_knn_confusion_matrix = confusion_matrix(test_classes, knn_prediction)
best_knn_classification_report = classification_report(test_classes, knn_prediction)


print(f"--- Improved original dataset ---\nConfusion matrix:\n{best_knn_confusion_matrix}\n\nClassification report:\n{best_knn_classification_report}\n") 

#### Undersampled dataset

In [None]:
# Re-run the existing grid search object on the undersampled training split.
grid_search.fit(us_train_in, us_train_classes)
print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

In [None]:
# Evaluate the best estimator from the most recent grid-search fit on the
# undersampled test split.
knn = grid_search.best_estimator_
# Predict with the tuned KNN itself. The original called `svc.predict`, which
# is undefined at this point in a top-to-bottom run and would otherwise score
# a different model under the KNN heading.
knn_prediction = knn.predict(us_test_in)


best_us_knn_confusion_matrix = confusion_matrix(us_test_classes, knn_prediction)
best_us_knn_classification_report = classification_report(us_test_classes, knn_prediction)


print(f"--- Improved undersampled dataset ---\nConfusion matrix:\n{best_us_knn_confusion_matrix}\n\nClassification report:\n{best_us_knn_classification_report}\n")

#### Oversampled dataset

In [None]:
# Re-run the existing grid search object on the oversampled training split.
grid_search.fit(os_train_in, os_train_classes)
print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

In [None]:
# Evaluate the best estimator from the most recent grid-search fit on the
# oversampled test split.
knn = grid_search.best_estimator_
knn_prediction = knn.predict(os_test_in)


best_os_knn_confusion_matrix = confusion_matrix(os_test_classes, knn_prediction)
best_os_knn_classification_report = classification_report(os_test_classes, knn_prediction)


print(f"--- Improved oversampled dataset ---\nConfusion matrix:\n{best_os_knn_confusion_matrix}\n\nClassification report:\n{best_os_knn_classification_report}\n") 

## Support-vector machines

#### Original dataset

In [None]:
from sklearn.svm import SVC

# Baseline support-vector classifier with default hyper-parameters on the
# original split.
svc = SVC().fit(train_in, train_classes)
svc_prediction = svc.predict(test_in)

# Score the predictions against the held-out labels.
svc_classification_report = classification_report(test_classes, svc_prediction)
svc_confusion_matrix = confusion_matrix(test_classes, svc_prediction)

print(f"--- Original dataset ---\nConfusion matrix:\n{svc_confusion_matrix}\n\nClassification report:\n{svc_classification_report}\n")

#### Undersampled dataset

In [None]:
# Re-fit the same SVC object on the undersampled split and predict.
svc_prediction = svc.fit(us_train_in, us_train_classes).predict(us_test_in)

# Score the predictions against the held-out labels.
svc_us_classification_report = classification_report(us_test_classes, svc_prediction)
svc_us_confusion_matrix = confusion_matrix(us_test_classes, svc_prediction)

print(f"--- Undersampled dataset ---\nConfusion matrix:\n{svc_us_confusion_matrix}\n\nClassification report:\n{svc_us_classification_report}\n")

#### Oversampled dataset

In [None]:
# Re-fit the same SVC object on the oversampled split and predict.
svc_prediction = svc.fit(os_train_in, os_train_classes).predict(os_test_in)

# Score the predictions against the held-out labels.
svc_os_classification_report = classification_report(os_test_classes, svc_prediction)
svc_os_confusion_matrix = confusion_matrix(os_test_classes, svc_prediction)

print(f"--- Oversampled dataset ---\nConfusion matrix:\n{svc_os_confusion_matrix}\n\nClassification report:\n{svc_os_classification_report}\n")

### Parameter Tuning

#### Original dataset

In [None]:
# Hyper-parameter values to try for the support-vector classifier.
parameter_grid = dict(
    C=[0.1, 1, 10],
    gamma=['scale', 'auto'],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)

# Grid search with cv=10 over every combination in the grid.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=parameter_grid,
    cv=10,
    verbose=3,
    n_jobs=-1,
)

# Tune on the original training split and report the winning configuration.
grid_search.fit(train_in, train_classes)
print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

In [None]:
# Evaluate the best estimator from the most recent grid-search fit on the
# original test split.
svc = grid_search.best_estimator_
svc_prediction = svc.predict(test_in)


best_svc_confusion_matrix = confusion_matrix(test_classes, svc_prediction)
best_svc_classification_report = classification_report(test_classes, svc_prediction)


print(f"--- Improved original dataset ---\nConfusion matrix:\n{best_svc_confusion_matrix}\n\nClassification report:\n{best_svc_classification_report}\n") 

#### Undersampled dataset

In [None]:
# Re-run the existing grid search object on the undersampled training split.
grid_search.fit(us_train_in, us_train_classes)
print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

In [None]:
# Evaluate the best estimator from the most recent grid-search fit on the
# undersampled test split.
svc = grid_search.best_estimator_
svc_prediction = svc.predict(us_test_in)


best_us_svc_confusion_matrix = confusion_matrix(us_test_classes, svc_prediction)
best_us_svc_classification_report = classification_report(us_test_classes, svc_prediction)


print(f"--- Improved undersampled dataset ---\nConfusion matrix:\n{best_us_svc_confusion_matrix}\n\nClassification report:\n{best_us_svc_classification_report}\n") 

#### Oversampled dataset

In [None]:
# Re-run the existing grid search object on the oversampled training split.
grid_search.fit(os_train_in, os_train_classes)
print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

In [None]:
# Evaluate the best estimator from the most recent grid-search fit on the
# oversampled test split.
svc = grid_search.best_estimator_
svc_prediction = svc.predict(os_test_in)


best_os_svc_confusion_matrix = confusion_matrix(os_test_classes, svc_prediction)
best_os_svc_classification_report = classification_report(os_test_classes, svc_prediction)


print(f"--- Improved oversampled dataset ---\nConfusion matrix:\n{best_os_svc_confusion_matrix}\n\nClassification report:\n{best_os_svc_classification_report}\n") 

## Neural Networks

#### Original dataset

In [None]:
from sklearn.neural_network import MLPClassifier

# Baseline multi-layer perceptron with default hyper-parameters on the
# original split.
mlp = MLPClassifier().fit(train_in, train_classes)
mlp_prediction = mlp.predict(test_in)

# Score the predictions against the held-out labels.
mlp_classification_report = classification_report(test_classes, mlp_prediction)
mlp_confusion_matrix = confusion_matrix(test_classes, mlp_prediction)

print(f"--- Original dataset ---\nConfusion matrix:\n{mlp_confusion_matrix}\n\nClassification report:\n{mlp_classification_report}\n")

#### Undersampled dataset

In [None]:
# Re-fit the same MLP object on the undersampled split and predict.
mlp_prediction = mlp.fit(us_train_in, us_train_classes).predict(us_test_in)

# Score the predictions against the held-out labels.
mlp_us_classification_report = classification_report(us_test_classes, mlp_prediction)
mlp_us_confusion_matrix = confusion_matrix(us_test_classes, mlp_prediction)

print(f"--- Undersampled dataset ---\nConfusion matrix:\n{mlp_us_confusion_matrix}\n\nClassification report:\n{mlp_us_classification_report}\n")

#### Oversampled dataset

In [None]:
# Re-fit the same MLP object on the oversampled split and predict.
mlp_prediction = mlp.fit(os_train_in, os_train_classes).predict(os_test_in)

# Score the predictions against the held-out labels.
mlp_os_classification_report = classification_report(os_test_classes, mlp_prediction)
mlp_os_confusion_matrix = confusion_matrix(os_test_classes, mlp_prediction)

print(f"--- Oversampled dataset ---\nConfusion matrix:\n{mlp_os_confusion_matrix}\n\nClassification report:\n{mlp_os_classification_report}\n")

### Parameter Tuning

#### Original dataset

In [None]:
# Hyper-parameter values to try for the MLP classifier.
parameter_grid = dict(
    hidden_layer_sizes=[10, 50, 100],
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam'],
    alpha=[0.0001, 0.001, 0.01],
)

# Grid search with cv=10 over every combination in the grid.
grid_search = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=parameter_grid,
    cv=10,
    verbose=3,
    n_jobs=-1,
)

# Tune on the original training split and report the winning configuration.
grid_search.fit(train_in, train_classes)
print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

In [None]:
# Evaluate the best estimator from the most recent grid-search fit on the
# original test split.
mlp = grid_search.best_estimator_
mlp_prediction = mlp.predict(test_in)


best_mlp_confusion_matrix = confusion_matrix(test_classes, mlp_prediction)
best_mlp_classification_report = classification_report(test_classes, mlp_prediction)


print(f"--- Improved original dataset ---\nConfusion matrix:\n{best_mlp_confusion_matrix}\n\nClassification report:\n{best_mlp_classification_report}\n") 

#### Undersampled dataset

In [None]:
# Re-run the existing grid search object on the undersampled training split.
grid_search.fit(us_train_in, us_train_classes)
print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

In [None]:
# Evaluate the best estimator from the most recent grid-search fit on the
# undersampled test split.
mlp = grid_search.best_estimator_
mlp_prediction = mlp.predict(us_test_in)


best_us_mlp_confusion_matrix = confusion_matrix(us_test_classes, mlp_prediction)
best_us_mlp_classification_report = classification_report(us_test_classes, mlp_prediction)


print(f"--- Improved undersampled dataset ---\nConfusion matrix:\n{best_us_mlp_confusion_matrix}\n\nClassification report:\n{best_us_mlp_classification_report}\n") 

#### Oversampled dataset

In [None]:
# Re-run the existing grid search object on the oversampled training split.
grid_search.fit(os_train_in, os_train_classes)
print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

In [None]:
# Evaluate the best estimator from the most recent grid-search fit on the
# oversampled test split.
mlp = grid_search.best_estimator_
mlp_prediction = mlp.predict(os_test_in)


best_os_mlp_confusion_matrix = confusion_matrix(os_test_classes, mlp_prediction)
best_os_mlp_classification_report = classification_report(os_test_classes, mlp_prediction)


print(f"--- Improved oversampled dataset ---\nConfusion matrix:\n{best_os_mlp_confusion_matrix}\n\nClassification report:\n{best_os_mlp_classification_report}\n") 