# Covid Symptom Identification

Project developed by:

- Ana Teresa Cruz (up201806460)
- André Nascimento (up201806461)
- António Bezerra (up201806854)


In [None]:
import pandas as pd
import numpy as np

# Read the working dataset from a CSV file located in the current directory.
dataset = pd.read_csv('large_data.csv')

# Preview the first rows (displayed as the cell's output).
dataset.head()

In [None]:
# Display pandas' summary statistics for `dataset`.
dataset.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb

# Correlate the feature columns only. `TYPE` holds the string class label
# (e.g. 'ALLERGY', 'COLD'); pandas >= 2.0 raises on non-numeric columns in
# DataFrame.corr() instead of silently dropping them, so it is excluded
# explicitly here.
dataset_corr = dataset.drop(columns='TYPE').corr()

# Large canvas so the per-cell annotations of the heatmap stay readable.
plt.figure(figsize=(30,30))
sb.heatmap(dataset_corr, annot=True)

### All data hist

In [None]:
# Bin edges: a narrow bin around 0, a wide middle bin and a narrow bin around 1
# (presumably the feature columns are 0/1 flags — confirm against the CSV).
_bin_edges = [-0.05, 0.05, 0.95, 1.05]
dataset.hist(bins=_bin_edges, range=(0, 1), figsize=(22, 16))

### Allergy data hist

In [None]:
# Histograms restricted to the rows labelled ALLERGY.
_is_allergy = dataset['TYPE'] == 'ALLERGY'
allergy_data = dataset.loc[_is_allergy]
allergy_data.hist(
    bins=[-0.05, 0.05, 0.95, 1.05],
    range=(0, 1),
    figsize=(22, 16),
)

### Cold data hist

In [None]:
# Histograms restricted to the rows labelled COLD.
_is_cold = dataset['TYPE'] == 'COLD'
cold_data = dataset.loc[_is_cold]
cold_data.hist(
    bins=[-0.05, 0.05, 0.95, 1.05],
    range=(0, 1),
    figsize=(22, 16),
)

### Covid data hist

In [None]:
# Histograms restricted to the rows labelled COVID.
_is_covid = dataset['TYPE'] == 'COVID'
covid_data = dataset.loc[_is_covid]
covid_data.hist(
    bins=[-0.05, 0.05, 0.95, 1.05],
    range=(0, 1),
    figsize=(22, 16),
)

### Flu data hist

In [None]:
# Histograms restricted to the rows labelled FLU.
_is_flu = dataset['TYPE'] == 'FLU'
flu_data = dataset.loc[_is_flu]
flu_data.hist(
    bins=[-0.05, 0.05, 0.95, 1.05],
    range=(0, 1),
    figsize=(22, 16),
)

## Data preprocessing

Since our dataset did not contain null or invalid values or significant outliers, we did not have much preprocessing work to do. The only manipulation we did was split the dataset into input and label sets so that they could be passed to the SciKit classifiers we use.

In [None]:
# Store the class label column as a pandas categorical.
dataset['TYPE'] = dataset['TYPE'].astype('category')

# Every column other than the label is treated as an input feature.
col_names = [column for column in dataset.columns if column != 'TYPE']

# Split the frame into the label vector and the feature matrix.
labels = dataset['TYPE'].values
inputs = dataset[col_names].values

#### Train and Test split data

We used scikit-learn's built-in train_test_split function to generate the train and test datasets. We defined the test set as 1/4 of the entire dataset, leaving the remaining 3/4 for training. We use the stratify option in order to maintain the original dataset's class distribution in both sets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test set. `stratify=labels` keeps the class
# proportions of the full dataset in both parts; the fixed seed makes the
# split repeatable.
_split = train_test_split(
    inputs,
    labels,
    test_size=0.25,
    stratify=labels,
    random_state=1,
)
train_in, test_in, train_classes, test_classes = _split

### Resampling

Our data analysis stage showed that our working dataset was heavily imbalanced. (TODO: insert percentages)

Early exploratory analysis of classification methods proved that this was having a negative effect on the accuracy of the classifier, especially for the minority classes. To solve this problem we implemented resampling techniques that would generate a more balanced training set.

We implemented both **undersampling** and **oversampling**. These techniques differ in that undersampling removes samples from majority categories, while oversampling adds samples to minority categories (either by duplicating existing samples or, as SMOTE does, by synthesizing new ones). Oversampling is generally preferred, but undersampling generates smaller and therefore less complex datasets.

We used random undersampling (RandomUnderSampler) and SMOTE oversampling.


In [None]:
from collections import Counter

# Print the per-class sample counts of the train and test label sets.
for _heading, _classes in (
    ("---Train Set---", train_classes),
    ("\n---Test Set---", test_classes),
):
    print(_heading)
    print(Counter(_classes))

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Random undersampling of the training split only (the test split is left
# untouched). The sampler is seeded with the same value used for
# train_test_split so the resampled set, and every result derived from it,
# is reproducible between runs.
rus = RandomUnderSampler(random_state=1)

us_inputs, us_labels = rus.fit_resample(train_in, train_classes)

# Class distribution after undersampling.
print(Counter(us_labels))

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE oversampling of the training split only (the test split is left
# untouched). Seeded with the same value used for train_test_split so the
# synthetic samples, and every result derived from them, are reproducible
# between runs.
ros = SMOTE(random_state=1)

os_inputs, os_labels = ros.fit_resample(train_in, train_classes)

# Class distribution after oversampling.
print(Counter(os_labels))

## Decision Tree Classifier

#### Original dataset

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyper-parameters, trained on the
# unmodified training split and scored on the test split.
dtc = DecisionTreeClassifier()
dtc_prediction = dtc.fit(train_in, train_classes).predict(test_in)

# Dict form of the report, kept for later use.
dtc_classification_report = classification_report(
    test_classes,
    dtc_prediction,
    output_dict=True,
)

print("--- Original dataset ---\n")
_cm = confusion_matrix(test_classes, dtc_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, dtc_prediction)
print(f"Classification report:\n{_report}\n")

#### Undersampled dataset

In [None]:
# Refit the same tree on the undersampled training set and score it on the
# test split.
dtc_prediction = dtc.fit(us_inputs, us_labels).predict(test_in)

# Dict form of the report, kept for later use.
dtc_us_classification_report = classification_report(
    test_classes,
    dtc_prediction,
    output_dict=True,
)

print("--- Undersampled dataset ---\n")
_cm = confusion_matrix(test_classes, dtc_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, dtc_prediction)
print(f"Classification report:\n{_report}\n")

#### Oversampled dataset

In [None]:
# Refit the same tree on the oversampled training set and score it on the
# test split.
dtc_prediction = dtc.fit(os_inputs, os_labels).predict(test_in)

# Dict form of the report, kept for later use.
dtc_os_classification_report = classification_report(
    test_classes,
    dtc_prediction,
    output_dict=True,
)

print("--- Oversampled dataset ---\n")
_cm = confusion_matrix(test_classes, dtc_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, dtc_prediction)
print(f"Classification report:\n{_report}\n")

### Parameter Tuning (GridSearch)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates for the decision tree.
# NOTE(review): the `max_features` values are absolute feature counts, so this
# presumably assumes the dataset has at least 17 feature columns — confirm.
parameter_grid = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[11, 13, 15, 17],
    max_features=[14, 15, 16, 17],
)

# Exhaustive search over the grid with 10-fold cross-validation, using all
# available cores.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=parameter_grid,
    n_jobs=-1,
    verbose=4,
    cv=10,
)

#### Original dataset

In [None]:
# Run the decision-tree grid search on the unmodified training split and
# report the winning configuration.
grid_search.fit(train_in, train_classes)
print(
    f"Best score: {grid_search.best_score_}",
    f"Best parameters: {grid_search.best_params_}",
    f"Best estimator: {grid_search.best_estimator_}",
    sep="\n",
)

In [None]:
# Take the estimator selected by the most recent `grid_search.fit` call and
# score it on the test split.
dtc = grid_search.best_estimator_
dtc_prediction = dtc.predict(test_in)

# Dict form of the report, kept for later use.
best_dtc_classification_report = classification_report(
    test_classes,
    dtc_prediction,
    output_dict=True,
)

print("--- Improved original dataset ---\n")
_cm = confusion_matrix(test_classes, dtc_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, dtc_prediction)
print(f"Classification report:\n{_report}\n")

#### Undersampled dataset

In [None]:
# Run the decision-tree grid search on the undersampled training set and
# report the winning configuration.
grid_search.fit(us_inputs, us_labels)
print(
    f"Best score: {grid_search.best_score_}",
    f"Best parameters: {grid_search.best_params_}",
    f"Best estimator: {grid_search.best_estimator_}",
    sep="\n",
)

In [None]:
# Take the estimator selected by the most recent `grid_search.fit` call and
# score it on the test split.
dtc = grid_search.best_estimator_
dtc_prediction = dtc.predict(test_in)

# Dict form of the report, kept for later use.
best_us_dtc_classification_report = classification_report(
    test_classes,
    dtc_prediction,
    output_dict=True,
)

print("--- Improved undersampled dataset ---\n")
_cm = confusion_matrix(test_classes, dtc_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, dtc_prediction)
print(f"Classification report:\n{_report}\n")

#### Oversampled dataset

In [None]:
# Run the decision-tree grid search on the oversampled training set and
# report the winning configuration.
# NOTE(review): `os_inputs` was resampled before cross-validation, so the
# validation folds contain SMOTE-generated points and `best_score_` may be
# optimistic — confirm whether resampling should happen inside each fold.
grid_search.fit(os_inputs, os_labels)
print(
    f"Best score: {grid_search.best_score_}",
    f"Best parameters: {grid_search.best_params_}",
    f"Best estimator: {grid_search.best_estimator_}",
    sep="\n",
)

In [None]:
# Take the estimator selected by the most recent `grid_search.fit` call and
# score it on the test split.
dtc = grid_search.best_estimator_
dtc_prediction = dtc.predict(test_in)

# Dict form of the report, kept for later use.
best_os_dtc_classification_report = classification_report(
    test_classes,
    dtc_prediction,
    output_dict=True,
)

print("--- Improved oversampled dataset ---\n")
_cm = confusion_matrix(test_classes, dtc_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, dtc_prediction)
print(f"Classification report:\n{_report}\n")

## K-Nearest Neighbors

#### Original dataset

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours classifier with default hyper-parameters,
# trained on the unmodified training split and scored on the test split.
knn = KNeighborsClassifier()
knn_prediction = knn.fit(train_in, train_classes).predict(test_in)

# Dict form of the report, kept for later use.
knn_classification_report = classification_report(
    test_classes,
    knn_prediction,
    output_dict=True,
)

print("--- Original dataset ---\n")
_cm = confusion_matrix(test_classes, knn_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, knn_prediction)
print(f"Classification report:\n{_report}\n")

#### Undersampled dataset

In [None]:
# Refit the same KNN model on the undersampled training set and score it on
# the test split.
knn_prediction = knn.fit(us_inputs, us_labels).predict(test_in)

# Dict form of the report, kept for later use.
knn_us_classification_report = classification_report(
    test_classes,
    knn_prediction,
    output_dict=True,
)

print("--- Undersampled dataset ---\n")
_cm = confusion_matrix(test_classes, knn_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, knn_prediction)
print(f"Classification report:\n{_report}\n")

#### Oversampled dataset

In [None]:
# Refit the same KNN model on the oversampled training set and score it on
# the test split.
knn_prediction = knn.fit(os_inputs, os_labels).predict(test_in)

# Dict form of the report, kept for later use.
knn_os_classification_report = classification_report(
    test_classes,
    knn_prediction,
    output_dict=True,
)

print("--- Oversampled dataset ---\n")
_cm = confusion_matrix(test_classes, knn_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, knn_prediction)
print(f"Classification report:\n{_report}\n")

### Parameter Tuning

In [None]:
# Hyper-parameter candidates for the KNN classifier.
parameter_grid = dict(
    n_neighbors=[5, 8, 12, 15],
    weights=['uniform', 'distance'],
    algorithm=['ball_tree', 'kd_tree', 'brute'],
)

# Exhaustive search over the grid with 10-fold cross-validation, using all
# available cores.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=parameter_grid,
    n_jobs=-1,
    verbose=4,
    cv=10,
)

#### Original dataset

In [None]:
# Run the KNN grid search on the unmodified training split and report the
# winning configuration.
grid_search.fit(train_in, train_classes)
print(
    f"Best score: {grid_search.best_score_}",
    f"Best parameters: {grid_search.best_params_}",
    f"Best estimator: {grid_search.best_estimator_}",
    sep="\n",
)

In [None]:
# Take the estimator selected by the most recent `grid_search.fit` call and
# score it on the test split.
knn = grid_search.best_estimator_
knn_prediction = knn.predict(test_in)

# Dict form of the report, kept for later use.
best_knn_classification_report = classification_report(
    test_classes,
    knn_prediction,
    output_dict=True,
)

print("--- Improved original dataset ---\n")
_cm = confusion_matrix(test_classes, knn_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, knn_prediction)
print(f"Classification report:\n{_report}\n")

#### Undersampled dataset

In [None]:
# Run the KNN grid search on the undersampled training set and report the
# winning configuration.
grid_search.fit(us_inputs, us_labels)
print(
    f"Best score: {grid_search.best_score_}",
    f"Best parameters: {grid_search.best_params_}",
    f"Best estimator: {grid_search.best_estimator_}",
    sep="\n",
)

In [None]:
# Take the estimator selected by the most recent `grid_search.fit` call and
# score it on the test split.
knn = grid_search.best_estimator_
knn_prediction = knn.predict(test_in)

# Dict form of the report, kept for later use.
best_us_knn_classification_report = classification_report(
    test_classes,
    knn_prediction,
    output_dict=True,
)

print("--- Improved undersampled dataset ---\n")
_cm = confusion_matrix(test_classes, knn_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, knn_prediction)
print(f"Classification report:\n{_report}\n")

#### Oversampled dataset

In [None]:
# Rebuild the KNN search with 5 folds instead of the 10 used for the other
# datasets.
# NOTE(review): presumably reduced to keep the runtime manageable on the
# larger oversampled set — confirm; scores are not directly comparable with
# the 10-fold runs.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=parameter_grid,
    n_jobs=-1,
    verbose=4,
    cv=5,
)

# NOTE(review): `os_inputs` was resampled before cross-validation, so the
# validation folds contain SMOTE-generated points and `best_score_` may be
# optimistic — confirm whether resampling should happen inside each fold.
grid_search.fit(os_inputs, os_labels)
print(
    f"Best score: {grid_search.best_score_}",
    f"Best parameters: {grid_search.best_params_}",
    f"Best estimator: {grid_search.best_estimator_}",
    sep="\n",
)

In [None]:
# Take the estimator selected by the most recent `grid_search.fit` call and
# score it on the test split.
knn = grid_search.best_estimator_
knn_prediction = knn.predict(test_in)

# Dict form of the report, kept for later use.
best_os_knn_classification_report = classification_report(
    test_classes,
    knn_prediction,
    output_dict=True,
)

print("--- Improved oversampled dataset ---\n")
_cm = confusion_matrix(test_classes, knn_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, knn_prediction)
print(f"Classification report:\n{_report}\n")

## Support-vector machines

#### Original dataset

In [None]:
from sklearn.svm import SVC

# Baseline support-vector classifier with default hyper-parameters, trained
# on the unmodified training split and scored on the test split.
svc = SVC()
svc_prediction = svc.fit(train_in, train_classes).predict(test_in)

# Dict form of the report, kept for later use.
svc_classification_report = classification_report(
    test_classes,
    svc_prediction,
    output_dict=True,
)

print("--- Original dataset ---\n")
_cm = confusion_matrix(test_classes, svc_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, svc_prediction)
print(f"Classification report:\n{_report}\n")

#### Undersampled dataset

In [None]:
# Refit the same SVC on the undersampled training set and score it on the
# test split.
svc_prediction = svc.fit(us_inputs, us_labels).predict(test_in)

# Dict form of the report, kept for later use.
svc_us_classification_report = classification_report(
    test_classes,
    svc_prediction,
    output_dict=True,
)

print("--- Undersampled dataset ---\n")
_cm = confusion_matrix(test_classes, svc_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, svc_prediction)
print(f"Classification report:\n{_report}\n")

#### Oversampled dataset

In [None]:
# Refit the same SVC on the oversampled training set and score it on the
# test split.
svc_prediction = svc.fit(os_inputs, os_labels).predict(test_in)

# Dict form of the report, kept for later use.
svc_os_classification_report = classification_report(
    test_classes,
    svc_prediction,
    output_dict=True,
)

print("--- Oversampled dataset ---\n")
_cm = confusion_matrix(test_classes, svc_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, svc_prediction)
print(f"Classification report:\n{_report}\n")

### Parameter Tuning

In [None]:
# Hyper-parameter candidates for the support-vector classifier.
parameter_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto'],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)

# Exhaustive search over the grid with 10-fold cross-validation, using all
# available cores.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=parameter_grid,
    n_jobs=-1,
    verbose=4,
    cv=10,
)

#### Original dataset

In [None]:
# Run the SVC grid search on the unmodified training split and report the
# winning configuration.
grid_search.fit(train_in, train_classes)
print(
    f"Best score: {grid_search.best_score_}",
    f"Best parameters: {grid_search.best_params_}",
    f"Best estimator: {grid_search.best_estimator_}",
    sep="\n",
)

In [None]:
# Take the estimator selected by the most recent `grid_search.fit` call and
# score it on the test split.
svc = grid_search.best_estimator_
svc_prediction = svc.predict(test_in)

# Dict form of the report, kept for later use.
best_svc_classification_report = classification_report(
    test_classes,
    svc_prediction,
    output_dict=True,
)

print("--- Improved original dataset ---\n")
_cm = confusion_matrix(test_classes, svc_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, svc_prediction)
print(f"Classification report:\n{_report}\n")

#### Undersampled dataset

In [None]:
# Run the SVC grid search on the undersampled training set and report the
# winning configuration.
grid_search.fit(us_inputs, us_labels)
print(
    f"Best score: {grid_search.best_score_}",
    f"Best parameters: {grid_search.best_params_}",
    f"Best estimator: {grid_search.best_estimator_}",
    sep="\n",
)

In [None]:
# Take the estimator selected by the most recent `grid_search.fit` call and
# score it on the test split.
svc = grid_search.best_estimator_
svc_prediction = svc.predict(test_in)

# Dict form of the report, kept for later use.
best_us_svc_classification_report = classification_report(
    test_classes,
    svc_prediction,
    output_dict=True,
)

print("--- Improved undersampled dataset ---\n")
_cm = confusion_matrix(test_classes, svc_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, svc_prediction)
print(f"Classification report:\n{_report}\n")

#### Oversampled dataset

In [None]:
# Run the SVC grid search on the oversampled training set and report the
# winning configuration.
# NOTE(review): `os_inputs` was resampled before cross-validation, so the
# validation folds contain SMOTE-generated points and `best_score_` may be
# optimistic — confirm whether resampling should happen inside each fold.
grid_search.fit(os_inputs, os_labels)
print(
    f"Best score: {grid_search.best_score_}",
    f"Best parameters: {grid_search.best_params_}",
    f"Best estimator: {grid_search.best_estimator_}",
    sep="\n",
)

In [None]:
# Take the estimator selected by the most recent `grid_search.fit` call and
# score it on the test split.
svc = grid_search.best_estimator_
svc_prediction = svc.predict(test_in)

# Dict form of the report, kept for later use.
best_os_svc_classification_report = classification_report(
    test_classes,
    svc_prediction,
    output_dict=True,
)

print("--- Improved oversampled dataset ---\n")
_cm = confusion_matrix(test_classes, svc_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, svc_prediction)
print(f"Classification report:\n{_report}\n")

## Neural Networks

#### Original dataset

In [None]:
from sklearn.neural_network import MLPClassifier

# Baseline multi-layer perceptron with default hyper-parameters, trained on
# the unmodified training split and scored on the test split.
mlp = MLPClassifier()
mlp_prediction = mlp.fit(train_in, train_classes).predict(test_in)

# Dict form of the report, kept for later use.
mlp_classification_report = classification_report(
    test_classes,
    mlp_prediction,
    output_dict=True,
)

print("--- Original dataset ---\n")
_cm = confusion_matrix(test_classes, mlp_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, mlp_prediction)
print(f"Classification report:\n{_report}\n")

#### Undersampled dataset

In [None]:
# Refit the same MLP on the undersampled training set and score it on the
# test split.
mlp_prediction = mlp.fit(us_inputs, us_labels).predict(test_in)

# Dict form of the report, kept for later use.
mlp_us_classification_report = classification_report(
    test_classes,
    mlp_prediction,
    output_dict=True,
)

print("--- Undersampled dataset ---\n")
_cm = confusion_matrix(test_classes, mlp_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, mlp_prediction)
print(f"Classification report:\n{_report}\n")

#### Oversampled dataset

In [None]:
# Refit the same MLP on the oversampled training set and score it on the
# test split.
mlp_prediction = mlp.fit(os_inputs, os_labels).predict(test_in)

# Dict form of the report, kept for later use.
mlp_os_classification_report = classification_report(
    test_classes,
    mlp_prediction,
    output_dict=True,
)

print("--- Oversampled dataset ---\n")
_cm = confusion_matrix(test_classes, mlp_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, mlp_prediction)
print(f"Classification report:\n{_report}\n")


### Parameter Tuning

In [None]:
# Hyper-parameter candidates for the MLP. Each `hidden_layer_sizes` entry is
# a single integer.
parameter_grid = dict(
    hidden_layer_sizes=[50, 100, 150],
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam'],
    alpha=[0.0001, 0.001, 0.01],
)

# Exhaustive search over the grid with 10-fold cross-validation, using all
# available cores.
grid_search = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=parameter_grid,
    n_jobs=-1,
    verbose=4,
    cv=10,
)

#### Original dataset

In [None]:
# Run the MLP grid search on the unmodified training split and report the
# winning configuration.
grid_search.fit(train_in, train_classes)
print(
    f"Best score: {grid_search.best_score_}",
    f"Best parameters: {grid_search.best_params_}",
    f"Best estimator: {grid_search.best_estimator_}",
    sep="\n",
)

In [None]:
# Take the estimator selected by the most recent `grid_search.fit` call and
# score it on the test split.
mlp = grid_search.best_estimator_
mlp_prediction = mlp.predict(test_in)

# Dict form of the report, kept for later use.
best_mlp_classification_report = classification_report(
    test_classes,
    mlp_prediction,
    output_dict=True,
)

print("--- Improved original dataset ---\n")
_cm = confusion_matrix(test_classes, mlp_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, mlp_prediction)
print(f"Classification report:\n{_report}\n")

#### Undersampled dataset

In [None]:
# Run the MLP grid search on the undersampled training set and report the
# winning configuration.
grid_search.fit(us_inputs, us_labels)
print(
    f"Best score: {grid_search.best_score_}",
    f"Best parameters: {grid_search.best_params_}",
    f"Best estimator: {grid_search.best_estimator_}",
    sep="\n",
)

In [None]:
# Take the estimator selected by the most recent `grid_search.fit` call and
# score it on the test split.
mlp = grid_search.best_estimator_
mlp_prediction = mlp.predict(test_in)

# Dict form of the report, kept for later use.
best_us_mlp_classification_report = classification_report(
    test_classes,
    mlp_prediction,
    output_dict=True,
)

print("--- Improved undersampled dataset ---\n")
_cm = confusion_matrix(test_classes, mlp_prediction)
print(f"Confusion matrix:\n{_cm}\n")
_report = classification_report(test_classes, mlp_prediction)
print(f"Classification report:\n{_report}\n")

#### Oversampled dataset

In [None]:
# Run the MLP grid search on the oversampled training set and report the
# winning configuration.
# NOTE(review): `os_inputs` was resampled before cross-validation, so the
# validation folds contain SMOTE-generated points and `best_score_` may be
# optimistic — confirm whether resampling should happen inside each fold.
grid_search.fit(os_inputs, os_labels)
print(
    f"Best score: {grid_search.best_score_}",
    f"Best parameters: {grid_search.best_params_}",
    f"Best estimator: {grid_search.best_estimator_}",
    sep="\n",
)

In [None]:
# Take the estimator selected by the most recent `grid_search.fit` call and
# score it on the test split.
mlp = grid_search.best_estimator_
mlp_prediction = mlp.predict(test_in)

# Dict form of the report, kept for later use.
best_os_mlp_classification_report = classification_report(test_classes, mlp_prediction, output_dict=True)
# IPython `%store` magic: persists the variable so it can be restored in
# another notebook session.
# NOTE(review): this is the only report in the notebook that is stored —
# confirm whether the other reports should be stored as well.
%store best_os_mlp_classification_report

# Human-readable evaluation of the tuned model on the test split.
print("--- Improved oversampled dataset ---\n")
print(f"Confusion matrix:\n{confusion_matrix(test_classes, mlp_prediction)}\n")
print(f"Classification report:\n{classification_report(test_classes, mlp_prediction)}\n") 