# IIQ3402 - Tarea 1 
This task is an Exploratory Data Analysis (EDA) of a dataset that records whether individuals could smell certain scents, and comorbidities such as fever, coughing, muscle pain, and more. Finally, it contains a column with the PCR test result for each individual.

In [None]:
import polars as pl
import numpy as np
import plotly.express as px
from sklearn import linear_model

In [None]:
# Map the raw CSV headers (numbered "N__" prefixes, underscores) to the
# readable column names used throughout the rest of the notebook.
readable_names = {
    "1__Cold": "Cold",
    "2__Cough": "Cough",
    "3__Fever": "Fever",
    "4__Muscular_pain": "Muscular Pain",
    "5__Breathing_difficulty": "Breathing difficulty",
    "6__Self_reported_anosmia": "Self-reported anosmia",
    "7__Self_reported_ageusia": "Self-reported ageusia",
    "COVID19_PCR": "COVID19 PCR Result",
}

# Read the dataset and apply the renaming in one step.
df = pl.read_csv("Enunciado_Tarea1/datos_tarea1.csv").rename(readable_names)
df

In [None]:
# Split the dataset by PCR outcome: rows with result 0 go to df_neg,
# rows with result 1 go to df_pos.
pcr_result = pl.col("COVID19 PCR Result")
df_neg = df.filter(pcr_result == 0)
df_pos = df.filter(pcr_result == 1)

# Group sizes; later cells use them as denominators for per-group percentages.
neg_pcr = df_neg.height
pos_pcr = df_pos.height

print(f"Total number of positive PCR: {pos_pcr}")
print(f"Total number of negative PCR: {neg_pcr}")

## Hypothesis 1

**Covid-positive individuals most frequently identify orange, pineapple and mint odors.**


In [None]:
scent_cols = ["Banana", "Caramel", "Mint", "Orange", "Pineapple", "Vanilla"]

# (group label, rows of that PCR group, group size). The order of this list
# fixes the row order within each scent: positive first, then negative.
scent_groups = [
    ("PCR Positive", df_pos, pos_pcr),
    ("PCR Negative", df_neg, neg_pcr),
]

# Long-format table with one row per (scent, PCR group) pair.
scent_data = {
    "Scent": [],
    "Group": [],
    "Count": [],
    "Percentage": [],
}

for scent in scent_cols:
    for group_label, group_df, group_size in scent_groups:
        # Rows of this group whose value for the scent column is 1. Computed
        # once and reused for both the absolute count and the percentage.
        detected = group_df.filter(pl.col(scent) == 1).shape[0]

        scent_data["Scent"].append(scent)
        scent_data["Group"].append(group_label)
        scent_data["Count"].append(detected)
        # Share of the group (in percent, one decimal) with a 1 for this scent.
        scent_data["Percentage"].append(round(100 * detected / group_size, 1))

scent_df = pl.from_dict(scent_data)
scent_df

In [None]:
# Grouped bar chart of the absolute counts in scent_df: one cluster per scent,
# one bar per PCR group, with the count printed on each bar.
scent_count_options = dict(
    x="Scent",
    y="Count",
    color="Group",
    barmode="group",
    text="Count",
    title="Scent Detection by PCR Result",
    labels={"Count": "Count of Patients"},
)
fig = px.bar(scent_df, **scent_count_options)
fig.show()

In [None]:
# Same grouped bar chart as for the counts, but using the per-group percentage
# so the two PCR groups can be compared despite their different sizes.
# The y axis is pinned to the full 0-100 % range.
fig = px.bar(
    data_frame=scent_df,
    title="Relative Scent Detection by PCR Result",
    x="Scent",
    y="Percentage",
    text="Percentage",
    color="Group",
    barmode="group",
    range_y=[0, 100],
    labels={"Percentage": "Percentage of Patients"},
)
fig.show()

Based on the plot, the hypothesis does not seem to be supported. 
Here, the relative scent detection is computed separately for each PCR group (positive and negative): the number of individuals in the group who could smell the scent, divided by the total number of individuals in that group.


## Hypothesis 2
**¿Son las personas con comorbilidades menos propensas a contraer COVID - 19?**

English: Are people with comorbidities less prone to contracting
COVID - 19?

This means that we have to check whether there is a relationship between having a symptom (e.g. fever) and the PCR test result.

In [None]:
symptoms = [
    "Cold",
    "Cough",
    "Fever",
    "Muscular Pain",
    "Breathing difficulty",
    "Self-reported anosmia",
    "Self-reported ageusia",
]

# (group label, rows of that PCR group, group size); positive is listed first
# so that each symptom yields its positive row followed by its negative row.
symptom_groups = (
    ("Positive PCR", df_pos, pos_pcr),
    ("Negative PCR", df_neg, neg_pcr),
)

# One record per (symptom, PCR group):
# (symptom, group label, number of rows in the group with symptom == 1, group size)
symptom_records = [
    (symptom, label, group_df.filter(pl.col(symptom) == 1).shape[0], group_size)
    for symptom in symptoms
    for label, group_df, group_size in symptom_groups
]

# Column-oriented view of the records, in the layout polars expects.
symptom_data = {
    "Symptom": [symptom for symptom, _, _, _ in symptom_records],
    "Group": [label for _, label, _, _ in symptom_records],
    "Count": [hits for _, _, hits, _ in symptom_records],
    # Share of the group (in percent, one decimal) reporting the symptom.
    "Percentage": [
        round(100 * hits / group_size, 1)
        for _, _, hits, group_size in symptom_records
    ],
}

symptom_df = pl.from_dict(symptom_data)
symptom_df

In [None]:
# Grouped bar chart of the absolute counts in symptom_df: one cluster per
# symptom, one bar per PCR group, with the count printed on each bar.
symptom_count_options = dict(
    x="Symptom",
    y="Count",
    color="Group",
    barmode="group",
    text="Count",
    title="Symptoms by PCR Result",
    labels={"Count": "Count of Patients"},
)
fig = px.bar(symptom_df, **symptom_count_options)
fig.show()

In [None]:
# Per-group percentage of patients with each symptom, drawn as grouped bars.
# The y axis is pinned to the full 0-100 % range.
fig = px.bar(
    data_frame=symptom_df,
    title="Relative Symptoms by PCR Result",
    x="Symptom",
    y="Percentage",
    text="Percentage",
    color="Group",
    barmode="group",
    range_y=[0, 100],
    labels={"Percentage": "Percentage of Patients"},
)
fig.show()

## Hypothesis 3
¿Existe una relación entre las dificultades respiratorias y la función
olfativa en individuos?

English: Is there a relationship between breathing difficulties and olfactory function in individuals?

### Thought process
Build an olfactory score by summing the number of olfactory tests that were positive. 
Then check whether there is a relationship between the olfactory score and breathing difficulties.

In [None]:
scents = ["Banana", "Caramel", "Mint", "Orange", "Pineapple", "Vanilla"]

# Unweighted olfactory score: row-wise sum of the six scent columns.
olfactory_score = pl.sum_horizontal([pl.col(name) for name in scents]).alias(
    "Olfactory Score"
)
df = df.with_columns(olfactory_score)

# Keep only the columns needed to relate the score to breathing difficulty.
olfactory_df = df.select([*scents, "Olfactory Score", "Breathing difficulty"])
olfactory_df.columns

In [None]:
# Readable label for the box-plot axis: "Yes" where "Breathing difficulty" is 1;
# every other row falls through to "No".
breathing_label = (
    pl.when(pl.col("Breathing difficulty") == 1)
    .then(pl.lit("Yes"))
    .otherwise(pl.lit("No"))
    .alias("Difficulties breathing")
)
olfactory_df = olfactory_df.with_columns(breathing_label)

# Distribution of the olfactory score for each breathing-difficulty label.
fig = px.box(
    data_frame=olfactory_df,
    title="Olfactory Score by Breathing Difficulty",
    x="Difficulties breathing",
    y="Olfactory Score",
    color="Difficulties breathing",
)
fig.show()

In [None]:
# Score values to tabulate: 0 through 6 inclusive.
olfactory_scores = list(range(7))

# (group label, value of the "Breathing difficulty" column selecting the group);
# the order fixes the row order within each score.
breathing_groups = (
    ("Difficulties Breathing", 1),
    ("No Difficulties Breathing", 0),
)

# Long-format table: one row per (score, breathing group) with the number of
# individuals in df that have exactly that score and that flag value.
olfactory_data = {"Olfactory Score": [], "Group": [], "Count": []}

for score in olfactory_scores:
    for group_label, flag in breathing_groups:
        matching = df.filter(
            (pl.col("Breathing difficulty") == flag)
            & (pl.col("Olfactory Score") == score)
        )
        olfactory_data["Olfactory Score"].append(score)
        olfactory_data["Group"].append(group_label)
        olfactory_data["Count"].append(matching.shape[0])

# NOTE: this rebinds olfactory_df, replacing the per-individual frame built in
# the earlier cells with the aggregated count table.
olfactory_df = pl.from_dict(olfactory_data)
olfactory_df


In [None]:
# Grouped bar chart of how many individuals reached each olfactory score,
# one bar per breathing-difficulty group, with the count printed on each bar.
olfactory_bar_options = dict(
    x="Olfactory Score",
    y="Count",
    color="Group",
    barmode="group",
    text="Count",
    title="Olfactory Score by Breathing Difficulty",
    labels={"Count": "Count of Patients"},
)
fig = px.bar(olfactory_df, **olfactory_bar_options)
fig.show()

# Part 4
## Weighted Olfactory Score

By assigning a weight to each olfactory test, we can create a continuous variable that can be used as an indicator of COVID-19 in patients. 
The score is defined as follows:

$$
y = \sum_{i=1}^{6} w_{i} x_{i}
$$

where
- $y$ is the olfactory score
- $x_{i}$ is the result of the olfactory test $i$ (1 if positive, 0 if negative)
- $w_{i}$ is the weight of the olfactory test $i$.

The weights can be found in Table 1 in [Enunciado Tarea 1](Enunciado_Tarea1/Enunciado%20Tarea%201.pdf).

In [None]:
# Weight of each scent in the weighted olfactory score, keyed by the scent's
# column name. The six weights add up to 0.99999.
# NOTE: the variable name is misspelled ("olfcatory"); it is kept as-is because
# the cell that computes the weighted score looks it up under this name.
olfcatory_weights = dict(
    Banana=0.09332,
    Caramel=0.09332,
    Mint=0.34668,
    Orange=0.09334,
    Pineapple=0.18667,
    Vanilla=0.18666,
)

In [None]:
# Weighted score: each scent column multiplied by its weight, summed row-wise.
weighted_terms = [pl.col(scent) * olfcatory_weights[scent] for scent in scents]
weighted_score = pl.sum_horizontal(weighted_terms).alias("Weighted Olfactory Score")

# Text label of the PCR outcome, used as the grouping variable in plots:
# "PCR Positive" where the result is 1, "PCR Negative" for every other row.
pcr_group = (
    pl.when(pl.col("COVID19 PCR Result") == 1)
    .then(pl.lit("PCR Positive"))
    .otherwise(pl.lit("PCR Negative"))
    .alias("Group")
)

# The two expressions are independent, so they are added in a single call;
# the resulting column order (score first, then group) is unchanged.
df = df.with_columns([weighted_score, pcr_group])
df

In [None]:
# Distribution of the weighted olfactory score, one box per PCR group.
weighted_box_options = dict(
    x="Group",
    y="Weighted Olfactory Score",
    color="Group",
    title="Weighted Olfactory Score by PCR Result",
)
fig = px.box(df, **weighted_box_options)
fig.show()

Based on these results we want to see if there is a relationship between the symptoms and the olfactory score, not just based on the positive/negative results of the PCR.

## Logistic Regression
Using logistic regression on the weighted olfactory score

NOTE: Due to lack of time, a visual analysis of the data was not performed.

In [None]:
# Fit a single-feature logistic regression:
# weighted olfactory score -> PCR result.
feature_col = "Weighted Olfactory Score"
target_col = "COVID19 PCR Result"

# Selecting a single column keeps X two-dimensional (one column);
# the target is flattened to a one-dimensional vector.
X = df.select(feature_col).to_numpy()
y = df.select(target_col).to_numpy().ravel()

# Default LogisticRegression settings; the fitted estimator is the cell output.
model = linear_model.LogisticRegression()
model.fit(X, y)



In [None]:
# Evaluate the fitted model on 1000 evenly spaced scores spanning the observed
# range of the feature, shaped as a single-column matrix for predict_proba.
x_vals = np.linspace(X.min(), X.max(), 1000).reshape(-1, 1)
# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (presumably PCR == 1, given 0/1 labels - verify if labels change).
y_pred = model.predict_proba(x_vals)[:, 1]

# Predicted probability as a function of the weighted olfactory score.
fig = px.scatter(
    x=x_vals.flatten(),
    y=y_pred,
    title="Logistic Regression Model",
    labels={"x": "Weighted Olfactory Score", "y": "Probability of PCR Positive"},
)

# Overlay the observed PCR results. A small vertical jitter is added so that
# overlapping points can be told apart; the generator is seeded so the figure
# is identical on every run of the notebook.
rng = np.random.default_rng(0)
y_jittered = y + rng.normal(0, 0.03, size=y.shape)
fig.add_scatter(
    x=X.flatten(),
    y=y_jittered,
    mode="markers",
    name="PCR Results",
    marker=dict(color="gray", opacity=0.4),
)
fig.show()