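This notebook extracts and visualizes the population statistics of the study participants (age, ethnicity, gender, sexual orientation, and years since diagnosis), first for the whole population and then per cluster.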

- **Author**: Benkirane Ismail
- **Email**: [ibenkirane@mgb.org](mailto:ibenkirane@mgb.org)
- **Version**: 1.0.0
- **Date**: 2023-10-19

# Imports

In [1]:
import sys
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt

# Add the parent directory to the import search path; presumably this is where
# the project-local `utils` module lives (relative to the working directory).
sys.path.append('../')

from utils import UTILITIES

# Shared helper instance; used below to load the population statistics.
utilities = UTILITIES()

# Get the Data

In [None]:
# NOTE(review): absolute, machine-specific Windows path — the notebook only runs
# where this exact drive layout exists; consider a configurable data directory.
file_path = r"F:\MONSCE Data Edited\RedCap Data\NonVerbalMeasurement_DATA_LABELS_2023-11-01_1238.csv"
# `data` is indexed by subject ID in the cells below; each entry is read through
# the keys 'Age', 'Ethnicity', 'Gender', 'Orientation' and 'Date of Diagnosis'.
data = utilities.get_population_statistics(file_path)

# Statistical Analysis

In [None]:
# Subject IDs left out of every whole-population statistic.
subjects_to_not_consider = [1002, 1005, 1009, 1011, 1016, 1021, 1023, 1029, 1034, 1036]

# Records of all subjects that are not excluded, in the iteration order of `data`.
included_records = [
    record
    for subject_id, record in data.items()
    if int(subject_id) not in subjects_to_not_consider
]

# One flat list per demographic variable, aligned by subject.
age_list = [record['Age'] for record in included_records]
ethnicity_list = [record['Ethnicity'] for record in included_records]
gender_list = [record['Gender'] for record in included_records]
orientation_list = [record['Orientation'] for record in included_records]

# 2021 minus the stored 'Date of Diagnosis' value (presumably a year — confirm).
# NOTE(review): the reference year 2021 is hard-coded; confirm it matches the
# date the study data was collected.
time_since_positive_list = [2021 - record['Date of Diagnosis'] for record in included_records]

# Express Plotting

In [None]:
# ---- Figure 1: histograms of the two numeric variables ----
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
fig.subplots_adjust(hspace=0.6, wspace=0.4)

# (values, panel title, x-axis label) for each histogram panel, left to right.
histogram_specs = [
    (age_list, 'Age Distribution', 'Age'),
    (time_since_positive_list, 'Years since initial HIV diagnosis Distribution', 'Years'),
]
for ax, (values, title, x_label) in zip(axes, histogram_specs):
    sns.histplot(values, kde=False, color='r', bins='auto', ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

plt.show()

# ---- Figure 2: bar charts of the three categorical variables ----
fig, axes = plt.subplots(1, 3, figsize=(12, 5))
fig.subplots_adjust(hspace=0.6, wspace=0.4)

# Occurrences of each category value.
orientation_counts = Counter(orientation_list)
ethnicity_counts = Counter(ethnicity_list)
gender_counts = Counter(gender_list)

# Two-column (category, frequency) frames consumed by seaborn.
df_orientation = pd.DataFrame(list(orientation_counts.items()), columns=['Sexual Orientation', 'Frequency'])
df_ethnicity = pd.DataFrame(list(ethnicity_counts.items()), columns=['Ethnicity', 'Frequency'])
df_gender = pd.DataFrame(list(gender_counts.items()), columns=['Gender', 'Frequency'])

# (frame, category column) for each bar panel, left to right.
bar_specs = [
    (df_orientation, 'Sexual Orientation'),
    (df_ethnicity, 'Ethnicity'),
    (df_gender, 'Gender'),
]
for ax, (frequency_df, category) in zip(axes, bar_specs):
    sns.barplot(x=category, y='Frequency', data=frequency_df, color='r', ax=ax)
    ax.set_title(f'{category} Distribution')
    ax.set_xlabel('')
    ax.set_ylabel('Frequency')
    # Tilt the category names so long labels do not overlap.
    for label in ax.get_xticklabels():
        label.set_rotation(45)

plt.show()

# Detailed Plotting

## Age

In [None]:
# Descriptive statistics of the participants' ages.
age_summary = {
    'Mean': np.mean(age_list),
    'Median': np.median(age_list),
    'Std': np.std(age_list),  # population standard deviation (numpy default ddof=0)
    'Mode': max(set(age_list), key=age_list.count),  # most frequent value
    'Range': max(age_list) - min(age_list),
}
for statistic, value in age_summary.items():
    print(f'{statistic}:', value)

# Histogram of the ages; bin edges are chosen automatically.
ax = sns.histplot(age_list, kde=False, color='red', bins='auto')
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')

plt.show()

## Time Elapsed Since Positive Diagnosis

In [None]:
# Descriptive statistics of the time elapsed since the positive diagnosis.
time_summary = {
    'Mean': np.mean(time_since_positive_list),
    'Median': np.median(time_since_positive_list),
    'Std': np.std(time_since_positive_list),  # population standard deviation (numpy default ddof=0)
    'Mode': max(set(time_since_positive_list), key=time_since_positive_list.count),  # most frequent value
    'Range': max(time_since_positive_list) - min(time_since_positive_list),
}
for statistic, value in time_summary.items():
    print(f'{statistic}:', value)

# Histogram of the elapsed time; bin edges are chosen automatically.
ax = sns.histplot(time_since_positive_list, kde=False, color='red', bins='auto')
ax.set_title('Distribution')
ax.set_xlabel('Time elapsed since positive diagnosis')
ax.set_ylabel('Frequency')

plt.show()

## Gender

In [None]:
# Number of participants per gender category.
gender_counts = Counter(gender_list)

print(gender_counts)

# Two-column (category, frequency) frame consumed by seaborn.
df = pd.DataFrame(list(gender_counts.items()), columns=['Gender', 'Frequency'])

fig, ax = plt.subplots(figsize=(4, 5))
# Use `color` instead of a one-entry `palette`: passing `palette` without `hue`
# is deprecated in recent seaborn releases, and a single-colour palette has to
# be cycled over the categories. `color` paints every bar red without warnings.
sns.barplot(x='Gender', y='Frequency', data=df, color='red', ax=ax)

ax.set_title('Gender Distribution')
ax.set_xlabel('Gender')
ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## Ethnicity

In [None]:
# Number of participants per ethnicity category.
ethnicity_counts = Counter(ethnicity_list)

print(ethnicity_counts)

# Two-column (category, frequency) frame consumed by seaborn.
df = pd.DataFrame(list(ethnicity_counts.items()), columns=['Ethnicity', 'Frequency'])

fig, ax = plt.subplots(figsize=(4, 5))
# Use `color` instead of a one-entry `palette`: passing `palette` without `hue`
# is deprecated in recent seaborn releases, and a single-colour palette has to
# be cycled over the categories. `color` paints every bar red without warnings.
sns.barplot(x='Ethnicity', y='Frequency', data=df, color='red', ax=ax)

ax.set_title('Ethnicity Distribution')
ax.set_xlabel('Ethnicity')
ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()


## Sexual Orientation

In [None]:
# Number of participants per sexual-orientation category.
orientation_counts = Counter(orientation_list)

print(orientation_counts)

# Two-column (category, frequency) frame consumed by seaborn.
df = pd.DataFrame(list(orientation_counts.items()), columns=['Sexual Orientation', 'Frequency'])

fig, ax = plt.subplots(figsize=(6, 8))
# Use `color` instead of a one-entry `palette`: passing `palette` without `hue`
# is deprecated in recent seaborn releases, and a single-colour palette has to
# be cycled over the categories. `color` paints every bar red without warnings.
sns.barplot(x='Sexual Orientation', y='Frequency', data=df, color='red', ax=ax)

ax.set_title('Sexual Orientation Distribution')
ax.set_xlabel('Sexual Orientation')
ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()


# Comparison across Clusters

In [None]:
# Subject IDs belonging to each cluster, keyed by the cluster's display name.
clusters = {
    'Cluster 1': [1003, 1007, 1013, 1015, 1020, 1024, 1026],
    'Cluster 2': [1001, 1031, 1032, 1037, 1039],
    'Cluster 3': [1008, 1017, 1022, 1025, 1033, 1040, 1041, 1042],
}

In [None]:
# NOTE(review): the *_list names are rebound here from flat lists (used by the
# whole-population cells above) to dicts keyed by cluster label. Re-running an
# earlier cell after this one will fail or misbehave.
age_list = {}
ethnicity_list = {}
gender_list = {}
orientation_list = {}
time_since_positive_list = {}

# Subject IDs left out of every statistic.
subjects_to_not_consider = [1002, 1005, 1009, 1011, 1016, 1021, 1023, 1029, 1034, 1036]

cluster_groups = [clusters[name] for name in ('Cluster 1', 'Cluster 2', 'Cluster 3')]

for idx, group in enumerate(cluster_groups):
    # NOTE(review): labels are 0-based ('Cluster0'..'Cluster2') while the keys of
    # `clusters` are 1-based; the plotting cells below rely on these exact labels.
    cluster_key = f'Cluster{idx}'

    # Records of the non-excluded subjects that belong to the current cluster.
    members = [
        data[subject_id]
        for subject_id in data.keys()
        if int(subject_id) not in subjects_to_not_consider and int(subject_id) in group
    ]

    age_list[cluster_key] = [record['Age'] for record in members]
    ethnicity_list[cluster_key] = [record['Ethnicity'] for record in members]
    gender_list[cluster_key] = [record['Gender'] for record in members]
    orientation_list[cluster_key] = [record['Orientation'] for record in members]
    # 2021 minus the stored 'Date of Diagnosis' value (presumably a year — confirm).
    time_since_positive_list[cluster_key] = [2021 - record['Date of Diagnosis'] for record in members]

## Age

In [None]:
# Flatten the per-cluster ages into (age, cluster label) rows.
age_data = [(age, group) for group, ages in age_list.items() for age in ages]

df_age = pd.DataFrame(age_data, columns=['Age', 'Cluster'])

# Ordered categorical so the clusters appear in a fixed order on the x-axis.
cluster_order = ['Cluster0', 'Cluster1', 'Cluster2']
df_age['Cluster'] = pd.Categorical(df_age['Cluster'], categories=cluster_order, ordered=True)

plt.figure(figsize=(12, 8))
# Box plot for the per-cluster summary, with the individual participants overlaid.
ax = sns.boxplot(x='Cluster', y='Age', data=df_age, color='lightgray')
sns.stripplot(x='Cluster', y='Age', data=df_age, color='red', jitter=True, size=5, ax=ax)
ax.set_title('Participant Ages by Cluster')
ax.set_xlabel('')
ax.set_ylabel('Age')

plt.show()

## Time Since Positive Diagnosis

In [None]:
# Flatten the per-cluster values into (elapsed time, cluster label) rows.
time_since_positive_data = [
    (elapsed, group)
    for group, times in time_since_positive_list.items()
    for elapsed in times
]

df_time_since_positive = pd.DataFrame(time_since_positive_data, columns=['TimeSincePositive', 'Cluster'])

# Ordered categorical so the clusters appear in a fixed order on the x-axis.
cluster_order = ['Cluster0', 'Cluster1', 'Cluster2']
df_time_since_positive['Cluster'] = pd.Categorical(df_time_since_positive['Cluster'], categories=cluster_order, ordered=True)

plt.figure(figsize=(12, 8))
# Box plot for the per-cluster summary, with the individual participants overlaid.
ax = sns.boxplot(x='Cluster', y='TimeSincePositive', data=df_time_since_positive, color='lightgray')
sns.stripplot(x='Cluster', y='TimeSincePositive', data=df_time_since_positive, color='red', jitter=True, size=5, ax=ax)

ax.set_title('Years Elapsed Since Positive by Cluster')
ax.set_xlabel('')
ax.set_ylabel('Years Since Diagnosis')

plt.show()

## Gender

In [None]:
# Flatten the per-cluster genders into (gender, cluster label) rows.
gender_data = [(gender, group) for group, genders in gender_list.items() for gender in genders]

df_gender = pd.DataFrame(gender_data, columns=['Gender', 'Cluster'])

# Ordered categorical so the clusters appear in a fixed order on the x-axis.
cluster_order = ['Cluster0', 'Cluster1', 'Cluster2']
df_gender['Cluster'] = pd.Categorical(df_gender['Cluster'], categories=cluster_order, ordered=True)

plt.figure(figsize=(12, 8))
# One group of bars per cluster, one bar per gender value.
ax = sns.countplot(x='Cluster', hue='Gender', data=df_gender, palette=['red', 'grey'])
ax.set_title('Gender Distribution by Cluster')
ax.set_xlabel('')
ax.set_ylabel('Count')
ax.legend(title='Gender')

plt.show()

## Sexual Orientation

In [None]:
# Flatten the per-cluster orientations into (orientation, cluster label) rows.
orientation_data = [
    (orientation, group)
    for group, orientations in orientation_list.items()
    for orientation in orientations
]

df_orientation = pd.DataFrame(orientation_data, columns=['Orientation', 'Cluster'])

# Ordered categorical so the clusters appear in a fixed order on the x-axis.
cluster_order = ['Cluster0', 'Cluster1', 'Cluster2']
df_orientation['Cluster'] = pd.Categorical(df_orientation['Cluster'], categories=cluster_order, ordered=True)

plt.figure(figsize=(12, 8))
# One group of bars per cluster, one bar per orientation value.
ax = sns.countplot(x='Cluster', hue='Orientation', data=df_orientation, palette=['red', 'grey', 'pink'])
ax.set_title('Sexual Orientation Distribution by Cluster')
ax.set_xlabel('')
ax.set_ylabel('Count')

ax.legend(title='Orientation')

plt.show()


## Ethnicity

In [None]:
# Flatten the per-cluster ethnicities into (ethnicity, cluster label) rows.
ethnicity_data = [
    (ethnicity, group)
    for group, ethnicities in ethnicity_list.items()
    for ethnicity in ethnicities
]

df_ethnicity = pd.DataFrame(ethnicity_data, columns=['Ethnicity', 'Cluster'])

# Ordered categorical so the clusters appear in a fixed order on the x-axis.
cluster_order = ['Cluster0', 'Cluster1', 'Cluster2']
df_ethnicity['Cluster'] = pd.Categorical(df_ethnicity['Cluster'], categories=cluster_order, ordered=True)

plt.figure(figsize=(12, 8))
# One group of bars per cluster, one bar per ethnicity value.
ax = sns.countplot(x='Cluster', hue='Ethnicity', data=df_ethnicity, palette=['grey', 'red'])
ax.set_title('Ethnicity Distribution by Cluster')
ax.set_xlabel('')
ax.set_ylabel('Count')

ax.legend(title='Ethnicity')

plt.show()