## Data Overview

In [2]:
import json
import pandas as pd
import numpy as np
import os

# Locate the scraped-profiles JSON relative to the parent of the working directory
base_dir = os.path.abspath("..")
file_path = os.path.join(base_dir, "data", "scraped_data", "general_profiles_data.json")

# Read the raw JSON object; its items are iterated below as (username, profile) pairs
with open(file_path, "r", encoding="utf-8") as fh:
    data = json.load(fh)

# Flatten each profile into a single record.
# Missing demographic fields fall back to the "N/A" sentinel, missing tags to ["N/A"].
user_profiles = [
    {
        "username": username,
        **{
            field: profile["demographics"].get(field, "N/A")
            for field in ("gender", "country", "age", "ethnicity")
        },
        "tags": profile.get("tags", ["N/A"]),
    }
    for username, profile in data.items()
]

df = pd.DataFrame(user_profiles)
df.head()

Unnamed: 0,username,gender,country,age,ethnicity,tags
0,Gimme,Female,United Kingdom,,,[N/A]
1,Flyerswife,,,,,[N/A]
2,Coachtrip,,United Kingdom,,,"[Lung disease, Oxygen Therapy, Respiratory fai..."
3,Wegeners88,Woman,United Kingdom,,White,"[Vasculitis, Prednisolone, Overactive thyroid ..."
4,Tumorboy20,Man,United States,54.0,White,"[Meningioma, Neurosurgery]"


## General Statistics

In [4]:
# 1. Gender distribution: frequency of every reported label
gender_counts = df["gender"].value_counts()

# 2. Demographics: describe() summary of the four demographic columns
demographic_stats = df[["gender", "country", "age", "ethnicity"]].describe(include="all")

# 3. Unique tags: flatten every user's tag list, skipping the 'N/A' placeholder
all_tags = []
for tag_list in df["tags"]:
    all_tags.extend(tag for tag in tag_list if tag != 'N/A')
unique_tags = set(all_tags)
num_unique_tags = len(unique_tags)

# 4. Users without any tags: tag list is exactly the ['N/A'] placeholder
users_without_tags = list(filter(lambda tags: tags == ['N/A'], df["tags"]))
num_users_without_tags = len(users_without_tags)

# 5. Average / median number of tags per user (placeholder list counts as zero)
df["num_tags"] = df["tags"].apply(lambda tags: 0 if tags == ['N/A'] else len(tags))
avg_tags_per_user = df["num_tags"].mean()
median_tags_per_user = df["num_tags"].median()

# --------------------
# Summary table
# --------------------
results = {
    "Gender Distribution": gender_counts.to_dict(),
    "Total Users": len(df),
    "Demographics Overview": demographic_stats.to_dict(),
    "Total Unique Tags": num_unique_tags,
    "Users without Tags": num_users_without_tags,
    "Average Tags per User": avg_tags_per_user,
    "Median Tags per User": median_tags_per_user,
}

# Display settings: full cell content, all columns, no line wrapping, all rows.
# These are global pandas options and stay in effect for the rest of the session.
for option_name, option_value in {
    "display.max_colwidth": None,
    "display.max_columns": None,
    "display.expand_frame_repr": False,
    "display.max_rows": None,
}.items():
    pd.set_option(option_name, option_value)

# One row per metric; the "Value" column holds dicts and scalars side by side
df_results = pd.DataFrame(list(results.items()), columns=["Metric", "Value"])

df_results

Unnamed: 0,Metric,Value
0,Gender Distribution,"{'N/A': 4011, 'Female': 3934, 'Male': 1745, 'Woman': 1210, 'Man': 554, 'Other': 31, 'Prefer not to say': 23, 'Non-binary': 7, 'My pronouns are she/her. Beyond that, I'm just me': 1, 'She/They/He': 1, 'She/they': 1}"
1,Total Users,11518
2,Demographics Overview,"{'gender': {'count': 11518, 'unique': 11, 'top': 'N/A', 'freq': 4011}, 'country': {'count': 11518, 'unique': 98, 'top': 'United Kingdom', 'freq': 4273}, 'age': {'count': 11518, 'unique': 76, 'top': 'N/A', 'freq': 7477}, 'ethnicity': {'count': 11518, 'unique': 18, 'top': 'N/A', 'freq': 5930}}"
3,Total Unique Tags,2140
4,Users without Tags,2964
5,Average Tags per User,3.424379
6,Median Tags per User,2.0


## Demographics

In [5]:
# Pull the nested describe() dictionary back out of the summary table and
# spread it over a single row, one column per demographic field
is_demographics_row = df_results["Metric"] == "Demographics Overview"
demographics_overview = df_results.loc[is_demographics_row, "Value"].iloc[0]
demographics_df = pd.DataFrame([demographics_overview])
demographics_df

Unnamed: 0,gender,country,age,ethnicity
0,"{'count': 11518, 'unique': 11, 'top': 'N/A', 'freq': 4011}","{'count': 11518, 'unique': 98, 'top': 'United Kingdom', 'freq': 4273}","{'count': 11518, 'unique': 76, 'top': 'N/A', 'freq': 7477}","{'count': 11518, 'unique': 18, 'top': 'N/A', 'freq': 5930}"


### Countries

In [12]:
# Frequency of every country label
country_counts = df["country"].value_counts()

# Share of all users per country, in percent
country_percentage = country_counts.div(len(df)).mul(100)

# Keep only countries that account for more than 5% of users
countries_above_5_percent = country_percentage.loc[country_percentage > 5]

# Summary table (note: country_percentage is rebound from a Series to this DataFrame)
major_countries = countries_above_5_percent.index
country_percentage = pd.DataFrame({
    "Country": major_countries,
    "Count": country_counts.loc[major_countries].values,
    "Percentage": countries_above_5_percent.values,
})

print(country_percentage)

          Country  Count  Percentage
0  United Kingdom   4273   37.098455
1             N/A   4050   35.162355
2   United States   2203   19.126584


### Age

In [13]:
# Age bands. Because pd.cut is called with right=False below, every band is
# left-closed / right-open: "18-30" covers ages from 18 up to, but not including, 30.
age_labels = ["Under 18", "18-30", "30-40", "40-50", "50-60", "60-70", "Over 70"]
age_bins = [0, 18, 30, 40, 50, 60, 70, float("inf")]

# Coerce ages to numbers; anything unparseable (such as "N/A") becomes NaN
df["age"] = pd.to_numeric(df["age"], errors="coerce")

# Assign every user to an age band
df["age_group"] = pd.cut(df["age"], bins=age_bins, labels=age_labels, right=False)

# Tally each band (keeping the NaN bucket) and express it as a share of all users
age_group_counts = df["age_group"].value_counts(dropna=False)
age_group_percentage = age_group_counts.div(len(df)).mul(100)

# Missing ages as an absolute number and as a percentage of all users
na_count = df["age"].isna().sum()
na_percentage = 100 * (na_count / len(df))

age_stats_df = pd.DataFrame({
    "Age Group": age_group_counts.index.astype(str),
    "Count": age_group_counts.values,
    "Percentage": age_group_percentage.values,
})

print(age_stats_df)

  Age Group  Count  Percentage
0       nan   7477   64.915784
1   Over 70    878    7.622851
2     60-70    866    7.518666
3     50-60    799    6.936968
4     30-40    586    5.087689
5     40-50    516    4.479944
6     18-30    396    3.438097
7  Under 18      0    0.000000


### Gender

In [14]:
# GENDER DISTRIBUTION (original labels)

# Frequency of each label as reported, and its share of all users in percent
gender_counts = df["gender"].value_counts()
gender_percentage = gender_counts.div(len(df)).mul(100)

gender_stats_df = pd.DataFrame({
    "Gender": gender_counts.index,
    "Count": gender_counts.values,
    "Percentage": gender_percentage.values,
})

print(gender_stats_df)

                                                Gender  Count  Percentage
0                                                  N/A   4011   34.823754
1                                               Female   3934   34.155235
2                                                 Male   1745   15.150200
3                                                Woman   1210   10.505296
4                                                  Man    554    4.809863
5                                                Other     31    0.269144
6                                    Prefer not to say     23    0.199687
7                                           Non-binary      7    0.060774
8   My pronouns are she/her.  Beyond that, I'm just me      1    0.008682
9                                          She/They/He      1    0.008682
10                                            She/they      1    0.008682


In [15]:
# GENDER DISTRIBUTION (4 Groups)

# Collapse the raw labels into four groups: Female, Male, N/A and Others.
final_gender_mapping = {
    "N/A": "N/A",
    "Prefer not to say": "N/A",
    "Female": "Female",
    "Woman": "Female",
    "Male": "Male",
    "Man": "Male",
    # The raw label in the data is "Other"; list it explicitly instead of
    # relying on the fallback below to catch it.
    "Other": "Others",
    # Keeps this cell idempotent: on a re-run, already-collapsed values map to themselves.
    "Others": "Others",
    "Non-binary": "Others",
    "My pronouns are she/her.  Beyond that, I'm just me": "Others",
    "She/They/He": "Others",
    "She/they": "Others",
}

# Apply the mapping; any label not listed above falls back to "Others".
# NOTE: this overwrites df["gender"] in place, so the raw labels are no longer
# available afterwards and cells that read df["gender"] will see the collapsed groups.
df["gender"] = df["gender"].apply(lambda x: final_gender_mapping.get(x, "Others"))

# Recalculate the distribution over the collapsed groups
final_gender_counts = df["gender"].value_counts()
final_gender_percentage = (final_gender_counts / len(df)) * 100

final_gender_stats_df = pd.DataFrame({
    "Gender": final_gender_counts.index,
    "Count": final_gender_counts.values,
    "Percentage": final_gender_percentage.values
})

print(final_gender_stats_df)

   Gender  Count  Percentage
0  Female   5144   44.660531
1     N/A   4034   35.023442
2    Male   2299   19.960063
3  Others     41    0.355965


### Ethnicity

In [19]:
# Frequency of each ethnicity label
ethnicity_counts = df["ethnicity"].value_counts()

# Summary table with the count and percentage share of every label
ethnicity_percentage = pd.DataFrame({
    "Ethnicity": ethnicity_counts.index,
    "Count": ethnicity_counts.values,
    "Percentage": ethnicity_counts.div(len(df)).mul(100).values,
})

print(ethnicity_percentage)

                                      Ethnicity  Count  Percentage
0                                           N/A   5930   51.484633
1                                         White   3894   33.807953
2                             White / Caucasian    899    7.805175
3                                   South Asian    149    1.293627
4                             Prefer not to say    137    1.189443
5                Mixed / Multiple ethnic groups    114    0.989755
6                   Hispanic / Latino / Spanish     70    0.607744
7                      Black / African American     69    0.599062
8                            Other ethnic group     49    0.425421
9                   Black / African / Caribbean     43    0.373329
10                                   East Asian     43    0.373329
11                            Latino / Hispanic     36    0.312554
12  Black / African / Caribbean / Black British     35    0.303872
13                               Middle Eastern     22    0.19

In [17]:
# Define ethnicity mapping to major groups
ethnicity_mapping = {
    "N/A": "N/A",
    "Prefer not to say": "N/A",
    
    "Latino": "Latino",
    "Hispanic / Latino / Spanish": "Latino",
    "Latino": "Latino / Hispanic",
    
    "Black": "Black",
    "Black": "Black",
    "Black": "Black",
    "Black": "Black",
    
    
    "White": "White",
    
    "Asian": "Asian",
    
    "Other ethnic group" : "Other ethnic group"
}

# Apply the mapping, defaulting to the original value if not found
df["ethnicity_mapped"] = df["ethnicity"].apply(lambda x: ethnicity_mapping.get(x, x))

# Recalculate ethnicity distribution
ethnicity_counts = df["ethnicity_mapped"].value_counts()
ethnicity_percentage = (ethnicity_counts / len(df)) * 100

# Identify groups above 5% significance
significant_ethnicities = ethnicity_percentage[ethnicity_percentage > 5].index.tolist()

# Assign all other groups to "Others"
df["ethnicity_final"] = df["ethnicity_mapped"].apply(lambda x: x if x in significant_ethnicities else "Others")

# Recalculate final ethnicity distribution
final_ethnicity_counts = df["ethnicity_final"].value_counts()
final_ethnicity_percentage = (final_ethnicity_counts / len(df)) * 100

final_ethnicity_stats_df = pd.DataFrame({
    "Ethnicity": final_ethnicity_counts.index,
    "Count": final_ethnicity_counts.values,
    "Percentage": final_ethnicity_percentage.values
})

# Display results
print(final_ethnicity_stats_df)


           Ethnicity  Count  Percentage
0                N/A   6067   52.674075
1              White   3894   33.807953
2  White / Caucasian    899    7.805175
3             Others    658    5.712797


## Tags

In [23]:
# Unique tags
all_tags = []
for tag_list in df["tags"]:
    for tag in tag_list:
        if tag != 'N/A':
            all_tags.append(tag)
unique_tags = set(all_tags)
num_unique_tags = len(unique_tags)

In [52]:
# Users without any tags
users_without_tags = [tags for tags in df["tags"] if tags == ['N/A']]
num_users_without_tags = len(users_without_tags)

165


In [None]:
# Average / median number of tags per user
# The per-user count is stored in a "num_tags" column; the ['N/A'] placeholder counts as zero
df["num_tags"] = df["tags"].apply(lambda tag_list: 0 if tag_list == ['N/A'] else len(tag_list))

median_tags_per_user = df["num_tags"].median()
avg_tags_per_user = df["num_tags"].mean()