In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px

In [2]:
# Relative path of the cleaned blood-type table.
file_path = "../Data/cleaned_blood_type_distribution_by_country.csv"

# Read the table into the working frame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(5)

Unnamed: 0,Country/Dependency,Population,O+,A+,B+,AB+,O-,A-,B-,AB-
0,Albania,3074579,34.10%,31.20%,14.50%,5.20%,6.00%,5.50%,2.60%,0.90%
1,Algeria,43576691,40.00%,30.00%,15.00%,4.25%,6.60%,2.30%,1.10%,0.75%
2,Argentina,45479118,50.34%,31.09%,8.20%,2.16%,4.29%,2.98%,0.74%,0.20%
3,Armenia,3021324,29.00%,46.30%,12.00%,5.60%,2.00%,3.70%,1.00%,0.40%
4,Australia,25466459,38.00%,32.00%,12.00%,4.00%,7.00%,6.00%,2.00%,1.00%


In [3]:
# Trim stray whitespace from every header and swap inner spaces for
# underscores, then give the country column a shorter name.
df = (
    df
    .rename(columns=lambda header: header.strip().replace(' ', '_'))
    .rename(columns={'Country/Dependency': 'Country'})
)
df.head(5)

Unnamed: 0,Country,Population,O+,A+,B+,AB+,O-,A-,B-,AB-
0,Albania,3074579,34.10%,31.20%,14.50%,5.20%,6.00%,5.50%,2.60%,0.90%
1,Algeria,43576691,40.00%,30.00%,15.00%,4.25%,6.60%,2.30%,1.10%,0.75%
2,Argentina,45479118,50.34%,31.09%,8.20%,2.16%,4.29%,2.98%,0.74%,0.20%
3,Armenia,3021324,29.00%,46.30%,12.00%,5.60%,2.00%,3.70%,1.00%,0.40%
4,Australia,25466459,38.00%,32.00%,12.00%,4.00%,7.00%,6.00%,2.00%,1.00%


In [4]:
# Convert Population to float, dropping any thousands separators first.
# The column is cast to text before the replace so the cell can be re-run:
# the .str accessor raises AttributeError on a column that is already numeric.
population_text = df['Population'].astype(str).str.replace(',', '', regex=False)
df['Population'] = pd.to_numeric(population_text).astype(float)
df.head()

Unnamed: 0,Country,Population,O+,A+,B+,AB+,O-,A-,B-,AB-
0,Albania,3074579.0,34.10%,31.20%,14.50%,5.20%,6.00%,5.50%,2.60%,0.90%
1,Algeria,43576691.0,40.00%,30.00%,15.00%,4.25%,6.60%,2.30%,1.10%,0.75%
2,Argentina,45479118.0,50.34%,31.09%,8.20%,2.16%,4.29%,2.98%,0.74%,0.20%
3,Armenia,3021324.0,29.00%,46.30%,12.00%,5.60%,2.00%,3.70%,1.00%,0.40%
4,Australia,25466459.0,38.00%,32.00%,12.00%,4.00%,7.00%,6.00%,2.00%,1.00%


In [5]:
# Blood-group columns holding percentage strings such as "34.10%".
blood_groups = ['A+', 'A-', 'B+', 'B-', 'AB+', 'AB-', 'O+', 'O-']

for col in blood_groups:
    # Cast to text before stripping '%' so the cell can be re-run: the .str
    # accessor raises AttributeError once a column has been converted to float.
    percent_text = df[col].astype(str).str.replace('%', '', regex=False)
    # Missing values come back out as NaN; anything non-numeric still raises.
    df[col] = pd.to_numeric(percent_text).astype(float)

df.head()

Unnamed: 0,Country,Population,O+,A+,B+,AB+,O-,A-,B-,AB-
0,Albania,3074579.0,34.1,31.2,14.5,5.2,6.0,5.5,2.6,0.9
1,Algeria,43576691.0,40.0,30.0,15.0,4.25,6.6,2.3,1.1,0.75
2,Argentina,45479118.0,50.34,31.09,8.2,2.16,4.29,2.98,0.74,0.2
3,Armenia,3021324.0,29.0,46.3,12.0,5.6,2.0,3.7,1.0,0.4
4,Australia,25466459.0,38.0,32.0,12.0,4.0,7.0,6.0,2.0,1.0


In [6]:
# Count the missing values in each column after the numeric conversion.
print(df.isna().sum())

Country       0
Population    0
O+            0
A+            0
B+            0
AB+           1
O-            1
A-            1
B-            1
AB-           1
dtype: int64


In [7]:
# Replace every missing value with 0.
# NOTE(review): this records the missing blood-group shares as 0%, which is
# presumably where the 0.0 minimums in the later describe() output come from;
# confirm that zero-filling is intended rather than leaving the gaps as NaN.
df = df.fillna(0)

In [8]:
# Persist the processed table; the row index is not written to the file.
df.to_csv(path_or_buf="../Data/processed_blood_type_data.csv", index=False)

# Data Analysis

In [9]:
# Reload the processed table from disk so the analysis starts from the saved file.
df = pd.read_csv(filepath_or_buffer="../Data/processed_blood_type_data.csv")
df.head(5)

Unnamed: 0,Country,Population,O+,A+,B+,AB+,O-,A-,B-,AB-
0,Albania,3074579.0,34.1,31.2,14.5,5.2,6.0,5.5,2.6,0.9
1,Algeria,43576691.0,40.0,30.0,15.0,4.25,6.6,2.3,1.1,0.75
2,Argentina,45479118.0,50.34,31.09,8.2,2.16,4.29,2.98,0.74,0.2
3,Armenia,3021324.0,29.0,46.3,12.0,5.6,2.0,3.7,1.0,0.4
4,Australia,25466459.0,38.0,32.0,12.0,4.0,7.0,6.0,2.0,1.0


In [10]:
# Summary statistics (count, mean, std, quartiles, extremes) for the numeric columns.
summary_stats = df.describe()
summary_stats

Unnamed: 0,Population,O+,A+,B+,AB+,O-,A-,B-,AB-
count,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0
mean,119161100.0,40.315,29.242937,16.177857,4.575873,4.26127,3.365,1.351667,0.45754
std,709549700.0,9.903148,6.525914,7.56548,2.297524,2.998154,2.556899,0.865503,0.378155
min,39137.0,27.0,14.0,4.72,0.0,0.0,0.0,0.0,0.0
25%,5495125.0,32.655,24.3625,9.7,2.51,1.85,1.0,0.6025,0.1
50%,12311040.0,38.0,30.0,15.0,4.2,4.295,2.76,1.3,0.4
75%,45287370.0,46.775,34.0,21.045,6.21,6.0,6.0,2.0,0.875
max,7772851000.0,75.0,46.3,36.8,11.32,14.1,8.1,3.13,1.2


In [11]:
# Confirm that the reloaded table has no missing values left.
print(df.isna().sum())

Country       0
Population    0
O+            0
A+            0
B+            0
AB+           0
O-            0
A-            0
B-            0
AB-           0
dtype: int64


# Visualization

In [12]:
blood_type_columns = ['A+', 'A-', 'B+', 'B-', 'AB+', 'AB-', 'O+', 'O-']

# NOTE(review): describe() reports a maximum Population of ~7.77e9, larger than
# any single country, so the table appears to contain a worldwide aggregate row
# (presumably labelled "World" - confirm). It is left out here so that nobody is
# counted twice; the filter is a no-op if no such row exists.
countries = df[df['Country'].str.strip().str.lower() != 'world']

# A global distribution has to weight each country by its population; a plain
# mean of the per-country percentages would let a country of 3 million count
# as much as one of over a billion.
weighted_totals = countries[blood_type_columns].mul(countries['Population'], axis=0).sum()
avg_blood_distribution = weighted_totals / countries['Population'].sum()

fig = px.pie(
    values=avg_blood_distribution.values,
    names=avg_blood_distribution.index,
    title='Global Blood Type Distribution'
)
fig.show()

In [20]:
# NOTE(review): describe() reports a maximum Population of ~7.77e9, larger than
# any single country, so the table appears to contain a worldwide aggregate row
# (presumably labelled "World" - confirm). It is dropped so it cannot take the
# first slot of a "Top 20 Countries" chart; the filter is a no-op if absent.
countries = df[df['Country'].str.strip().str.lower() != 'world']
df_sorted = countries.sort_values(by='Population', ascending=False)

# Turn each percentage share into an estimated head-count for that country.
df_sorted[blood_groups] = (
    df_sorted[blood_groups].div(100).mul(df_sorted['Population'], axis=0)
)

fig = px.bar(
    df_sorted.head(20),  # only the 20 most populous countries
    x="Country",
    y=["O+", "A+", "B+", "AB+", "O-", "A-", "B-", "AB-"],
    title="Top 20 Countries - Blood Group Distribution (By Population)",
    # The y values are people, not percentages, so replace the default
    # "value" / "variable" captions that wide-form input would produce.
    labels={"value": "Estimated people", "variable": "Blood group"},
    barmode="group",
    opacity=0.8
)

fig.update_layout(
    xaxis_tickangle=-45,
    height=700,
    width=1200,
    yaxis_type="log",  # head-counts span several orders of magnitude
    margin=dict(l=20, r=20, t=40, b=20),
    bargap=0.35
)
fig.update_traces(marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=0.6)

fig.show()


In [24]:
# World map shaded by each country's O+ share; the remaining blood groups
# are shown in the hover tooltip.
hover_columns = ["A+", "B+", "AB+", "O-", "A-", "B-", "AB-"]

# Map appearance: no frame, no coastlines, flat equirectangular projection.
geo_settings = {
    "showframe": False,
    "showcoastlines": False,
    "projection_type": 'equirectangular',
}

fig = px.choropleth(
    df,
    locations="Country",
    locationmode="country names",  # rows are matched to the map by country name
    color="O+",
    hover_data=hover_columns,
    title="Blood Type O+ Distribution Across the World",
)
fig.update_layout(geo=geo_settings, margin={"l": 0, "r": 0, "t": 50, "b": 0})

fig.show()