In [30]:
import pandas as pd
import statistics
from statistics import mode
from scipy.stats import pearsonr, spearmanr
import numpy as np
from sklearn.preprocessing import StandardScaler



STATISTICS

In [5]:

# Load happiness data
happiness_df = pd.read_csv("Countries' Happiness Data 2017.csv")
happiness_scores = happiness_df["Happiness.Score"].dropna().tolist()

# Load literacy data
with open("API_SE.ADT.LITR.ZS_DS2_en_csv_v2_19396.csv", "r", encoding="utf-8") as f:
    lines_lit = f.readlines()

for i, line in enumerate(lines_lit):
    if "2017" in line:
        header = line.strip().split(",")
        lit_index = [j for j, col in enumerate(header) if col.strip('"') == "2017"][0]
        data_start_lit = i + 1
        break

literacy_rates = []
for line in lines_lit[data_start_lit:]:
    parts = line.strip().split(",")
    try:
        value = float(parts[lit_index].strip().strip('"'))
        literacy_rates.append(value)
    except:
        continue

# Load enrollment data
with open("API_SE.TER.ENRR_DS2_en_csv_v2_23897.csv", "r", encoding="utf-8") as f:
    lines_enr = f.readlines()

enrollment_rates = []
for line in lines_enr[5:]:
    parts = line.strip().split(",")
    try:
        value = float(parts[61].strip().strip('"'))
        enrollment_rates.append(value)
    except:
        continue

# Function to calculate and display stats
def describe_data(data, label):
    print(f"\n{label}")
    print("-" * len(label))
    print(f"Count              : {len(data)}")
    print(f"Mean               : {statistics.mean(data):.2f}")
    print(f"Median             : {statistics.median(data):.2f}")
    try:
        print(f"Mode               : {statistics.mode(data):.2f}")
    except statistics.StatisticsError:
        print("Mode               : No unique mode")
    print(f"Variance           : {statistics.variance(data):.2f}")
    print(f"Standard Deviation : {statistics.stdev(data):.2f}")

# Output stats for all three datasets
describe_data(enrollment_rates, "Tertiary Enrollment Rate % 0-100")
describe_data(literacy_rates, "Adult Literacy Rate % 0-100")
describe_data(happiness_scores, "Happiness Score % 0-10")



Tertiary Enrollment Rate % 0-100
--------------------------------
Count              : 193
Mean               : 43.26
Median             : 42.36
Mode               : 23.09
Variance           : 781.66
Standard Deviation : 27.96

Adult Literacy Rate % 0-100
---------------------------
Count              : 266
Mean               : 79.10
Median             : 79.79
Mode               : 66.34
Variance           : 126.08
Standard Deviation : 11.23

Happiness Score 0-10
--------------------
Count              : 155
Mean               : 5.35
Median             : 5.28
Mode               : 7.28
Variance           : 1.28
Standard Deviation : 1.13


In [8]:
forest_df = pd.read_csv("Forest_Area_2017_Countries.csv")

forest_data = forest_df["2017"].dropna()

count = forest_data.count()
mean = forest_data.mean()
median = forest_data.median()
try:
    mode_val = mode(forest_data)
except:
    mode_val = "No unique mode"
variance = forest_data.var()
std_dev = forest_data.std()

print("Forest Area Ratio %0-100")
print(" ")
print(f"Count : {count}")
print(f"Mean : {mean:.2f}")
print(f"Median : {median:.2f}")
print(f"Mode : {mode_val}")
print(f"Variance : {variance:.2f}")
print(f"Standard Deviation : {std_dev:.2f}")

Forest Area Ratio %0-100
 
Count : 222
Mean : 32.59
Median : 31.18
Mode : 0.0
Variance : 559.32
Standard Deviation : 23.65


In [10]:
df_pm25 = pd.read_excel("PM2.5_2017_Countries.xlsx")

print("Air Pollution \n")
pm25_stats = {
    'Count': int(df_pm25['PM2.5_2017'].count()),
    'Mean': round(float(df_pm25['PM2.5_2017'].mean()), 2),
    'Median': round(float(df_pm25['PM2.5_2017'].median()), 2),
    'Mode': round(float(df_pm25['PM2.5_2017'].mode()[0]), 2),
    'Variance': round(float(df_pm25['PM2.5_2017'].var()), 2),
    'Standard Deviation': round(float(df_pm25['PM2.5_2017'].std()), 2)
}

for k, v in pm25_stats.items():
    print(f"{k} : {v}")

Air Pollution 

Count : 248
Mean : 27.06
Median : 21.86
Mode : 40.78
Variance : 300.03
Standard Deviation : 17.32


In [14]:
df = pd.read_csv("coastlines.csv")
df = df.dropna(subset=["coast_to_area_wf"])
df["coast_ratio"] = df["coast_to_area_wf"]

data = df["coast_ratio"]

print("Coastline-to-Area Ratio % 0–100")
print(f"\nCount : {data.count()}")
print(f"Mean : {data.mean():.2f}")
print(f"Median : {data.median():.2f}")
print(f"Mode : {data.mode().iloc[0]:.2f}")
print(f"Variance : {data.var():.2f}")
print(f"Standard Deviation : {data.std():.2f}")

Coastline-to-Area Ratio % 0–100

Count : 235
Mean : 26.74
Median : 0.82
Mode : 0.00
Variance : 8235.38
Standard Deviation : 90.75


In [17]:
df = pd.read_csv("arable_land_2017.csv")
data = df["arable_land_percent"].dropna()

print("Arable Land Ratio % 0–100")
print(f"\nCount : {data.count()}")
print(f"Mean : {data.mean():.2f}")
print(f"Median : {data.median():.2f}")
print(f"Mode : {data.mode().iloc[0]:.2f}")
print(f"Variance : {data.var():.2f}")
print(f"Standard Deviation : {data.std():.2f}")

Arable Land Ratio % 0–100

Count : 255
Mean : 13.68
Median : 10.08
Mode : 2.86
Variance : 167.22
Standard Deviation : 12.93


P VALUE AND CORRELATION COEFFICIENTS

In [18]:

# 1. P-VALUE & PEARSON CORRELATION




# Load Happiness Data
happiness_df = pd.read_csv("Countries' Happiness Data 2017.csv")

# Match Adult Literacy Rate with Happiness
with open("API_SE.ADT.LITR.ZS_DS2_en_csv_v2_19396.csv", "r", encoding="utf-8") as f:
    lines_lit = f.readlines()

for i, line in enumerate(lines_lit):
    if "2017" in line:
        header = line.strip().split(",")
        lit_index = [j for j, col in enumerate(header) if col.strip('"') == "2017"][0]
        data_start_lit = i + 1
        break

common_lit_scores = []
common_lit_happy = []

for line in lines_lit[data_start_lit:]:
    parts = line.strip().split(",")
    country = parts[0].strip('"')
    try:
        lit_val = float(parts[lit_index].strip().strip('"'))
        row = happiness_df[happiness_df["Country"] == country]
        if not row.empty:
            common_lit_scores.append(lit_val)
            common_lit_happy.append(float(row["Happiness.Score"].values[0]))
    except:
        continue

# Match Tertiary Enrollment with Happiness
with open("API_SE.TER.ENRR_DS2_en_csv_v2_23897.csv", "r", encoding="utf-8") as f:
    lines_enr = f.readlines()

common_enr_scores = []
common_enr_happy = []

for line in lines_enr[5:]:
    parts = line.strip().split(",")
    country = parts[0].strip('"')
    try:
        enr_val = float(parts[61].strip().strip('"'))
        row = happiness_df[happiness_df["Country"] == country]
        if not row.empty:
            common_enr_scores.append(enr_val)
            common_enr_happy.append(float(row["Happiness.Score"].values[0]))
    except:
        continue

# 2. PEARSON CORRELATION RESULTS

def pearson_stats(x, y, label):
    r, p = pearsonr(x, y)
    print(f"\n{label} (Pearson Correlation)")
    print("-" * (len(label) + 26))
    print(f"Correlation Coefficient (r): {r:.4f}")
    print(f"P-value                    : {p:.4e}")

pearson_stats(common_enr_scores, common_enr_happy, "Tertiary Enrollment vs Happiness")
pearson_stats(common_lit_scores, common_lit_happy, "Literacy Rate vs Happiness")

# 3. SPEARMAN CORRELATION RESULTS

def spearman_stats(x, y, label):
    r, p = spearmanr(x, y)
    print(f"\n{label} (Spearman Correlation)")
    print("-" * (len(label) + 27))
    print(f"Correlation Coefficient (ρ): {r:.4f}")
    print(f"P-value                    : {p:.4e}")

spearman_stats(common_enr_scores, common_enr_happy, "Tertiary Enrollment vs Happiness Score")
spearman_stats(common_lit_scores, common_lit_happy, "Literacy Rate vs Happiness Score")



Tertiary Enrollment vs Happiness (Pearson Correlation)
----------------------------------------------------------
Correlation Coefficient (r): 0.6404
P-value                    : 3.7797e-14

Literacy Rate vs Happiness (Pearson Correlation)
----------------------------------------------------
Correlation Coefficient (r): 0.1199
P-value                    : 1.5808e-01

Tertiary Enrollment vs Happiness Score (Spearman Correlation)
-----------------------------------------------------------------
Correlation Coefficient (ρ): 0.6491
P-value                    : 1.3098e-14

Literacy Rate vs Happiness Score (Spearman Correlation)
-----------------------------------------------------------
Correlation Coefficient (ρ): 0.1164
P-value                    : 1.7093e-01


In [22]:

forest_df = pd.read_csv("Forest_Area_2017_Countries.csv")
happiness_df = pd.read_excel("Filtered_Happiness_Data_2017 (1).xlsx")

merged_df = pd.merge(happiness_df, forest_df, left_on="Country", right_on="Country Name")

forest = merged_df["2017"]
happiness = merged_df["Happiness.Score"]

pearson_corr, pearson_p = pearsonr(forest, happiness)

spearman_corr, spearman_p = spearmanr(forest, happiness)

print("Forest Area vs Happiness Score (Pearson Correlation)")
print(f"Correlation Coefficient (r): {pearson_corr:.4f}")
print(f"P-value                 : {pearson_p:.4e}")
print()

print("Forest Area vs Happiness Score (Spearman Correlation)")
print(f"Correlation Coefficient (ρ): {spearman_corr:.4f}")
print(f"P-value                   : {spearman_p:.4e}")


Forest Area vs Happiness Score (Pearson Correlation)
Correlation Coefficient (r): 0.0957
P-value                 : 2.7854e-01

Forest Area vs Happiness Score (Spearman Correlation)
Correlation Coefficient (ρ): 0.1037
P-value                   : 2.4025e-01


In [24]:


happiness_df = pd.read_excel("Filtered_Happiness_Data_2017 (1).xlsx")[['Country', 'Happiness.Score']]
pm25_df = pd.read_excel("PM2.5_2017_Countries.xlsx")

merged_df = pd.merge(happiness_df, pm25_df, on='Country')

pearson_corr, pearson_p = pearsonr(merged_df['PM2.5_2017'], merged_df['Happiness.Score'])
spearman_corr, spearman_p = spearmanr(merged_df['PM2.5_2017'], merged_df['Happiness.Score'])

print("Air Pollution (PM2.5) vs Happiness Score")
print(f"Pearson Correlation Coefficient (r): {round(pearson_corr, 4)}")
print(f"Pearson P-value                   : {round(pearson_p, 4)}\n")
print(f"Spearman Correlation Coefficient (ρ): {round(spearman_corr, 4)}")
print(f"Spearman P-value                     : {round(spearman_p, 4)}")


Air Pollution (PM2.5) vs Happiness Score
Pearson Correlation Coefficient (r): -0.4664
Pearson P-value                   : 0.0

Spearman Correlation Coefficient (ρ): -0.5832
Spearman P-value                     : 0.0


In [31]:


coast_df = pd.read_csv("coastlines.csv")
happiness_df = pd.read_excel("Filtered_Happiness_Data_2017 (1).xlsx")

coast_df = coast_df[["Country", "coast_to_area_wf"]].dropna()
coast_df.columns = ["Country", "coast_ratio"]

coast_df["coast_log"] = np.log1p(coast_df["coast_ratio"])
scaler = StandardScaler()
coast_df["coast_log_scaled"] = scaler.fit_transform(coast_df[["coast_log"]])

merged = happiness_df.merge(coast_df, on="Country", how="inner")

pearson_r, pearson_p = pearsonr(merged["coast_log_scaled"], merged["Happiness.Score"])
spearman_r, spearman_p = spearmanr(merged["coast_log_scaled"], merged["Happiness.Score"])

print("* Coastline Ratio vs Happiness Score (Log + Scaled)\n")

print("   Pearson Correlation")
print(f"   Correlation Coefficient (r): {pearson_r:.4f}")
print(f"   P-value                    : {pearson_p:.5f}\n")

print("   Spearman Correlation")
print(f"   Correlation Coefficient (ρ): {spearman_r:.4f}")
print(f"   P-value                    : {spearman_p:.5f}")


* Coastline Ratio vs Happiness Score (Log + Scaled)

   Pearson Correlation
   Correlation Coefficient (r): 0.3608
   P-value                    : 0.00001

   Spearman Correlation
   Correlation Coefficient (ρ): 0.4206
   P-value                    : 0.00000


In [32]:
arable_df = pd.read_csv("arable_land_2017.csv")
happiness_df = pd.read_excel("Filtered_Happiness_Data_2017 (1).xlsx")

arable_df["arable_log"] = np.log1p(arable_df["arable_land_percent"])
scaler = StandardScaler()
arable_df["arable_log_scaled"] = scaler.fit_transform(arable_df[["arable_log"]])

merged = happiness_df.merge(arable_df, left_on="Country", right_on="country", how="inner")

pearson_r, pearson_p = pearsonr(merged["arable_log_scaled"], merged["Happiness.Score"])
spearman_r, spearman_p = spearmanr(merged["arable_log_scaled"], merged["Happiness.Score"])

print("* Arable Land vs Happiness Score (Log + Scaled)\n")

print("   Pearson Correlation")
print(f"   Correlation Coefficient (r): {pearson_r:.4f}")
print(f"   P-value                    : {pearson_p:.4f}\n")

print("   Spearman Correlation")
print(f"   Correlation Coefficient (ρ): {spearman_r:.4f}")
print(f"   P-value                    : {spearman_p:.4f}")

* Arable Land vs Happiness Score (Log + Scaled)

   Pearson Correlation
   Correlation Coefficient (r): -0.1380
   P-value                    : 0.1117

   Spearman Correlation
   Correlation Coefficient (ρ): -0.1371
   P-value                    : 0.1141
