# Final Project: Happiness Report Data Analysis (2015)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import pearsonr

# Load dataset
# Reads "2015.csv" (path relative to the current working directory) into `df`.
# The trailing df.head() is the cell's displayed value: a preview of the first rows.
df = pd.read_csv("2015.csv")
df.head()

In [None]:
# Check for nulls
# df.info() prints every column's dtype and non-null count; a column whose
# non-null count is lower than the total number of entries has missing values.
df.info()

In [None]:
# Remove outliers using the Z-score method: a row is discarded when any of its
# numeric columns lies `threshold` or more sample standard deviations away
# from that column's mean.
numeric_cols = df.select_dtypes(include=[np.number])
z_scores = np.abs((numeric_cols - numeric_cols.mean()) / numeric_cols.std())
threshold = 3
# Flag outliers with a positive test (|z| >= threshold) instead of keeping rows
# where every |z| < threshold. A NaN z-score (missing value, or a constant
# column whose std is 0) compares False in both forms, so the keep-if-smaller
# form dropped such rows and reported them as outliers. Here they are kept;
# missing values must be handled explicitly elsewhere if that is required.
is_outlier = (z_scores >= threshold).any(axis=1)
df_clean = df[~is_outlier]
print(f"Removed {df.shape[0] - df_clean.shape[0]} outliers, {df_clean.shape[0]} rows remain.")

In [None]:
# Descriptive statistics of the cleaned "Happiness Score" column.
# Note: np.std uses ddof=0 by default, i.e. the population standard deviation.
happiness_scores = df_clean["Happiness Score"]
mean_score, std_score, median_score, max_score, min_score = (
    reducer(happiness_scores)
    for reducer in (np.mean, np.std, np.median, np.max, np.min)
)

# Displayed as the cell output, in the order: mean, std, median, max, min.
mean_score, std_score, median_score, max_score, min_score

In [None]:
# Pearson correlation between GDP per capita and life expectancy on the
# cleaned data; pearsonr yields the coefficient and its p-value.
x, y = (
    df_clean[column]
    for column in ("Economy (GDP per Capita)", "Health (Life Expectancy)")
)
correlation, p_value = pearsonr(x, y)
# Displayed as the cell output.
correlation, p_value

In [None]:
# Ordinary least squares: Happiness Score regressed on GDP per capita.
# add_constant adds the intercept column to the design matrix.
y = df_clean["Happiness Score"]
X = sm.add_constant(df_clean["Economy (GDP per Capita)"])
model = sm.OLS(endog=y, exog=X).fit()
# Displayed as the cell output: the fitted model's summary table.
model.summary()

In [None]:
sns.set(style="whitegrid")
# One figure, two side-by-side panels, each drawn on its own Axes object.
fig, (region_ax, gdp_ax) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: distribution of Happiness Score within each Region.
sns.boxplot(x='Region', y='Happiness Score', data=df_clean, ax=region_ax)
# Rotate the x tick labels to vertical.
plt.setp(region_ax.get_xticklabels(), rotation=90)
region_ax.set_title("Happiness Score by Region")

# Right panel: GDP per capita against life expectancy.
sns.scatterplot(
    x="Economy (GDP per Capita)",
    y="Health (Life Expectancy)",
    data=df_clean,
    ax=gdp_ax,
)
gdp_ax.set_title("GDP vs Life Expectancy")

fig.tight_layout()
plt.show()

In [None]:
# Scatter of GDP per capita vs Happiness Score with a fitted regression line
# drawn in red.
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(
    x="Economy (GDP per Capita)",
    y="Happiness Score",
    data=df_clean,
    line_kws={"color": "red"},
    ax=ax,
)
ax.set_title("Correlation: GDP per Capita vs Happiness Score")
plt.show()