In [1]:
import pandas as pd
import os

# List all CSV files in the current directory
csv_files = [file for file in os.listdir() if file.endswith('.csv')]

# Optional: print to verify
print("Files found:", csv_files)

# Load and combine
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    df['Year'] = file[-8:-4]  # Adjust this based on your file naming, or use a cleaner method if needed
    dfs.append(df)

# Combine all into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Preview it
combined_df.head()


Files found: ['2020.csv', '2021.csv', '2023.csv', '2022.csv', '2024.csv']


Unnamed: 0,Country name,Happiness Rank,Happiness score,Upperwhisker,Lowerwhisker,Economy (GDP per Capita)\t,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Year
0,Finland,1,7.81,7.87,7.75,1.29,1.5,0.96,0.66,0.16,0.48,2020
1,Denmark,2,7.65,7.71,7.58,1.33,1.5,0.98,0.67,0.24,0.5,2020
2,Switzerland,3,7.56,7.63,7.49,1.39,1.47,1.04,0.63,0.27,0.41,2020
3,Iceland,4,7.5,7.62,7.39,1.33,1.55,1.0,0.66,0.36,0.14,2020
4,Norway,5,7.49,7.56,7.42,1.42,1.5,1.01,0.67,0.29,0.43,2020


In [2]:
import os

csv_files = [file for file in os.listdir() if file.endswith('.csv')]
print(f"CSV Files Found ({len(csv_files)}):", csv_files)


CSV Files Found (5): ['2020.csv', '2021.csv', '2023.csv', '2022.csv', '2024.csv']


In [3]:
import pandas as pd

dfs = []

for file in csv_files:
    df = pd.read_csv(file)
    df['Year'] = file.split('.')[0]  # Adjust if needed based on filename format
    dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)

print("Combined Shape:", combined_df.shape)
combined_df.head(10)  # See the first 10 rows for variety


Combined Shape: (728, 12)


Unnamed: 0,Country name,Happiness Rank,Happiness score,Upperwhisker,Lowerwhisker,Economy (GDP per Capita)\t,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Year
0,Finland,1,7.81,7.87,7.75,1.29,1.5,0.96,0.66,0.16,0.48,2020
1,Denmark,2,7.65,7.71,7.58,1.33,1.5,0.98,0.67,0.24,0.5,2020
2,Switzerland,3,7.56,7.63,7.49,1.39,1.47,1.04,0.63,0.27,0.41,2020
3,Iceland,4,7.5,7.62,7.39,1.33,1.55,1.0,0.66,0.36,0.14,2020
4,Norway,5,7.49,7.56,7.42,1.42,1.5,1.01,0.67,0.29,0.43,2020
5,Netherlands,6,7.45,7.5,7.39,1.34,1.46,0.98,0.61,0.34,0.37,2020
6,Sweden,7,7.35,7.42,7.28,1.32,1.43,0.99,0.65,0.27,0.44,2020
7,New Zealand,8,7.3,7.38,7.22,1.24,1.49,1.01,0.65,0.33,0.46,2020
8,Austria,9,7.29,7.36,7.23,1.32,1.44,1.0,0.6,0.26,0.28,2020
9,Luxembourg,10,7.24,7.3,7.18,1.54,1.39,0.99,0.61,0.2,0.37,2020


In [4]:
# Check column names, missing values, and data types
print("Columns:", combined_df.columns)
print("\nMissing values per column:")
print(combined_df.isnull().sum())

print("\nData types:")
print(combined_df.dtypes)

Columns: Index(['Country name', 'Happiness Rank', 'Happiness score', 'Upperwhisker',
       'Lowerwhisker', 'Economy (GDP per Capita)\t', 'Social support',
       'Healthy life expectancy', 'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Year'],
      dtype='object')

Missing values per column:
Country name                    0
Happiness Rank                  0
Happiness score                 0
Upperwhisker                    0
Lowerwhisker                    0
Economy (GDP per Capita)\t      3
Social support                  3
Healthy life expectancy         4
Freedom to make life choices    3
Generosity                      3
Perceptions of corruption       3
Year                            0
dtype: int64

Data types:
Country name                     object
Happiness Rank                    int64
Happiness score                 float64
Upperwhisker                    float64
Lowerwhisker                    float64
Economy (GDP per Capita)\t      flo

In [6]:
# Strip whitespace and tab characters from column names
combined_df.columns = combined_df.columns.str.strip()


In [7]:
combined_df['Year'] = combined_df['Year'].astype(int)


In [8]:
combined_df.dropna(inplace=True)


In [9]:
print("✅ Cleaned columns:", combined_df.columns.tolist())
print("\n🧼 Remaining missing values:\n", combined_df.isnull().sum())
print("\n📐 Data types:\n", combined_df.dtypes)


✅ Cleaned columns: ['Country name', 'Happiness Rank', 'Happiness score', 'Upperwhisker', 'Lowerwhisker', 'Economy (GDP per Capita)', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'Year']

🧼 Remaining missing values:
 Country name                    0
Happiness Rank                  0
Happiness score                 0
Upperwhisker                    0
Lowerwhisker                    0
Economy (GDP per Capita)        0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       0
Year                            0
dtype: int64

📐 Data types:
 Country name                     object
Happiness Rank                    int64
Happiness score                 float64
Upperwhisker                    float64
Lowerwhisker                    float64
Economy (GDP per Capita)        float64
Social support                  

In [10]:
# Define columns to drop
columns_to_drop = [
    'Country name', 'Happiness Rank', 'Upperwhisker', 'Lowerwhisker'
]

# Create reduced DataFrame
reduced_df = df.drop(columns=columns_to_drop)

print("✅ Reduced DataFrame shape:", reduced_df.shape)
print("🧠 Remaining columns:", reduced_df.columns.tolist())


✅ Reduced DataFrame shape: (143, 8)
🧠 Remaining columns: ['Happiness score', 'Economy (GDP per Capita)\t', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'Year']


In [11]:
from sklearn.linear_model import LinearRegression

X = reduced_df.drop(columns=['Happiness score'])
y = reduced_df['Happiness score']

model = LinearRegression()
model.fit(X, y)

importance = pd.Series(model.coef_, index=X.columns)
print("Linear Regression Coefficients:")
print(importance.sort_values(ascending=False))


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [12]:
X.isna().sum()


Economy (GDP per Capita)\t      3
Social support                  3
Healthy life expectancy         3
Freedom to make life choices    3
Generosity                      3
Perceptions of corruption       3
Year                            0
dtype: int64

In [13]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
X_clean = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
y_clean = y.copy()


In [14]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_clean, y_clean)

importance = pd.Series(model.coef_, index=X.columns)
print("Linear Regression Coefficients (Feature Importance):")
print(importance.sort_values(ascending=False))


Linear Regression Coefficients (Feature Importance):
Freedom to make life choices    1.896792
Social support                  1.438366
Healthy life expectancy         1.382510
Perceptions of corruption       1.007223
Generosity                      0.462057
Economy (GDP per Capita)\t      0.441487
Year                            0.000000
dtype: float64


In [15]:
from sklearn.preprocessing import MinMaxScaler

# Create a copy of the reduced data to normalize
features_to_normalize = reduced_df.drop(columns=['Happiness score'])
scaler = MinMaxScaler()

# Apply Min-Max scaling
normalized_array = scaler.fit_transform(features_to_normalize)
normalized_df = pd.DataFrame(normalized_array, columns=features_to_normalize.columns)

# Add back the target column
normalized_df['Happiness score'] = reduced_df['Happiness score'].values

print("✅ Normalization complete. Here's the first few rows:")
print(normalized_df.head())


✅ Normalization complete. Here's the first few rows:
   Economy (GDP per Capita)\t  Social support  Healthy life expectancy  \
0                    0.861280        0.972171                 0.810968   
1                    0.891172        0.940012                 0.815636   
2                    0.878561        1.000000                 0.837806   
3                    0.877160        0.928262                 0.844807   
4                    0.842130        0.935683                 0.863477   

   Freedom to make life choices  Generosity  Perceptions of corruption  Year  \
0                      0.995365    0.354115                   0.949565   0.0   
1                      0.953650    0.508728                   0.953043   0.0   
2                      0.949015    0.643392                   0.316522   0.0   
3                      0.971031    0.551122                   0.911304   0.0   
4                      0.742758    0.381546                   0.335652   0.0   

   Happiness score  


In [16]:
# Discretize Happiness score into 3 bins
bins = 3
labels = ['Low', 'Medium', 'High']

normalized_df['Happiness Category'] = pd.cut(
    normalized_df['Happiness score'],
    bins=bins,
    labels=labels
)

print("\n🎯 Discretization complete. Sample:")
print(normalized_df[['Happiness score', 'Happiness Category']].head())



🎯 Discretization complete. Sample:
   Happiness score Happiness Category
0            7.741               High
1            7.583               High
2            7.525               High
3            7.344               High
4            7.341               High


In [17]:
final_df = reduced_df.copy()
final_df['Happiness Category'] = pd.cut(
    final_df['Happiness score'],
    bins=3,
    labels=['Low', 'Medium', 'High']
)

# Save it to a CSV file
final_df.to_csv('happiness_final.csv', index=False)
