In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as stats

In [None]:
# Read the survey CSV into a DataFrame and preview its first rows.
survey_csv_path = '/workspaces/ESG-Analysis/Data/ESS8e02_3.csv'
survey_df = pd.read_csv(survey_csv_path)
survey_df.head()

In [None]:
# Summary statistics for the survey frame, as produced by DataFrame.describe().
survey_df.describe()

In [None]:
# Print pandas' concise structural summary of the survey frame.
survey_df.info()

Select only the columns relevant to the research question

In [None]:
# Columns kept for the analysis; everything else in the survey file is dropped.
analysis_columns = [
    'idno', 'cntry', 'domicil', 'wrclmch', 'clmthgt1', 'ccgdbd',
    'eneffap', 'rdcenr', 'ccrdprs', 'eisced', 'lrscale',
]
data_needed = survey_df[analysis_columns]
data_needed.head()

In [None]:
# Count of missing (NaN) entries per selected column.
data_needed.isna().sum()

In [None]:
# Distinct country codes present in the selected survey data.
data_needed['cntry'].unique()

Remove countries that are not part of Europe (Russian Federation, Israel) and ensure no 'RU' or 'IL' rows remain

In [None]:
# Drop the rows belonging to the excluded country codes, then report whether
# any of those codes are still present.
excluded_countries = ['RU', 'IL']
keep_row = ~data_needed['cntry'].isin(excluded_countries)
data_needed = data_needed[keep_row]

still_present = data_needed['cntry'].isin(excluded_countries).any()
print("Still: 'RU' or 'IL'" if still_present else "No: 'RU' or 'IL'")

Data from World Bank about education

In [None]:
# Read the World Bank export into a DataFrame and display it.
world_bank_csv_path = '/workspaces/ESG-Analysis/Data/01d4cc76-d950-4515-a40d-b85330acb7b9_Data.csv'
world_bank_df = pd.read_csv(world_bank_csv_path)
world_bank_df

In [None]:
# Label of the World Bank column used throughout the cells below
# ('2016 [YR2016]'), kept in a variable so the literal is not repeated.
# NOTE(review): presumably 2016 was chosen to match the survey round — confirm.
year = '2016 [YR2016]'

In [None]:
# Distinct raw values in the selected year column, useful for spotting
# placeholder entries before cleaning.
set(world_bank_df[year])

In [None]:
# Treat the '..' placeholder as missing everywhere in the frame, then keep only
# the rows that have a value for the selected year.
# - replace() is used without inplace=True so the loaded frame object is not
#   mutated; the name is simply rebound to the cleaned result.
# - .copy() makes the filtered result an independent frame, so later cells can
#   assign columns on it without triggering SettingWithCopyWarning.
# NOTE(review): if the year column was read as text because of the '..'
# placeholders, the surviving values are still strings — confirm they are
# converted to numbers before any arithmetic.
world_bank_df = world_bank_df.replace('..', np.nan)
world_bank_df = world_bank_df[world_bank_df[year].notna()].copy()

In [None]:
# Distinct country codes left after the missing-value filter; these are the
# keys the code mapping in the next cell has to cover.
world_bank_df['Country Code'].unique()

In [None]:
# Three-letter country codes found in the World Bank frame, mapped to the
# two-letter codes used by the survey's 'cntry' column (the later merge joins
# on these).
three_to_two = {
    'AUT': 'AT',
    'BEL': 'BE',
    'CZE': 'CZ',
    'EST': 'EE',
    'FIN': 'FI',
    'FRA': 'FR',
    'DEU': 'DE',
    'HUN': 'HU',
    'ISL': 'IS',
    'IRL': 'IE',
    'ITA': 'IT',
    'LTU': 'LT',
    'NLD': 'NL',
    'NOR': 'NO',
    'POL': 'PL',
    'PRT': 'PT',
    'SVN': 'SI',
    'ESP': 'ES',
    'SWE': 'SE',
    'CHE': 'CH',
    'GBR': 'GB',
}

# world_bank_df is the result of a boolean-mask filter, so assigning a column
# on it directly raises SettingWithCopyWarning. Work on an explicit copy.
world_bank_df = world_bank_df.copy()
# Series.map() with a dict yields NaN for any code that is not a key above.
world_bank_df['Country Code'] = world_bank_df['Country Code'].map(three_to_two)
world_bank_df


In [None]:
# Reshape from long to wide: one row per country code, one column per series,
# filled with the values of the selected year.
# reset_index() turns the country code back into an ordinary column, and
# rename_axis(columns=None) clears the leftover 'Series Name' axis label.
world_bank_df = (
    world_bank_df
    .pivot(index='Country Code', columns='Series Name', values=year)
    .reset_index()
    .rename_axis(columns=None)
)
world_bank_df

In [None]:
# Merge the two DataFrames
survey_and_wb = data_needed.merge(world_bank_df, left_on='cntry', right_on='Country Code', how='left')

# Drop the redundant "Country Code" column
survey_and_wb.drop(columns=['Country Code'], inplace=True)

survey_and_wb.head()

In [None]:
# (rows, columns) of the merged frame. With a left join the row count equals
# that of data_needed as long as each country code appears once on the right.
survey_and_wb.shape