### Data Cleaning

In [35]:
import pandas as pd

# Excel file path
file_path = 'WorldEnergyData/Statistical Review of World Energy Data.xlsx'

# Specify the sheet name
sheet_name = 'Primary Energy Consumption'

# Read the data into a DataFrame using pd.read_excel()
df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)

# Remove rows 1, 2, 4
df = df.drop([0, 1, 3])

# Remove last three columns
df = df.iloc[:, :-3]

# Use the row with years as column names
df.columns = df.iloc[0]

# Delete the last 6 rows
df = df.iloc[:-15]

# Reset the index
df = df.reset_index(drop=True)

# Remove the rows where it has years
df = df.drop([0])

# Delete rows with all NaN values
df = df.dropna(axis=0, how='all')

# Delete rows where the country name has 'Total' in it
df = df[~df.iloc[:, 0].str.contains('Total')]

# Rename column to 'Country'
df.rename(columns={df.columns[0]: 'Country'}, inplace=True)

# Convert the years (column names) to integers starting from the second column
df.columns = [df.columns[0]] + [int(col) for col in df.columns[1:]]

# Print column names to check their format
print(df.columns)

# Convert data to numeric, ignoring errors
df.iloc[:, 1:] = df.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')


df_melted.dtypes



Index(['Country',      1965,      1966,      1967,      1968,      1969,
            1970,      1971,      1972,      1973,      1974,      1975,
            1976,      1977,      1978,      1979,      1980,      1981,
            1982,      1983,      1984,      1985,      1986,      1987,
            1988,      1989,      1990,      1991,      1992,      1993,
            1994,      1995,      1996,      1997,      1998,      1999,
            2000,      2001,      2002,      2003,      2004,      2005,
            2006,      2007,      2008,      2009,      2010,      2011,
            2012,      2013,      2014,      2015,      2016,      2017,
            2018,      2019,      2020,      2021,      2022],
      dtype='object')


Country               object
year                  object
energy_consumption    object
dtype: object

### Melt data and format for plotly visualization

In [85]:


df_melted = df.melt(id_vars=['Country'], var_name='year', value_name='energy_consumption')

# Read country codes into a new df
df_codes = pd.read_csv('WorldEnergyData/countries_codes_and_coordinates.csv')

# Replace "North Macedonia" wtih "Macedonia, the former Yugoslav Republic of" in df_codes
df_melted['Country'] = df_melted['Country'].replace('North Macedonia', 'Macedonia, the former Yugoslav Republic of')

# Replace "China Hong Kong SAR" with "Hong Kong" in df_codes
df_melted['Country'] = df_melted['Country'].replace('China Hong Kong SAR', 'Hong Kong')

# Replace "US" with "United States" in df_codes
df_melted['Country'] = df_codes['Country'].replace('US', 'United States')

# Merge the DataFrame with the country codes
# We're using 'left' merge to keep all records from your original data
df_merged = pd.merge(df_melted, codes_df[['Country', 'Alpha-2 code']], on='Country', how='left')

# Set the option to display all rows and columns (use with caution for very large DataFrames)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Delete rows where there are NaN values
df_merged = df_merged.dropna()

# rename Alpha-2 code column name
df_merged = df_merged.rename(columns={'Alpha-2 code': 'iso_alpha'})

# First, ensure that iso_alpha values are correctly formatted by removing quotes
df_merged['iso_alpha'] = df_merged['iso_alpha'].str.replace('"', '')

# Convert energy from exajoules to gigajoules
df_merged['energy_consumption'] = df_merged['energy_consumption'] * 1000000
df_merged.head()



Unnamed: 0,Country,year,energy_consumption,iso_alpha
0,Afghanistan,1965,5000530.28035,AF
1,Åland Islands,1965,1055970.769487,AX
2,Albania,1965,51982923.795742,AL
3,Algeria,1965,1182699.451535,DZ
4,American Samoa,1965,985930.651254,AS


### Map Visualization

In [88]:
import plotly.express as px

# Convert 'year' to string for the animation
df_merged['year'] = df_merged['year'].astype(str)

# Create basic choropleth map
fig = px.choropleth(df_merged, locations='iso_alpha', color='Country', hover_name='energy_consumption',
                    projection='natural earth', animation_frame='year',
                    title='Energy Consumption over time by Country') 
fig.show()