In [None]:
import pandas as pd
import numpy as np
import re

In [107]:
Energy = pd.read_excel("utility_files/energy.xls", header=None)
# get rid of unnecessary columns and rows
Energy = Energy.iloc[18: 245, 2:]
# print(Energy)

# let's set the column names
col_names = ['Country', 'Energy Supply', 'Energy Supply per Capita', "% Renewable"]
Energy.columns = col_names
# Energy = Energy.reset_index() 
# convert Energy supplies to gigajoules
# print(Energy["Energy Supply"])
es = "Energy Supply"

Energy[es] = Energy[es].apply(lambda x: x * 10 ** 6) ## convert each value to gigajoules
# print(Energy[es])

# rename certain countries
names_dic = {"Republic of Korea": "South Korea",
"United States of America": "United States",
"United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
"China, Hong Kong Special Administrative Region": "Hong Kong"}



def new_name(name):
    # first make sure all the names are free of numbers and parentheses
    name_1 = re.sub("\d+", "", name) # remove any numbers in the country's name
    name_2 = re.sub(r"\(.*\)", "", name_1) # remove any letters written in between parentheses
    
    if name_2 in names_dic:
        return names_dic[name_2]
    
    return name_2.strip()

# test_names = ["Bolivia (Plurinational State of)", "Netherlands12", "Portugal13", "Sint Maarten (Dutch part)",
#  "United Kingdom of Great Britain and Northern Ireland19", "United States of America20"]
# for t in test_names:
#     print(new_name(t))

cn = "Country"
Energy[cn] = Energy[cn].apply(new_name)
# for name in Energy[cn]:
#     print(name)

Energy = Energy.set_index(cn)

country_list = ['Australia', 'Bolivia', 'China', 'Hong Kong', 'China, Macao Special Administrative Region', 'Denmark', 'Falkland Islands', 'France', 'Greenland', 'Indonesia', 'Iran', 'Italy', 'Japan', 'Kuwait', 'Micronesia', 'Netherlands', 'Portugal', 'South Korea', 'Saudi Arabia', 'Serbia', 'Sint Maarten', 'Spain', 'Switzerland', 'Ukraine', 'United Kingdom', 'United States', 'Venezuela']

for i in range(len(country_list)):
    if country_list[i] not in Energy.index.values.tolist():
        print(country_list[i])

In [None]:
GDP = pd.read_csv("utility_files/world_bank.csv", header=None, skiprows=range(4)) # skip the header as well as the first 4 rows
# print(GDP.head())
GDP.columns = GDP.iloc[0, :].tolist() 
# print(GDP.columns)
GDP = GDP.rename(columns={"Country Name": "Country"})
# print(GDP)
# print(GDP.columns[0])
# convert all the years from the form xxxx.0 to xxxx (for later use)
GDP.columns = [re.sub("\.0", "", str(col)) for col in GDP.columns]
print(GDP.columns)
name_mapper = {"Korea, Rep.": "South Korea", 
"Iran, Islamic Rep.": "Iran",
"Hong Kong SAR, China": "Hong Kong"}
cn = "Country"
GDP[cn] = GDP[cn].apply(lambda x: name_mapper[x] if x in name_mapper else x)
GDP = GDP.set_index(cn)



In [None]:
# load the Sciamgo Journal and Country Rank data for Energy Engineering and Power Technology data to the ScimEm dataframe
ScimEn = pd.read_excel("utility_files/scimagojr-3.xlsx") # this file does not demand any preprocessing
print(ScimEn.head())
ScimEn = ScimEn.set_index("Country")


In [92]:
# now we will need to join the datasets,
# we will use only the last 10 years of the GPD dataset
# only the first 15 countries in the ScimEn dataset
# top_n = 15
# ScimEn = ScimEn.iloc[:top_n, :]
# print(ScimEn)

GDP = GDP.loc[:, [str(i) for i in range(2006, 2016)]]

df = pd.merge(ScimEn, Energy, how='inner', left_index=True, right_index=True)
df = pd.merge(df, GDP, how='inner', left_index=True, right_index=True)
# df.drop(columns='index', inplace=True)
# print(df.index)
print(df.index)


Index(['China', 'United States', 'Japan', 'United Kingdom',
       'Russian Federation', 'Canada', 'Germany', 'India', 'France',
       'South Korea',
       ...
       'Burundi', 'Liberia', 'Togo', 'American Samoa', 'Gibraltar', 'Maldives',
       'Belize', 'Palau', 'Guyana', 'Mauritania'],
      dtype='object', name='Country', length=161)


In [108]:
# let's try to compute how many rows were lost in the merging procedure.
# Let's denote the Energy data set as E, the GDP data set as G and The journal dataset as J:
# to compute all the elements different from the intersection of the 3 dataset:
# |E| + |G| + |J| - |E union J| - |G union J| - |E union G| + |E union J Union G|
J_union_E = pd.merge(ScimEn, Energy, how='inner', left_index=True, right_index=True)
J_union_G = pd.merge(ScimEn, GDP, how='inner', left_index=True, right_index=True)
E_union_G = pd.merge(Energy, GDP, how='inner', left_index=True, right_index=True)
print(len(Energy))
print(len(GDP))
print(len(ScimEn))
print((J_union_E).index[110:120])
print(len(J_union_G))
print(len(E_union_G))

print(len(Energy) + len(GDP) + len(ScimEn) - len(J_union_E) - len(J_union_G) - len(E_union_G)) 


227
265
191
Index(['Zimbabwe', 'Senegal', 'El Salvador', 'Fiji', 'Jamaica', 'Mauritius',
       'Mozambique', 'Nicaragua', 'Myanmar', 'Tajikistan'],
      dtype='object', name='Country')
165
186
159


In [None]:
# we need the top 15 countries when it comes to GDP
df_c = df.copy()
top_GDP = df_c.loc[:, [str(i) for i in range(2006, 2016)]].apply(np.mean, axis=1).sort_values(ascending=False)
print(top_GDP)

In [None]:
# we need to see by how much the GDP of the 6th country changed over the last 10 years
country_name = top_GDP.index[0]
print(country_name)

In [None]:
print(df.columns)
# print(df)
df = df.iloc[:15,:]

In [None]:
print(GDP.loc[country_name, "2015"] - GDP.loc[country_name, "2006"])

In [None]:
print(df["% Renewable"])
max_renewable_index = np.argmax(df["% Renewable"])
ans = (str(df.index[max_renewable_index]), df["% Renewable"][max_renewable_index])
print(ans)
print(type(ans[0]))

In [None]:
df1 = df.copy()
df1["self_cit_ratio"] = df1["Self-citations"] / df1["Citations"]
max_self_citation = np.argmax(df1["self_cit_ratio"])
print(max_self_citation)
tup = (df1.index[max_self_citation], df1["self_cit_ratio"][max_self_citation])
print(type(tup))
print(df1["self_cit_ratio"])
print(tup)

In [None]:
df1 = df.copy()
df1["Pop"] = df1["Energy Supply"] / df1["Energy Supply per Capita"]
pop_sorted = df1["Pop"].sort_values(ascending=False)
print(pop_sorted)
print(pop_sorted.index[2])

In [None]:
df1 = df.copy()
df1['PopEst'] = df1['Energy Supply'] / df1['Energy Supply per Capita']
df1['Citable docs per Capita'] = df1['Citable documents'] / df1['PopEst']
df1 = df1.loc[:, ["Energy Supply per Capita", "Citable docs per Capita"]]
df1 = df1.astype(float)
print(df1.corr(method ='pearson').loc["Energy Supply per Capita", "Citable docs per Capita"])



In [None]:
df1 = df.copy()
renew = "% Renewable"
median = df1[renew].median()
median_renew = (df1[renew] >= median).astype(int)
print(median_renew)

In [None]:
ContinentDict  = {'China':'Asia', 
                  'United States':'North America', 
                  'Japan':'Asia', 
                  'United Kingdom':'Europe', 
                  'Russian Federation':'Europe', 
                  'Canada':'North America', 
                  'Germany':'Europe', 
                  'India':'Asia',
                  'France':'Europe', 
                  'South Korea':'Asia', 
                  'Italy':'Europe', 
                  'Spain':'Europe', 
                  'Iran':'Asia',
                  'Australia':'Australia', 
                  'Brazil':'South America'} 

In [None]:
funcs_list = [pd.Series.count, pd.Series.sum, np.mean, np.std]
df1 = df1.copy()
df1['PopEst'] = df1['Energy Supply'] / df1['Energy Supply per Capita']
continents = df1.loc[:, ["PopEst"]].groupby(lambda x: ContinentDict[x]).agg({"PopEst":  funcs_list})
continents = continents.loc[:,"PopEst"]
continents.columns = ['size', 'sum', 'mean', 'std']
print(continents.columns)
print(continents.index)
print(continents)


In [None]:
df1 = df.iloc[:15, :].copy()
df1 =df1.reset_index()
df1["Continent"] = df1["Country"].apply(lambda x: ContinentDict[x])
bins_name = "% Renewable bins" 
df1[bins_name] = pd.cut(df1[renew], 5)
# df1 = df1.set_index("Country")
df1 = df1.loc[:, [bins_name, "Continent", "Country"]]
#print(df1)
df1 = df1.rename(columns={bins_name:"% Renewable"})
c = "Continent"
r = "% Renewable"
df1 = df1.groupby([c, r]).agg({"Country": pd.Series.count}).dropna()
print(type(df1["Country"]))
# print(df1)

In [None]:
def seperate_num_string_comma(num_string):
    parts = num_string.split(".")
    # take into account non-decimal numbers
    bef_dec = (parts[1] if len(parts) >= 2 else "")
    aft_dec = parts[0]
    # revert the non-decimal part
    aft_dec = aft_dec[::-1]
    # add the commas
    aft_dec_comma = [aft_dec[i] + "," if (i > 0 and i%3 == 0) else aft_dec[i] for i in range(len(aft_dec))]
    # revert the non-decimal part after adding commas
    aft_dec_comma = aft_dec_comma[::-1]
    return "".join(aft_dec_comma) + ("." + bef_dec if len(parts) >= 2 else "")


# for _ in range(50):
#     x = np.random.randint(1000, 10 ** 8)
#     print(x)
#     print(seperate_num_string_comma(str(x)))

# for _ in range(25):
#     x = np.random.rand() * 10 ** 9
#     print(x)
#     print(seperate_num_string_comma(str(x)))
#     print("###################################")


In [None]:
df1 = df.copy()
df1["Pop"] = df1["Energy Supply"] / df1["Energy Supply per Capita"]
print(df1["Pop"].apply(lambda x: seperate_num_string_comma(str(x))))