In [2]:
import pandas as pd
import numpy as np

In [3]:
# Year labels 2000 through 2019 (inclusive), kept as strings because they are
# used as column labels when selecting years from the dataframes.
yearsToCheck = [year for year in np.arange(2000, 2020).astype(str)]

# The three country groups compared throughout the notebook.
developedCountries = [
    'Australia',
    'Japan',
    'Sweden',
    'Switzerland',
    'USA',
]
developingCountries = [
    'Brazil',
    'Egypt',
    'Greece',
    'India',
    'South Africa',
]
underdevelopedCountries = [
    'Afghanistan',
    'Bangladesh',
    'Cambodia',
    'Ethiopia',
    'Senegal',
]

In [97]:
# Read the raw population CSV, index it by country and keep only the 2000-2019 columns.
populationCsvPath = 'data/raw_data/pop.csv'
readPopCsv = pd.read_csv(populationCsvPath)
populationDataFrame = readPopCsv.set_index('country')[yearsToCheck]


def _population_to_int(value):
    """Convert an abbreviated population string such as '52.6M' or '1.09B' to an int.

    The numeric part is scaled by the suffix: 'k' = thousand, 'M' = million,
    'B' = billion. Only string manipulation is used, so no floating point
    rounding is involved ('1.09B' -> 1090000000).

    Anything that is not a string, or that does not look like
    ``<digits>[.<digits>]<suffix>``, is returned unchanged so unexpected cells
    stay visible in the output instead of being silently altered.
    """
    if not isinstance(value, str):
        return value
    text = value.strip()
    zeros = {'k': 3, 'M': 6, 'B': 9}.get(text[-1:])
    if zeros is None:
        return value
    whole, _, fraction = text[:-1].partition('.')
    if not (whole + fraction).isdecimal():
        return value
    # Right-pad the decimals with zeros up to the suffix magnitude; digits
    # beyond that magnitude would be fractions of a person and are dropped.
    return int(whole + fraction[:zeros].ljust(zeros, '0'))


def _clean_population_frame(frame):
    """Return a new dataframe with every cell passed through ``_population_to_int``."""
    return frame.apply(lambda column: column.map(_population_to_int))


# Build one cleaned dataframe per country group. The conversion produces new
# frames instead of writing cell by cell through chained indexing
# (``frame[year][position] = ...``): that form depends on positional integer
# lookups on a country-labelled Series and on chained assignment, both of which
# pandas has deprecated, and any failure was hidden by a blanket ``except``.
developed_populationDataFrame = _clean_population_frame(populationDataFrame.loc[developedCountries])
developing_populationDataFrame = _clean_population_frame(populationDataFrame.loc[developingCountries])
underdeveloped_populationDataFrame = _clean_population_frame(populationDataFrame.loc[underdevelopedCountries])

# Save the cleaned dataframes as csv and json files.
# NOTE(review): the 'underveloped_' spelling in the two output names below is
# kept as-is because other code may already read these exact paths - confirm
# before renaming.
developed_populationDataFrame.to_csv('data/clean_data/population/csv/developed_population.csv')
developing_populationDataFrame.to_csv('data/clean_data/population/csv/developing_population.csv')
underdeveloped_populationDataFrame.to_csv('data/clean_data/population/csv/underveloped_population.csv')

developed_populationDataFrame.to_json('data/clean_data/population/json/developed_population.json', indent = 1)
developing_populationDataFrame.to_json('data/clean_data/population/json/developing_population.json', indent = 1)
underdeveloped_populationDataFrame.to_json('data/clean_data/population/json/underveloped_population.json', indent = 1)

In [96]:
# Read the inflation-adjusted total GDP CSV (indexed by country) and keep only
# the 2000-2019 columns.
gdpUSDPath = 'data/raw_data/total_gdp_us_inflation_adjusted.csv'
readGdpUSDCsv = pd.read_csv(gdpUSDPath, index_col = 'country')
GdpDataFrame = readGdpUSDCsv[yearsToCheck]


def _gdp_to_int(value):
    """Convert an abbreviated GDP string such as '260.18B' or '17.2TR' to an int.

    The numeric part is scaled by the suffix: 'k' = thousand, 'M' = million,
    'B' = billion, 'TR' = trillion. Only string manipulation is used, so no
    floating point rounding is involved ('1.24TR' -> 1240000000000).

    Anything that is not a string (for example the 0 used to fill missing
    values), or that does not look like ``<digits>[.<digits>]<suffix>``, is
    returned unchanged.
    """
    if not isinstance(value, str):
        return value
    text = value.strip()
    for suffix, zeros in (('TR', 12), ('B', 9), ('M', 6), ('k', 3)):
        if text.endswith(suffix):
            number = text[:-len(suffix)]
            break
    else:
        return value
    whole, _, fraction = number.partition('.')
    if not (whole + fraction).isdecimal():
        return value
    # Right-pad the decimals with zeros up to the suffix magnitude.
    return int(whole + fraction[:zeros].ljust(zeros, '0'))


def _clean_gdp_frame(frame):
    """Return a new dataframe with every cell passed through ``_gdp_to_int``."""
    return frame.apply(lambda column: column.map(_gdp_to_int))


# One cleaned dataframe per country group. New frames are built rather than
# assigning through chained indexing (``frame[year][position] = ...``), which
# pandas has deprecated and which previously failed silently behind a blanket
# ``except``.
developed_GdpDataFrame = _clean_gdp_frame(GdpDataFrame.loc[developedCountries])
developing_GdpDataFrame = _clean_gdp_frame(GdpDataFrame.loc[developingCountries])

# Afghanistan has no values for 2000 and 2001. Missing cells are filled with 0
# first so the columns stay integer; the two Afghanistan gaps are then replaced
# by estimates below.
underdeveloped_GdpDataFrame = _clean_gdp_frame(GdpDataFrame.loc[underdevelopedCountries].fillna(0))

# Estimate Afghanistan's 2000 and 2001 GDP by linear back-extrapolation:
# take the average year-over-year change across the observed years (2002
# onwards, i.e. everything after the first two year columns) and subtract it
# from the 2002 value.
afghanistanGdp = underdeveloped_GdpDataFrame.loc['Afghanistan']
observedValues = [afghanistanGdp[year] for year in yearsToCheck[2:]]
valueDelta = [
    int(later) - int(earlier)
    for earlier, later in zip(observedValues, observedValues[1:])
    # Skip pairs containing a cell that could not be parsed (still a string).
    if not isinstance(earlier, str) and not isinstance(later, str)
]
if not valueDelta:
    raise ValueError('No usable year-over-year GDP changes for Afghanistan; cannot estimate 2000/2001.')

averageDelta = round(sum(valueDelta) / len(valueDelta))
gdp2002 = int(afghanistanGdp['2002'])

# Estimates are rounded to the nearest 10 million (round(..., -7)).
underdeveloped_GdpDataFrame.loc['Afghanistan', '2001'] = round(gdp2002 - averageDelta, -7)
# NOTE(review): the previous code subtracted the mean of an undefined
# `valuePlus` list here (a NameError on a fresh kernel). Stepping back one more
# year by the same average change is assumed to be the intent - confirm.
underdeveloped_GdpDataFrame.loc['Afghanistan', '2000'] = round(gdp2002 - 2 * averageDelta, -7)

# Save the cleaned dataframes as csv and json files
developed_GdpDataFrame.to_csv('data/clean_data/gdpUSD/csv/developed_gdp.csv')
developing_GdpDataFrame.to_csv('data/clean_data/gdpUSD/csv/developing_gdp.csv')
underdeveloped_GdpDataFrame.to_csv('data/clean_data/gdpUSD/csv/underdeveloped_gdp.csv')

developed_GdpDataFrame.to_json('data/clean_data/gdpUSD/json/developed_gdp.json', indent = 1)
developing_GdpDataFrame.to_json('data/clean_data/gdpUSD/json/developing_gdp.json', indent = 1)
underdeveloped_GdpDataFrame.to_json('data/clean_data/gdpUSD/json/underdeveloped_gdp.json', indent = 1)

In [7]:
# Load the average daily income per person table, key it by country and keep
# only the 2000-2019 columns.
avgDailyIncPpPath = 'data/raw_data/mincpcap_cppp.csv'
readvgDailyIncCsv = pd.read_csv(avgDailyIncPpPath)
avgDailyIncPpDataFrame = readvgDailyIncCsv.set_index('country').loc[:, yearsToCheck]

# Split the table into the three country groups.
developed_avgDailyIncPpDataFrame = avgDailyIncPpDataFrame.loc[developedCountries, :]
developing_avgDailyIncPpDataFrame = avgDailyIncPpDataFrame.loc[developingCountries, :]
underdeveloped_avgDailyIncPpDataFrame = avgDailyIncPpDataFrame.loc[underdevelopedCountries, :]

# Write every group out as csv first ...
for _frame, _target in (
    (developed_avgDailyIncPpDataFrame, 'data/clean_data/averageIncomePerPerson/csv/developed_average_daily_income.csv'),
    (developing_avgDailyIncPpDataFrame, 'data/clean_data/averageIncomePerPerson/csv/developing_average_daily_income.csv'),
    (underdeveloped_avgDailyIncPpDataFrame, 'data/clean_data/averageIncomePerPerson/csv/underdeveloped_average_daily_income.csv'),
):
    _frame.to_csv(_target)

# ... and then as json.
for _frame, _target in (
    (developed_avgDailyIncPpDataFrame, 'data/clean_data/averageIncomePerPerson/json/developed_average_daily_income.json'),
    (developing_avgDailyIncPpDataFrame, 'data/clean_data/averageIncomePerPerson/json/developing_average_daily_income.json'),
    (underdeveloped_avgDailyIncPpDataFrame, 'data/clean_data/averageIncomePerPerson/json/underdeveloped_average_daily_income.json'),
):
    _frame.to_json(_target, indent = 1)

In [8]:
# Per-capita CO2 consumption; values are in tonnes.
# Load the table, key it by country and keep only the 2000-2019 columns.
co2PerCapPath = 'data/raw_data/co2_pcap_cons.csv'
readco2PerCapCsv = pd.read_csv(co2PerCapPath)
co2PerCapDataFrame = readco2PerCapCsv.set_index('country').loc[:, yearsToCheck]

# Split the table into the three country groups.
developed_co2PerCapDataFrame = co2PerCapDataFrame.loc[developedCountries, :]
developing_co2PerCapDataFrame = co2PerCapDataFrame.loc[developingCountries, :]
underdeveloped_co2PerCapDataFrame = co2PerCapDataFrame.loc[underdevelopedCountries, :]

# Write every group out as csv first ...
for _frame, _target in (
    (developed_co2PerCapDataFrame, 'data/clean_data/co2PerCapitaConsumption/csv/developed_co2_per_capita.csv'),
    (developing_co2PerCapDataFrame, 'data/clean_data/co2PerCapitaConsumption/csv/developing_co2_per_capita.csv'),
    (underdeveloped_co2PerCapDataFrame, 'data/clean_data/co2PerCapitaConsumption/csv/underdeveloped_co2_per_capita.csv'),
):
    _frame.to_csv(_target)

# ... and then as json.
for _frame, _target in (
    (developed_co2PerCapDataFrame, 'data/clean_data/co2PerCapitaConsumption/json/developed_co2_per_capita.json'),
    (developing_co2PerCapDataFrame, 'data/clean_data/co2PerCapitaConsumption/json/developing_co2_per_capita.json'),
    (underdeveloped_co2PerCapDataFrame, 'data/clean_data/co2PerCapitaConsumption/json/underdeveloped_co2_per_capita.json'),
):
    _frame.to_json(_target, indent = 1)

In [9]:
# Total CO2 emissions; values are in million tonnes.
# Load the table, key it by country and keep only the 2000-2019 columns.
co2TotalEmissionsPath = 'data/raw_data/co2_cons.csv'
readco2TotalEmissionCsv = pd.read_csv(co2TotalEmissionsPath)
co2TotalEmissionsDataFrame = readco2TotalEmissionCsv.set_index('country').loc[:, yearsToCheck]

# Split the table into the three country groups.
developed_co2TotalEmissionsDataFrame = co2TotalEmissionsDataFrame.loc[developedCountries, :]
developing_co2TotalEmissionsDataFrame = co2TotalEmissionsDataFrame.loc[developingCountries, :]
underdeveloped_co2TotalEmissionsDataFrame = co2TotalEmissionsDataFrame.loc[underdevelopedCountries, :]

# Write every group out as csv first ...
for _frame, _target in (
    (developed_co2TotalEmissionsDataFrame, 'data/clean_data/co2EmissionsTotal/csv/developed_co2_total_emissions.csv'),
    (developing_co2TotalEmissionsDataFrame, 'data/clean_data/co2EmissionsTotal/csv/developing_co2_total_emissions.csv'),
    (underdeveloped_co2TotalEmissionsDataFrame, 'data/clean_data/co2EmissionsTotal/csv/underdeveloped_co2_total_emissions.csv'),
):
    _frame.to_csv(_target)

# ... and then as json.
for _frame, _target in (
    (developed_co2TotalEmissionsDataFrame, 'data/clean_data/co2EmissionsTotal/json/developed_co2_total_emissions.json'),
    (developing_co2TotalEmissionsDataFrame, 'data/clean_data/co2EmissionsTotal/json/developing_co2_total_emissions.json'),
    (underdeveloped_co2TotalEmissionsDataFrame, 'data/clean_data/co2EmissionsTotal/json/underdeveloped_co2_total_emissions.json'),
):
    _frame.to_json(_target, indent = 1)