## Data Preprocessing

- Collecting all the data for Jakarta in one DataFrame

In [2]:
import pandas as pd

# One hourly CSV per year, 1980-2023. `header=3` takes the column names from the
# fourth line of each file and skips the lines above it.
# NOTE(review): the coordinates in the file names (-7.2623, 112.7361) do not look like
# Jakarta's (roughly -6.2, 106.8) — confirm that the "Jakarta" label is correct.
csv_prefix = "1980-2023 renewable energy data/ninja_pv_-7.2623_112.7361_"

# Read every year first and concatenate once: calling pd.concat inside the loop
# re-copies the accumulated frame on every iteration.
yearly_frames = [
    pd.read_csv(f"{csv_prefix}{year}.csv", header=3)
    for year in range(1980, 2024)
]

# ignore_index gives the combined frame a unique row index instead of repeating
# each file's own 0..n-1 labels; the 'time' column is not used downstream.
data_jakarta = pd.concat(yearly_frames, ignore_index=True).drop(columns=['time'])
data_jakarta['local_time'] = pd.to_datetime(data_jakarta['local_time'])

# print("Info:\n", data_jakarta.info())
# print("Head:\n", data_jakarta.head())
# print("Tail:\n", data_jakarta.tail())

- Summing up hourly data to turn it into daily data

In [3]:
# Reduce every timestamp to its calendar date, then add up all rows that share a date
# so the frame holds one total per day.
data_jakarta = (
    data_jakarta
    .assign(local_time=data_jakarta['local_time'].dt.date)
    .groupby('local_time')
    .sum()
    .reset_index()
)

data_jakarta.head()

Unnamed: 0,local_time,electricity
0,1980-01-01,2812.179
1,1980-01-02,1942.609
2,1980-01-03,1671.945
3,1980-01-04,4477.104
4,1980-01-05,3781.536


- Calculating the average daily electricity production for each month of each year
- Taking the transformed data and saving it to an Excel file for further analysis

In [10]:
# The daily aggregation left plain date objects in 'local_time'; convert back to
# datetime64 so the .dt accessor is available (a no-op if already converted).
data_jakarta['local_time'] = pd.to_datetime(data_jakarta['local_time'])

# NOTE(review): the year range starts at 1981 although the data (and the ANOVA cells)
# start at 1980 — confirm whether leaving 1980 out of this table is intentional.
years = range(1981, 2024)
months = range(1, 13)

# One grouped pass over the frame instead of a boolean-mask scan per (year, month) pair.
# Result: rows = years, columns = month numbers, values = mean daily 'electricity'.
monthly_means = (
    data_jakarta
    .groupby([
        data_jakarta['local_time'].dt.year.rename('year'),
        data_jakarta['local_time'].dt.month.rename('month'),
    ])['electricity']
    .mean()
    .unstack('month')
    # Keep exactly the requested years/months; combinations without data become NaN
    .reindex(index=years, columns=months)
    # Drop the axis names so the table and the Excel sheet carry no extra header labels
    .rename_axis(index=None, columns=None)
)
monthly_means.to_excel('transformed_data/monthly_daily_means_jakarta.xlsx')

monthly_means.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12
1981,3532.276774,3820.705,5398.487516,4515.1851,4999.026548,6043.605833,5975.734452,7225.32971,5990.8509,5848.291355,3697.957733,3270.727613
1982,4196.32629,4253.705357,4378.09471,5131.017533,6268.580065,6472.317,6820.661903,6968.873806,7006.4077,6286.224129,6089.551467,4072.249258
1983,4314.323,4635.159964,4019.452258,4494.310533,4678.313613,6131.512033,6611.597484,7048.137484,6975.4444,5728.733323,4284.487167,4373.479548
1984,3798.063452,4054.91469,4787.995613,4928.569333,5176.562968,6364.320633,6410.060742,6641.491161,5759.261667,5637.447161,4693.925467,3525.820129
1985,4716.085677,3629.220679,4843.494806,4966.439467,5815.099,5544.415767,5923.945097,6911.108903,6572.106267,6194.059548,4451.898367,4086.839032


## Data Visualization

- Plotting graphs of the average daily electricity production for each month, across the years
- Saving the visualizations

In [19]:
from pathlib import Path

import matplotlib.pyplot as plt

month_to_name = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}

# Build the output folder with pathlib rather than a backslash-separated string:
# backslashes in a normal (non-raw) string are escape characters and only act as
# path separators on Windows. Create the folder so savefig cannot fail on a missing path.
plots_dir = Path('visualizations') / 'daily_avg_plots'
plots_dir.mkdir(parents=True, exist_ok=True)

# One figure per calendar month: that month's mean daily value plotted against the year.
for month in range(1, 13):
    # Separate variable so the loop does not overwrite the notebook-level `name`
    month_name = month_to_name[month]
    # Create the plot with a larger figure size
    plt.figure(figsize=(10, 6))

    # Plotting the data with customizations
    plt.plot(monthly_means[month], label=month_name, color='blue', linestyle='-', linewidth=2, marker='o', markersize=6)

    # Adding grid lines
    plt.grid(True, linestyle='--', alpha=0.6)

    # Customizing the labels and title
    plt.xlabel('Year', fontsize=14, fontweight='bold')
    plt.ylabel('Electricity (kWh)', fontsize=14, fontweight='bold')
    plt.title(f'Average Daily Electricity Production in Jakarta [{month_name}]', fontsize=16, fontweight='bold')

    # Adjusting the ticks
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    # Same fixed y-range on every figure so the months can be compared visually
    plt.ylim(0, 10000)

    # Save the plot
    plt.savefig(plots_dir / f'{month_name}.png')

    # Close the figure so the twelve figures do not accumulate in memory
    plt.close()

## ANOVAs

- Performing ANOVA on the data to check if the means of the data (for a given month) are different for different years
- Storing the results in the `results` folder

In [9]:
from scipy.stats import f_oneway

# data_jakarta['local_time'] = pd.to_datetime(data_jakarta['local_time'])

def anova_test(data):
    """Run a one-way ANOVA across the given groups.

    Parameters
    ----------
    data : iterable of array-like
        One sequence of observations per group; each element is passed to
        ``scipy.stats.f_oneway`` as a separate sample.

    Returns
    -------
    tuple
        ``(f, p)`` — the F statistic and its p-value.
    """
    outcome = f_oneway(*data)
    return outcome.statistic, outcome.pvalue

# Month number -> {'f': F statistic, 'p': p-value} of a one-way ANOVA whose groups
# are the individual years 1980-2023 for that calendar month.
data_anova = {}

year_of_day = data_jakarta['local_time'].dt.year
month_of_day = data_jakarta['local_time'].dt.month

for month in range(1, 13):
    in_month = month_of_day == month
    # One list of daily values per year, restricted to the current month
    data = [
        list(data_jakarta.loc[(year_of_day == year) & in_month, 'electricity'])
        for year in range(1980, 2024)
    ]
    f, p = anova_test(data)
    data_anova[month] = dict(f=f, p=p)

data_anova

{1: {'f': 6.307179688696506, 'p': 4.529831111210825e-31},
 2: {'f': 5.897754777205716, 'p': 4.361069141739825e-28},
 3: {'f': 7.9329495762371645, 'p': 9.45388202481601e-42},
 4: {'f': 7.794974857729912, 'p': 1.1342161381146498e-40},
 5: {'f': 12.356986951165068, 'p': 7.198778386484757e-70},
 6: {'f': 16.364336535588595, 'p': 5.326167953220151e-93},
 7: {'f': 9.713140355740302, 'p': 2.8373397904842e-53},
 8: {'f': 7.342012505623559, 'p': 7.029592102433699e-38},
 9: {'f': 13.363574475054781, 'p': 1.6004997301014666e-75},
 10: {'f': 17.26058976858006, 'p': 8.825476015375355e-99},
 11: {'f': 9.990996172546776, 'p': 9.190732646743703e-55},
 12: {'f': 5.872161998222271, 'p': 3.3134794512915533e-28}}

In [20]:
# Write the per-year ANOVA statistics to a spreadsheet (columns = months, rows = 'f' and 'p')
anova_table = pd.DataFrame(data_anova)
anova_table.to_excel("results/daily/anova_results.xlsx")

- ANOVA tests are performed for a given month in groups of 4 years (1980-1983, 1984-1987 and so on)

In [21]:
# Month number -> {'f': F statistic, 'p': p-value} of a one-way ANOVA whose groups
# are consecutive 4-year blocks (1980-1983, 1984-1987, ...).
data_anova_4 = {}

# Number of consecutive years pooled into one ANOVA group
block_years = 4

for month in range(1, 13):
    # One list of daily values per year, restricted to the current month
    data = [list(data_jakarta[(data_jakarta['local_time'].dt.year == year) & (data_jakarta['local_time'].dt.month == month)]['electricity']) for year in range(1980, 2024)]
    # Pool every run of `block_years` yearly lists into one flat list. Slicing replaces
    # the hand-written data[i]+data[i+1]+... so the block size lives in a single place.
    data = [
        [value for year_values in data[start:start + block_years] for value in year_values]
        for start in range(0, len(data), block_years)
    ]

    f, p = anova_test(data)
    data_anova_4[month] = {'f': f, 'p': p}

data_anova_4

{1: {'f': 6.655992405310937, 'p': 3.7679443613724643e-10},
 2: {'f': 10.49474290427925, 'p': 3.372965947604517e-17},
 3: {'f': 10.841346686523174, 'p': 6.5318855116623494e-18},
 4: {'f': 7.778641337791804, 'p': 3.334788256358022e-12},
 5: {'f': 15.520688509385687, 'p': 1.2997186376373452e-26},
 6: {'f': 11.08669408808854, 'p': 2.415585825622279e-18},
 7: {'f': 5.423825046890683, 'p': 6.449279335218695e-08},
 8: {'f': 8.603779530760464, 'p': 9.595924724844103e-14},
 9: {'f': 9.123674186229115, 'p': 1.0786861327894861e-14},
 10: {'f': 15.059178063491892, 'p': 9.286290765554239e-26},
 11: {'f': 11.074927877858034, 'p': 2.5404589515007754e-18},
 12: {'f': 10.458782411134122, 'p': 3.3768082993751885e-17}}

In [22]:
# Write the 4-year-block ANOVA statistics to a spreadsheet
anova_4_table = pd.DataFrame(data_anova_4)
anova_4_table.to_excel("results/daily/anova_4Yblocks_jakarta.xlsx")

- ANOVA tests are performed for a given month in groups of 11 years (1980-1990, 1991-2001, 2002-2012, 2013-2023)

In [23]:
# Month number -> {'f': F statistic, 'p': p-value} of a one-way ANOVA whose groups
# are consecutive 11-year blocks (1980-1990, 1991-2001, 2002-2012, 2013-2023).
data_anova_11 = {}

# Number of consecutive years pooled into one ANOVA group
block_years = 11

for month in range(1, 13):
    # One list of daily values per year, restricted to the current month
    data = [list(data_jakarta[(data_jakarta['local_time'].dt.year == year) & (data_jakarta['local_time'].dt.month == month)]['electricity']) for year in range(1980, 2024)]
    # Pool every run of `block_years` yearly lists into one flat list. Slicing replaces
    # the eleven hand-written data[i+k] terms so the block size lives in a single place.
    data = [
        [value for year_values in data[start:start + block_years] for value in year_values]
        for start in range(0, len(data), block_years)
    ]

    f, p = anova_test(data)
    data_anova_11[month] = {'f': f, 'p': p}

data_anova_11

{1: {'f': 15.616912163057881, 'p': 5.457110714032792e-10},
 2: {'f': 29.32180548549421, 'p': 2.5731313978669222e-18},
 3: {'f': 18.320834721813984, 'p': 1.1781475267874577e-11},
 4: {'f': 30.258046864452226, 'p': 6.458332623558815e-19},
 5: {'f': 27.81867412759177, 'p': 1.8497844885423474e-17},
 6: {'f': 29.035091417780528, 'p': 3.53324620962379e-18},
 7: {'f': 9.6101835560704, 'p': 2.7979468726573845e-06},
 8: {'f': 3.511806393354974, 'p': 0.014769878377746045},
 9: {'f': 3.0394886473338514, 'p': 0.02811191049725237},
 10: {'f': 6.620760745334331, 'p': 0.0001934561521948307},
 11: {'f': 14.458276737660725, 'p': 2.861564181590436e-09},
 12: {'f': 23.451951448816217, 'p': 8.421429917338259e-15}}

In [24]:
# Write the 11-year-block ANOVA statistics to a spreadsheet
anova_11_table = pd.DataFrame(data_anova_11)
anova_11_table.to_excel("results/daily/anova_11Yblocks_jakarta.xlsx")