In [1]:
import scipy.optimize as optimize
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os


In [2]:
# Datasets

folder_path = r"C:\Users\Alicia BASSIERE\OneDrive - GENES\Documents\Paper 01 - DIPU\Estimation\wind\clusterOnshore"
file_prefix = "cluster_"

csv_files = [file for file in os.listdir(folder_path) if file.startswith(file_prefix)]
dataframes = []
zero_count = 0

for file in csv_files:
    file_path = os.path.join(folder_path, file)
    data = pd.read_csv(file_path, index_col=0)
    data.drop(data.tail(1).index, inplace=True)

    # Remove zeros from the dataframe
    data = data[data != 0]

    # Count the number of zeros removed
    zero_count += (data == 0).sum().sum()

    dataframes.append(data)


In [3]:
def neg_log_likelihood(params, data):
    alpha, beta = params
    return -np.sum(stats.beta.logpdf(data, alpha, beta))

In [4]:
# Create an empty dataframe to store the results
result_df = pd.DataFrame(columns=['Dataset', 'Alpha', 'Beta'])

# Iterate over each dataframe
for idx, data in enumerate(dataframes):
    # Minimization
    bounds = [(0.01, 100), (0.01, 100)] # set bounds for alpha and beta
    result = optimize.minimize(neg_log_likelihood, [0.1, 0.1], args=(data,), bounds=bounds)
    alpha_mle, beta_mle = result.x

    # Create a temporary dataframe with the results
    temp_df = pd.DataFrame({'Dataset': [f'Dataset_{idx}'], 'Alpha': [alpha_mle], 'Beta': [beta_mle]})

    # Concatenate the temporary dataframe with the result dataframe
    result_df = pd.concat([result_df, temp_df], ignore_index=True)


# Print the final dataframe
print(result_df)

       Dataset     Alpha      Beta
0    Dataset_0  1.033401  2.781667
1    Dataset_1  0.100000  0.100000
2    Dataset_2  0.995456  3.884933
3    Dataset_3  0.100000  0.100000
4    Dataset_4  0.100000  0.100000
5    Dataset_5  0.100000  0.100000
6    Dataset_6  1.003867  3.867980
7    Dataset_7  0.100000  0.100000
8    Dataset_8  0.100000  0.100000
9    Dataset_9  0.100000  0.100000
10  Dataset_10  1.044452  1.488646
11  Dataset_11  0.100000  0.100000
12  Dataset_12  0.100000  0.100000
13  Dataset_13  0.100000  0.100000
14  Dataset_14  0.100000  0.100000
15  Dataset_15  0.100000  0.100000
16  Dataset_16  0.100000  0.100000
17  Dataset_17  0.100000  0.100000
18  Dataset_18  0.100000  0.100000
19  Dataset_19  1.019030  1.575608
20  Dataset_20  0.100000  0.100000
21  Dataset_21  0.100000  0.100000
22  Dataset_22  0.100000  0.100000
23  Dataset_23  0.100000  0.100000
24  Dataset_24  1.074297  3.057369
25  Dataset_25  1.000197  5.523226
26  Dataset_26  0.100000  0.100000
27  Dataset_27  1.07

In [5]:
# Create an empty dataframe to store the results
result_df = pd.DataFrame(columns=['Dataset', 'Alpha', 'Beta'])

# Define the folder path to save the PDF files
save_folder = r"C:\Users\Alicia BASSIERE\OneDrive - GENES\Documents\Paper 02 - Mean Field\Estimation\Wind capacity factor"

# Iterate over each dataframe
for idx, data in enumerate(dataframes):
    # Remove zeros from the dataframe
    data = data[data != 0]

    # Define the bounds for alpha and beta
    bounds = [(0.01, 100), (0.01, 100)]

    # Perform differential evolution
    result = optimize.differential_evolution(neg_log_likelihood, bounds, args=(data,))
    alpha_mle, beta_mle = result.x

    # Create a temporary dataframe with the results
    temp_df = pd.DataFrame({'Dataset': [f'Dataset_{idx}'], 'Alpha': [alpha_mle], 'Beta': [beta_mle]})

    # Concatenate the temporary dataframe with the result dataframe
    result_df = pd.concat([result_df, temp_df], ignore_index=True)

    # Plot the estimated distribution
    x = np.linspace(0, 1, 1000)
    y = stats.beta.pdf(x, alpha_mle, beta_mle)

    plt.plot(x, y)
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.title(f'Dataset_{idx} - Estimated Distribution')
    plt.savefig(f'{save_folder}/Dataset_{idx}_distribution.pdf')
    plt.close()

# Print the final dataframe
print(result_df)

       Dataset      Alpha       Beta
0    Dataset_0   1.033403   2.781675
1    Dataset_1  45.722017  96.446003
2    Dataset_2   0.995456   3.884933
3    Dataset_3   5.336834  20.507501
4    Dataset_4   9.526821  11.295595
5    Dataset_5  23.125070  64.143078
6    Dataset_6   1.003867   3.867979
7    Dataset_7  68.797991  57.017934
8    Dataset_8  44.574042  97.569084
9    Dataset_9  22.359783  38.607490
10  Dataset_10   1.044451   1.488645
11  Dataset_11  60.650355  18.945196
12  Dataset_12  76.715590  39.675659
13  Dataset_13  85.818157  42.532584
14  Dataset_14  36.783022  91.914761
15  Dataset_15  79.516837  10.144954
16  Dataset_16  15.499649  35.024739
17  Dataset_17  38.477778  71.577363
18  Dataset_18  75.193557  34.212173
19  Dataset_19   1.019030   1.575609
20  Dataset_20  77.539741  78.136129
21  Dataset_21  11.558684  51.142564
22  Dataset_22  87.145351  74.306470
23  Dataset_23  57.138273  15.685532
24  Dataset_24   1.074297   3.057367
25  Dataset_25   1.000197   5.523224
2