# Statistical study of the subset sum problem
## Metaheuristic Problem Solving | MUIIA

### Imports
---

In [253]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

### READ & PREPROCESS DATAFRAME
---

In [254]:
# Run selection: which experiment folder the rest of the notebook reads from.
s = 100               # instance size; selects the z{s} folder
case = 2              # case number; selects the case{case} folder
prob_type = "high"    # high | medium | low

# Folder holding result.csv and the executions/ subfolder for this selection.
pathFiles = "./../src/case{}/z{}/{}prob".format(case, s, prob_type)

In [255]:
def read_csv_result(path):
    """Load the file ``result.csv`` located directly under ``path``.

    Parameters
    ----------
    path : str or path-like
        Folder that contains ``result.csv``.

    Returns
    -------
    pandas.DataFrame
        The file parsed by ``pandas.read_csv`` with default options.
    """
    csv_location = "{}/result.csv".format(path)
    return pd.read_csv(csv_location)

def read_and_combine_exec_csv(pathFolder, numberFiles):
    """Read ``result1.csv`` .. ``result<numberFiles>.csv`` and stack them.

    Parameters
    ----------
    pathFolder : str or path-like
        Folder that contains the numbered ``result<k>.csv`` files.
    numberFiles : int
        How many files to read; numbering starts at 1.

    Returns
    -------
    pandas.DataFrame
        All files concatenated row-wise, in file order, with a fresh
        0..n-1 index (``ignore_index=True``).
    """
    frames = []
    for file_no in range(1, numberFiles + 1):
        frames.append(pd.read_csv(f'{pathFolder}/result{file_no}.csv'))
    return pd.concat(frames, ignore_index=True)

In [256]:
""" READ & PREPROCESS DATAFRAME """
# Load the summary table stored as result.csv under pathFiles.
df = read_csv_result(pathFiles)

# Load result1.csv .. result30.csv from the "executions" subfolder and
# stack them row-wise into one frame (this combines 30 files, not two).
dff = read_and_combine_exec_csv(pathFiles+"/executions", 30)



In [257]:
""" VARIABLES GA """
# Reference value C for each supported instance size `s`.
# C is what bestf is compared against to flag an execution as optimal, and it
# is also the upper bound used when rescaling the fitness columns.
_C_BY_SIZE = {
    100: 3677.0,
    1000: 366097.0,
    10000: 3.7957325E7,
}

# Fail loudly on an unsupported size instead of silently continuing with C = 0,
# which would make the optimal flag never match and distort the normalization.
if s not in _C_BY_SIZE:
    raise ValueError(
        "No reference value C defined for s={!r}; expected one of {}".format(
            s, sorted(_C_BY_SIZE)
        )
    )

C = _C_BY_SIZE[s]

# C = C_z10000

In [258]:
""" SET OPTIMAL COLUMN """
# 1 where the raw (not yet normalized) best fitness equals C, 0 otherwise.
df["optimal"] = df["bestf"].eq(C).astype(int)

In [259]:
""" NORMALIZE DF """
# Bounds used to rescale df: the top of the scale is C, the bottom is the
# smallest value found in any of the three fitness columns.
max_bound = C
min_bound = min(df[name].min() for name in ("bestf", "worstf", "avgf"))

# Method 1
def normalize(df, column, min, max):
    """Linearly rescale one column so that ``min`` maps to 0 and ``max`` to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is not modified.
    column : str
        Name of the column to rescale.
    min, max : number
        Values mapped to 0 and 1 respectively. These parameter names shadow
        the builtins inside this function only.

    Returns
    -------
    pandas.Series
        ``(df[column] - min) / (max - min)``.
    """
    offset_from_floor = df[column] - min
    value_range = max - min
    return offset_from_floor / value_range

# Normalize df: overwrite each fitness column with its rescaled version,
# using the same pair of bounds for all three columns.
# NOTE: the raw values are replaced, so re-running this cell rescales data
# that has already been normalized.
for fitness_col in ("bestf", "worstf", "avgf"):
    df[fitness_col] = normalize(df, fitness_col, min_bound, max_bound)

# Method 2

# columns_to_normalize = ['bestf', 'worstf', 'avgf']
# df[columns_to_normalize] = MinMaxScaler(feature_range=(0, 1)).fit_transform(df[columns_to_normalize])

In [260]:
""" NORMALIZE DFF """
# Bounds are recomputed from dff itself: the top of the scale is C, the bottom
# is the smallest value found in any of dff's three fitness columns.
max_bound = C
min_bound = min(dff[name].min() for name in ("bestf", "worstf", "avgf"))

# Normalize dff: overwrite each fitness column with its rescaled version.
for fitness_col in ("bestf", "worstf", "avgf"):
    dff[fitness_col] = normalize(dff, fitness_col, min_bound, max_bound)

In [261]:
# Boolean mask over df: True for rows whose "optimal" flag is 1.
optimalIndx = df["optimal"].eq(1)

# Split the summary frame with the mask and with its complement.
optimalRes, notOptimalRes = df.loc[optimalIndx], df.loc[~optimalIndx]

# Select from the combined frame the rows whose "exec" value appears in each
# of the two summary partitions.
optimalResFull = dff.loc[dff["exec"].isin(optimalRes["exec"])]
notOptimalResFull = dff.loc[dff["exec"].isin(notOptimalRes["exec"])]

In [262]:
# Headline figures: the fitness means come from the combined frame dff,
# the duration mean and the optimal count from the summary frame df.
avg_f_mean = round(dff["avgf"].mean(), 4)
avg_w_mean = round(dff["worstf"].mean(), 4)
optimal_n = round(df.loc[df["optimal"] == 1, "exec"].count(), 4)
duration_mean = round(df["duration"].mean(), 2)

# The empty first and last entries reproduce the blank lines that frame the report.
report_lines = [
    "",
    f"Avg f mean      - {avg_f_mean}",
    f"Avg worst mean  - {avg_w_mean}",
    f"Optimal n       - {optimal_n}",
    f"Duration mean   - {duration_mean}",
    "",
]
print("\n".join(report_lines))


Avg f mean      - 0.9381
Avg worst mean  - 0.711
Optimal n       - 2
Duration mean   - 35233.4



In [263]:
# Descriptive statistics of df; at this point bestf, worstf and avgf hold the
# rescaled values, not the raw fitness read from result.csv.
df.describe()

Unnamed: 0,exec,bestf,worstf,avgf,endstep,duration,optimal
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,15.5,0.999994,0.757284,0.958458,57732.7,35233.4,0.066667
std,8.803408,6e-06,0.260583,0.049271,9595.977825,6761.904305,0.253708
min,1.0,0.999971,0.0,0.697739,10001.0,2034.0,0.0
25%,8.25,0.999992,0.749888,0.967557,60000.0,36316.0,0.0
50%,15.5,0.999996,0.82393,0.967768,60000.0,36503.5,0.0
75%,22.75,0.999998,0.899043,0.967849,60000.0,36856.0,0.0
max,30.0,1.0,0.940826,0.968241,60000.0,40238.0,1.0


### PLOTS
---

#### BAR PLOTS

In [264]:
# df.iloc[1:5,:].head()

In [265]:
# sns.set_theme(style="darkgrid")

# f, ax = plt.subplots(figsize=(15, 5))
# sns.despine(f)

# sns.barplot(x="exec", y="bestf",data=df, hue="optimal")

# # Add titles to the plot
# plt.title("Comparison of Best Fitness")
# plt.xlabel("Execution")
# plt.ylabel("Best Fitness")
# plt.legend()

# # Show plot
# plt.show()

In [266]:
# sns.set_theme(style="darkgrid")

# f, ax = plt.subplots(figsize=(15, 5))
# sns.despine(f)

# sns.barplot(x="exec", y="avgf",data=df, hue="optimal")

# # Add titles to the plot
# plt.title("Comparison of Average Fitness")
# plt.xlabel("Execution")
# plt.ylabel("Average Fitness")
# plt.legend()

# # Show plot
# plt.show()

In [267]:
# sns.set_theme(style="darkgrid")

# f, ax = plt.subplots(figsize=(15, 5))
# sns.despine(f)

# sns.barplot(x="exec", y="worstf",data=df, hue="optimal")

# # Add titles to the plot
# plt.title("Comparison of Worst Fitness")
# plt.xlabel("Execution")
# plt.ylabel("Worst Fitness")
# plt.legend()

# # Show plot
# plt.show()

#### HIST PLOTS

In [268]:
# f, ax = plt.subplots(figsize=(20, 10))
# sns.despine(f)

# # Box plot of the worst fitness for each execution
# sns.boxplot(x="exec", y="worstf", data=dff)
# sns.despine(offset=10, trim=True)

In [269]:
# f, ax = plt.subplots(figsize=(20, 10))
# sns.despine(f)

# # Box plot of the average fitness for each execution
# sns.boxplot(x="exec", y="avgf", data=dff)
# sns.despine(offset=10)

In [270]:
# f, ax = plt.subplots(figsize=(20, 10))
# sns.despine(f)

# # Line plot of the best fitness over steps, one line per execution
# sns.lineplot(x="step", y="bestf", data=dff, hue="exec")
# sns.despine(offset=10)