In [1]:
import pandas as pd
import numpy as np
from math import log2

In [3]:


# Define the dataset
data = {
    'Week of the month': [1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 1],
    'Day of the week': [4, 5, 6, 2, 3, 4, 5, 6, 2, 3, 4, 5, 6, 2, 3, 4, 5, 6, 2, 3],
    'Non-urgent order': [316.307, 128.633, 43.651, 171.297, 90.532, 110.925, 144.124, 119.379, 218.856, 146.518, 178.433, 145.865, 170.566, 220.343, 193.768, 122.736, 144.051, 105.415, 240.66, 131.067],
    'Urgent order': [223.27, 96.042, 84.375, 127.667, 113.526, 96.36, 118.919, 113.87, 124.381, 101.045, 102.793, 91.18, 114.412, 141.406, 141.854, 124.256, 158.408, 108.688, 163.72, 166.649],
    'Order type A': [61.543, 38.058, 21.826, 41.542, 37.679, 30.792, 43.304, 38.584, 33.973, 36.399, 45.706, 43.851, 43.339, 46.241, 56.519, 56.167, 51.66, 47.717, 59.135, 90.476],
    'Order type B': [175.586, 56.037, 25.125, 113.294, 56.618, 50.704, 66.371, 85.961, 148.274, 43.306, 111.036, 66.277, 136.434, 120.865, 136.709, 78.101, 92.272, 71.474, 157.681, 80.509],
    'Order type C': [302.448, 130.58, 82.461, 162.284, 116.22, 125.868, 153.368, 124.413, 162.044, 168.723, 124.678, 133.44, 128.405, 196.296, 143.644, 112.724, 164.948, 113.935, 187.564, 127.575],
    'Fiscal sector orders': [0, 0, 1.386, 18.156, 6.459, 79, 0, 15.709, 1.054, 865, 194, 6.523, 23.2, 1.653, 1.25, 0, 6.421, 19.023, 0, 844],
    'Orders from the traffic controller sector': [65556, 40419, 11992, 49971, 48534, 52042, 46573, 35033, 66612, 58224, 47046, 66910, 32529, 34878, 57858, 52321, 47167, 42737, 39273, 60543],
    'Banking orders (1)': [44914, 21399, 3452, 33703, 19646, 8773, 33597, 26278, 19461, 7742, 17299, 17768, 34002, 32905, 23956, 10046, 6440, 26020, 32917, 19141],
    'Banking orders (2)': [188411, 89461, 21305, 69054, 16411, 47522, 48269, 56665, 103376, 82395, 108719, 36693, 78153, 117137, 101048, 62799, 91784, 27873, 155617, 78378],
    'Banking orders (3)': [14793, 7679, 14947, 18423, 20257, 24966, 20973, 18502, 10458, 11948, 15560, 29046, 31949, 29188, 30134, 24233, 15973, 17600, 9203, 73839],
    'Target (Total orders)': [539.577, 224.675, 129.412, 317.12, 210.517, 207.364, 263.043, 248.958, 344.291, 248.428, 281.42, 243.568, 308.178, 363.402, 336.872, 246.992, 308.88, 233.126, 404.38, 298.56]
}

# Create a DataFrame from the dataset
df = pd.DataFrame(data)
print(df)



    Week of the month  Day of the week  Non-urgent order  Urgent order  \
0                   1                4           316.307       223.270   
1                   1                5           128.633        96.042   
2                   1                6            43.651        84.375   
3                   2                2           171.297       127.667   
4                   2                3            90.532       113.526   
5                   2                4           110.925        96.360   
6                   2                5           144.124       118.919   
7                   2                6           119.379       113.870   
8                   3                2           218.856       124.381   
9                   3                3           146.518       101.045   
10                  3                4           178.433       102.793   
11                  3                5           145.865        91.180   
12                  3                6

In [4]:


# Calculate the entropy of a given column
def calculate_entropy(column):
    counts = column.value_counts()
    probabilities = counts / len(column)
    entropy = -sum(probabilities * np.log2(probabilities))
    return entropy

# Calculate the information gain of an attribute
def calculate_information_gain(data, attribute, target):
    attribute_entropy = 0
    values = data[attribute].unique()
    
    for value in values:
        subset = data[data[attribute] == value]
        subset_entropy = calculate_entropy(subset[target])
        weight = len(subset) / len(data)
        attribute_entropy += weight * subset_entropy
    
    target_entropy = calculate_entropy(data[target])
    information_gain = target_entropy - attribute_entropy
    return information_gain

# Find the attribute with the highest information gain
target_attribute = 'Target (Total orders)'
attributes = df.columns[:-1]
information_gains = {}

for attribute in attributes:
    information_gain = calculate_information_gain(df, attribute, target_attribute)
    information_gains[attribute] = information_gain

highest_gain_attribute = max(information_gains, key=information_gains.get)
highest_gain_value = information_gains[highest_gain_attribute]



# Print the attribute with the highest information gain
print(f"\nAttribute with the highest information gain: {highest_gain_attribute}")
print(f"Highest information gain value: {highest_gain_value:.4f}")



Attribute with the highest information gain: Non-urgent order
Highest information gain value: 4.3219


In [12]:


# Calculate the consistency score
consistency_score = (df['Non-urgent order']) / df['Target (Total orders)']
for i, score in enumerate(consistency_score):
    print(f"Data point {i}: Consistency Score = {score:.4f}")
average_consistency_score = consistency_score.mean()

print("Consistency Score:", average_consistency_score)


Data point 0: Consistency Score = 0.5862
Data point 1: Consistency Score = 0.5725
Data point 2: Consistency Score = 0.3373
Data point 3: Consistency Score = 0.5402
Data point 4: Consistency Score = 0.4300
Data point 5: Consistency Score = 0.5349
Data point 6: Consistency Score = 0.5479
Data point 7: Consistency Score = 0.4795
Data point 8: Consistency Score = 0.6357
Data point 9: Consistency Score = 0.5898
Data point 10: Consistency Score = 0.6340
Data point 11: Consistency Score = 0.5989
Data point 12: Consistency Score = 0.5535
Data point 13: Consistency Score = 0.6063
Data point 14: Consistency Score = 0.5752
Data point 15: Consistency Score = 0.4969
Data point 16: Consistency Score = 0.4664
Data point 17: Consistency Score = 0.4522
Data point 18: Consistency Score = 0.5951
Data point 19: Consistency Score = 0.4390
Consistency Score: 0.5335785615524851


In [16]:


# Apply min-max scaling to normalize the data
normalized_df = (df - df.min()) / (df.max() - df.min())

# Calculate the consistency score
consistency_score = normalized_df['Non-urgent order'] / normalized_df['Target (Total orders)']
for i, score in enumerate(consistency_score):
    print(f"Data point {i}: Consistency Score = {score:.4f}")
    average_consistency_score = consistency_score.mean()

print("Consistency Score:", average_consistency_score)


Data point 0: Consistency Score = 1.0000
Data point 1: Consistency Score = 1.3420
Data point 2: Consistency Score = nan
Data point 3: Consistency Score = 1.0230
Data point 4: Consistency Score = 0.8695
Data point 5: Consistency Score = 1.2983
Data point 6: Consistency Score = 1.1311
Data point 7: Consistency Score = 0.9529
Data point 8: Consistency Score = 1.2266
Data point 9: Consistency Score = 1.3002
Data point 10: Consistency Score = 1.3339
Data point 11: Consistency Score = 1.3470
Data point 12: Consistency Score = 1.0680
Data point 13: Consistency Score = 1.1360
Data point 14: Consistency Score = 1.0885
Data point 15: Consistency Score = 1.0118
Data point 16: Consistency Score = 0.8416
Data point 17: Consistency Score = 0.8959
Data point 18: Consistency Score = 1.0778
Data point 19: Consistency Score = 0.7774
Consistency Score: 1.0905995149880656
