# Information Theory (Review 1)
Let $X, Y$ be a two random variables and $P,Q$, two distributions of the same random variabel.

1. $H[X] = E_X[-\log{P_X(X)}] = -\sum_{x} P_X(x) \log{P_X(x)}$

2. $H[P] = E_P[-\log{P}] =  - \sum_{x}P(x)\log{P(x)}$

3. $H[X|Y=y] = - \sum_x P(X=x| Y=y) \log{P(X=x| Y=y)}$

4. $H[X|Y] = E_{X|Y}[-\log{P(X|Y)}] = \sum_y P(Y=y) H[X|Y=y] $

5. $H[X \land Y=y] =-\sum_x P(X=x\land Y=y) \log{P(X=x\land Y=y)} $

6. $H[X\land Y] = \sum_y H[X \land Y=y]$

7. $H[X|Y] = H[X\land Y] - H[Y]$

    proof. 

8. $I(X;Y) = H[X] - H[X|Y] = H[Y] - H[Y|X]=I(Y;X) = H[X] + H[Y] - H[X\land Y]$

9. Cross Entropy: $H[P,Q]= E_P[-\log{Q}] = - \sum_x P(x) \log{Q(x)}$ 

10. KL-divergence: $D_{kl}(P||Q)=H[P,Q] - H[P]$



## References:
1. CS189


In [147]:
import numpy as np
import pandas as pd

# Implementations of Formulas

In [148]:


def entropy(column, measure:str='counting'):
    """
    column is a column in the data_setset, i.e. values of a
    feature in the data_setset
    measure could be "counting" or "leb"
    """
    if measure == 'counting':
        counts = np.bincount(column)
        probabilities = counts / len(column)
        return -np.sum([p * np.log2(p) for p in probabilities if p > 0])


def information_gain(data_set, split_feature, target_feature):
    # Total entropy w.r.t the target_feature before the split
    total_entropy = entropy(data_set[target_feature])

    # Calculate weighted entropy after the split
    values, counts = np.unique(data_set[split_feature], return_counts=True)
    print(values, counts)
    weighted_entropy = np.sum([(counts[i] / np.sum(counts)) *
                               entropy(data_set[data_set[split_feature] == values[i]][target_feature]) # w.r.t target feature,
                               for i in range(len(values))])

    # Information gain
    return total_entropy - weighted_entropy

In [149]:
entropy([1,1,2,2,3,3]),np.log2(3), entropy([1,1,2,3,3,3,3,4,4])
# n_values
# max_entropy = log n_value
# what about the min_entropy?

(1.584962500721156, 1.584962500721156, 1.836591668108979)

In [150]:
S = pd.read_csv('PlayTennis.csv')
data_size = len(S)
print(data_size)
print(S)


14
     Outlook Temperature Humidity    Wind Play Tennis
0      Sunny         Hot     High    Weak          No
1      Sunny         Hot     High  Strong          No
2   Overcast         Hot     High    Weak         Yes
3       Rain        Mild     High    Weak         Yes
4       Rain        Cool   Normal    Weak         Yes
5       Rain        Cool   Normal  Strong          No
6   Overcast        Cool   Normal  Strong         Yes
7      Sunny        Mild     High    Weak          No
8      Sunny        Cool   Normal    Weak         Yes
9       Rain        Mild   Normal    Weak         Yes
10     Sunny        Mild   Normal  Strong         Yes
11  Overcast        Mild     High  Strong         Yes
12  Overcast         Hot   Normal    Weak         Yes
13      Rain        Mild     High  Strong          No


In [151]:
# label encoding the data
from sklearn.preprocessing import LabelEncoder
# encoded_df = S.apply(LabelEncoder().fit_transform)

dict_unique_values = {}
for col in S.columns:
    S[col], unique_values = pd.factorize(S[col])
    dict_unique_values[col] = unique_values

print(dict_unique_values)
S


{'Outlook': Index(['Sunny', 'Overcast', 'Rain'], dtype='object'), 'Temperature': Index(['Hot', 'Mild', 'Cool'], dtype='object'), 'Humidity': Index(['High', 'Normal'], dtype='object'), 'Wind': Index(['Weak', 'Strong'], dtype='object'), 'Play Tennis': Index(['No', 'Yes'], dtype='object')}


Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,0,0,0,0,0
1,0,0,0,1,0
2,1,0,0,0,1
3,2,1,0,0,1
4,2,2,1,0,1
5,2,2,1,1,0
6,1,2,1,1,1
7,0,1,0,0,0
8,0,2,1,0,1
9,2,1,1,0,1


In [152]:
# Calculate Information Gain for each feature
target = 'Play Tennis'  # Specify your target column
for column in S.columns:
        # print()
    print(f"Total Entropy w.r.t \n{column} = {entropy(S[column])}")
    # if column != target:
    gain = information_gain(S, column, target)
    print(f"Information Gain for {column} \nwrt. `{target}`: {gain}\n")


Total Entropy w.r.t 
Outlook = 1.5774062828523454
[0 1 2] [5 4 5]
Information Gain for Outlook 
wrt. `Play Tennis`: 0.24674981977443933

Total Entropy w.r.t 
Temperature = 1.5566567074628228
[0 1 2] [4 6 4]
Information Gain for Temperature 
wrt. `Play Tennis`: 0.02922256565895487

Total Entropy w.r.t 
Humidity = 1.0
[0 1] [7 7]
Information Gain for Humidity 
wrt. `Play Tennis`: 0.15183550136234159

Total Entropy w.r.t 
Wind = 0.9852281360342515
[0 1] [8 6]
Information Gain for Wind 
wrt. `Play Tennis`: 0.04812703040826949

Total Entropy w.r.t 
Play Tennis = 0.9402859586706311
[0 1] [5 9]
Information Gain for Play Tennis 
wrt. `Play Tennis`: 0.9402859586706311



In [None]:
# 1. Loading Dataset
dataset_name = 'lung-cancer.csv'
data_path = f"dataset/{dataset_name}"

raw_data = pd.read_csv(data_path)
raw_data

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,F,56,1,1,1,2,2,2,1,1,2,2,2,2,1,YES
305,M,70,2,1,1,1,1,2,2,2,2,2,2,1,2,YES
306,M,58,2,1,1,1,1,1,2,2,2,2,1,1,2,YES
307,M,67,2,1,2,1,1,2,2,1,2,2,2,1,2,YES


In [154]:
# 2. Preprocessing

from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

# checking missing values
print(raw_data.isna().sum())

# ordinal Encoding of the first column
values, counts= np.unique(raw_data['GENDER'], return_counts=True)
print(values, counts)
ordinal_encoder_gender = OrdinalEncoder(categories=[values])
raw_data['GENDER'] = ordinal_encoder_gender.fit_transform(raw_data[['GENDER']])


values, counts = np.unique(raw_data["LUNG_CANCER"], return_counts=True)

ordinal_encoder_lung_cancer = OrdinalEncoder(categories=[values])
raw_data['LUNG_CANCER'] = ordinal_encoder_lung_cancer.fit_transform(raw_data[['LUNG_CANCER']])

# Seperating the target column
Y = np.array(raw_data[raw_data.columns[-1]])
X = np.array(raw_data.drop([raw_data.columns[-1]], axis=1))

X.shape, Y.shape


GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64
['F' 'M'] [147 162]


((309, 15), (309,))

In [161]:
# splitting the dataset 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2)

print(X_train.shape, X_test.shape)

# normalizing the dataset
min_max_scaler = MinMaxScaler()


X_train_normalized = min_max_scaler.fit_transform(X_train[:, 1:])
X_train_normalized = np.hstack((X_train[:,0].reshape(X_train.shape[0], 1), X_train_normalized))
print(X_train_normalized.shape)

X_test_normalized = min_max_scaler.fit_transform(X_test[:,1:])
X_test_normalized = np.hstack((X_test[:,0].reshape(X_test.shape[0],1), X_test_normalized))
X_test_normalized.shape
# X_test_normalized = min_max_scaler.fit_transform(X_test)
# Y_test_normalized = min_max_scaler.fit_transform(Y_test)

# X_train


(247, 15) (62, 15)
(247, 15)


(62, 15)

In [552]:
from sklearn.tree import DecisionTreeClassifier
params_score = []
for i in range(1,15):
    for j in range(2, 90):
        model_dt = DecisionTreeClassifier(criterion='entropy', 
                                        max_depth=i, 
                                        max_leaf_nodes=j,
                                        max_features='log2')
        model_dt.fit(X_train_normalized, Y_train)
        score = model_dt.score(X_test_normalized, Y_test)
        res ={
            "score": score,
            **model_dt.get_params()
        }
        params_score.append(res)
        
params_score.sort(key=lambda x: x['score'], reverse=True)
params_score

[{'score': 0.9838709677419355,
  'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'entropy',
  'max_depth': 5,
  'max_features': 'log2',
  'max_leaf_nodes': 63,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'monotonic_cst': None,
  'random_state': None,
  'splitter': 'best'},
 {'score': 0.9838709677419355,
  'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'entropy',
  'max_depth': 6,
  'max_features': 'log2',
  'max_leaf_nodes': 37,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'monotonic_cst': None,
  'random_state': None,
  'splitter': 'best'},
 {'score': 0.967741935483871,
  'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'entropy',
  'max_depth': 3,
  'max_features': 'log2',
  'max_leaf_nodes': 28,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0

In [551]:
best_model = DecisionTreeClassifier(criterion='entropy', 
                                        max_depth=3, 
                                        max_leaf_nodes=37,
                                        max_features='log2')
model_dt.fit(X_train_normalized, Y_train)
score = model_dt.score(X_test_normalized, Y_test)
score

0.967741935483871

In [463]:
from sklearn.metrics import make_scorer, f1_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline

import numpy as np

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns


In [495]:

# Create a pipeline to scale data and train SVM
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('DTC', DecisionTreeClassifier())
])

In [496]:
param_grid_DTC = {
    'DTC__max_depth': [i for i in range(1, 15)],
    'DTC__criterion': ['entropy', 'gini', 'log_loss'], 
    'DTC__max_features': ['auto', 'log2', 'sqrt'],
    'DTC__max_leaf_nodes': [i for i in range(20)]+[None]
}

scoring = {
    'f1_score': make_scorer(f1_score, average='weighted'), 
    'accuracy': 'accuracy'
}

In [499]:
# Perform grid search
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_DTC,
    cv=5,                         # Use 5-fold cross-validation
    scoring=scoring,              # Use multiple metrics
    refit='accuracy',             # Optimize the model for F1-score
    # n_jobs=-1                     # Use all available CPU cores
)

# Step 5: Fit the model on the training set (this includes cross-validation)
grid_search.fit(X_train_normalized, Y_train)


5250 fits failed out of a total of 13230.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4410 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/lib/python3/dist-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3/dist-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/lib/python3/dist-packages/sklearn/base.py", line 1467, in wrapper
    estimator._valida

In [500]:
results = grid_search.cv_results_
# print(results)
# Create a DataFrame from cv_results_
df = pd.DataFrame(grid_search.cv_results_)
# print(df.columns)
# print(df['params'])
# print(df['mean_test_accuracy'])
mean_test_accuracy = df[['params', 'mean_test_accuracy']].copy()
# print(mean_test_accuracy.iloc[0]) # print the first row
# print(mean_test_accuracy.iloc[:, 1])  # print the second column
# print(mean_test_accuracy[mean_test_accuracy.columns[1]])  # print the second column


# Save to CSV
df.to_csv("grid_search_results.csv", index=False)

print("Results saved to grid_search_results.csv")
# Sort by mean_test_accuracy and print top 5
# top_results = sorted(zip(results['mean_test_accuracy'], results['params']), reverse=True)[:5]
# for score, params in top_results:
#     print(f"Score {score:.4f}: Params {params}")

mean_test_accuracy_sorted = mean_test_accuracy.sort_values(by="mean_test_accuracy",  ascending=False)
params = df.columns
params
df

Results saved to grid_search_results.csv


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_DTC__criterion,param_DTC__max_depth,param_DTC__max_features,param_DTC__max_leaf_nodes,params,split0_test_f1_score,...,std_test_f1_score,rank_test_f1_score,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
0,0.001841,0.000651,0.000000,0.000000,entropy,1,auto,0,"{'DTC__criterion': 'entropy', 'DTC__max_depth'...",,...,,1597,,,,,,,,1597
1,0.001507,0.000755,0.000000,0.000000,entropy,1,auto,1,"{'DTC__criterion': 'entropy', 'DTC__max_depth'...",,...,,1597,,,,,,,,1597
2,0.000859,0.000192,0.000000,0.000000,entropy,1,auto,2,"{'DTC__criterion': 'entropy', 'DTC__max_depth'...",,...,,1597,,,,,,,,1597
3,0.001028,0.000479,0.000000,0.000000,entropy,1,auto,3,"{'DTC__criterion': 'entropy', 'DTC__max_depth'...",,...,,1597,,,,,,,,1597
4,0.000640,0.000042,0.000000,0.000000,entropy,1,auto,4,"{'DTC__criterion': 'entropy', 'DTC__max_depth'...",,...,,1597,,,,,,,,1597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2641,0.001599,0.000341,0.002463,0.000480,log_loss,14,sqrt,16,"{'DTC__criterion': 'log_loss', 'DTC__max_depth...",0.848095,...,0.027132,1059,0.84,0.80,0.836735,0.795918,0.857143,0.825959,0.023925,1591
2642,0.001572,0.000242,0.002675,0.000483,log_loss,14,sqrt,17,"{'DTC__criterion': 'log_loss', 'DTC__max_depth...",0.828485,...,0.020251,443,0.84,0.86,0.857143,0.877551,0.857143,0.858367,0.011925,1272
2643,0.001531,0.000483,0.002370,0.000372,log_loss,14,sqrt,18,"{'DTC__criterion': 'log_loss', 'DTC__max_depth...",0.811556,...,0.011154,1169,0.84,0.80,0.857143,0.836735,0.857143,0.838204,0.020892,1569
2644,0.001789,0.000777,0.002669,0.000760,log_loss,14,sqrt,19,"{'DTC__criterion': 'log_loss', 'DTC__max_depth...",0.843269,...,0.031791,322,0.86,0.84,0.918367,0.897959,0.836735,0.870612,0.032329,459


In [501]:
print("Best Parameters from Grid Search:", grid_search.best_params_)
print("Best score from Grid Search:", grid_search.best_score_)
print("Best estimator from Grid Search:",  grid_search.best_estimator_)

Best Parameters from Grid Search: {'DTC__criterion': 'entropy', 'DTC__max_depth': 10, 'DTC__max_features': 'log2', 'DTC__max_leaf_nodes': 11}
Best score from Grid Search: 0.9071836734693877
Best estimator from Grid Search: Pipeline(steps=[('scaler', MinMaxScaler()),
                ('DTC',
                 DecisionTreeClassifier(criterion='entropy', max_depth=10,
                                        max_features='log2',
                                        max_leaf_nodes=11))])
