In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the BMI dataset from a CSV file addressed relative to the working directory.
data = pd.read_csv(filepath_or_buffer='BMIdata.csv')
# Bare trailing expression so the notebook renders the frame as the cell output.
data

Unnamed: 0,Gender,Height,Weight,Index
0,Male,174,96,4
1,Male,189,87,2
2,Female,185,110,4
3,Female,195,104,3
4,Male,149,61,3
...,...,...,...,...
495,Female,150,153,5
496,Female,184,121,4
497,Female,141,136,5
498,Male,150,95,5


In [43]:
# Derive the binary target: rows whose `Index` is 4 or higher get obese = 1,
# all other rows get 0; the source `Index` column is then removed.
#
# The guard keeps the cell idempotent. The original dropped `Index` in place,
# so executing the cell a second time failed because the column was gone.
if 'Index' in data.columns:
  data = (
    data
    .assign(obese=(data['Index'] >= 4).astype('int'))
    .drop(columns='Index')
  )
data


Unnamed: 0,Gender,Height,Weight,obese
0,Male,174,96,1
1,Male,189,87,0
2,Female,185,110,1
3,Female,195,104,0
4,Male,149,61,0
...,...,...,...,...
495,Female,150,153,1
496,Female,184,121,1
497,Female,141,136,1
498,Male,150,95,1


In [44]:
# For each candidate weight cut-off, count the rows at or above the cut-off
# whose `obese` flag is 0.
print(
  " Misclassified when cutting at 100kg:",
  ((data['Weight'] >= 100) & (data['obese'] == 0)).sum(), "\n",
  "Misclassified when cutting at 80kg:",
  ((data['Weight'] >= 80) & (data['obese'] == 0)).sum()
)


 Misclassified when cutting at 100kg: 18 
 Misclassified when cutting at 80kg: 63


In [45]:

#Menghitung Entropy

def entropy(y):
  '''
  Compute the Shannon entropy (in bits) of the values in a Pandas Series.

  y: Pandas Series whose value distribution is measured.

  Returns the entropy as a float. A small constant (1e-9) is added inside
  the logarithm, so the result is marginally below the exact value.

  Raises ValueError if y is not a Pandas Series.
  '''
  if not isinstance(y, pd.Series):
    # `raise` needs an exception instance: raising a bare string fails with an
    # unrelated "exceptions must derive from BaseException" TypeError and the
    # intended message is never shown.
    raise ValueError('Object must be a Pandas Series.')

  proportions = y.value_counts()/y.shape[0]
  return np.sum(-proportions*np.log2(proportions+1e-9))

# Entropy of the Gender column, shown as the cell output.
entropy(data['Gender'])


0.9997114388674198

In [46]:

# Menghitung Index Gini

def gini_impurity(y):
  '''
  Compute the Gini impurity of the values in a Pandas Series.

  y: Pandas Series whose value distribution is measured.

  Returns 1 minus the sum of the squared value proportions.

  Raises ValueError if y is not a Pandas Series.
  '''
  if not isinstance(y, pd.Series):
    # `raise` needs an exception instance: raising a bare string fails with an
    # unrelated "exceptions must derive from BaseException" TypeError and the
    # intended message is never shown.
    raise ValueError('Object must be a Pandas Series.')

  proportions = y.value_counts()/y.shape[0]
  return 1-np.sum(proportions**2)

# Gini impurity of the Gender column, shown as the cell output.
gini_impurity(data['Gender'])


0.4998

In [47]:
def gini_index(y):
    '''
    Gini index of a Pandas Series: 1 minus the sum of squared value shares.

    y: Pandas Series whose value distribution is measured.

    Raises ValueError if y is not a Pandas Series.
    '''
    # Guard clause first so the computation below is not nested
    if not isinstance(y, pd.Series):
        raise ValueError('Object must be a Pandas Series.')

    # Share of rows taken by each distinct value
    class_shares = y.value_counts() / y.shape[0]
    squared_total = np.sum(class_shares ** 2)
    return 1 - squared_total

def information_gain(y, mask, func=gini_index):
    '''
    Impurity reduction obtained by splitting y with a boolean mask.

    y: target variable as a Pandas Series.
    mask: boolean mask (Pandas Series or NumPy array) selecting the rows of y
        that go to one child node; the remaining rows go to the other.
    func: impurity function used only when y has object dtype. For any other
        dtype the Gini index is used regardless of this argument.

    Returns the impurity of y minus the size-weighted impurities of the two
    children, or 0 when the mask leaves one side empty.
    '''
    # Vectorised count of True entries; the builtin sum() walks the mask one
    # element at a time in Python.
    a = mask.sum()
    b = mask.shape[0] - a

    # A split that leaves one side empty separates nothing
    if a == 0 or b == 0:
        return 0

    # Non-object targets always use the Gini index; object targets use `func`
    impurity = gini_index if y.dtypes != 'O' else func

    # `~` is the boolean complement. Unary minus on a boolean mask is rejected
    # by NumPy and only works for a Series through pandas special-casing.
    return impurity(y) - a / (a + b) * impurity(y[mask]) - b / (a + b) * impurity(y[~mask])


# Gain of splitting the obese flag at Weight >= 100, shown as the cell output.
information_gain(y=data['obese'], mask=data['Weight'] >= 100)



0.20307970798360334

In [48]:

# Menentukan Information Gain

def variance(y):
  '''
  Variance of a Pandas Series that returns 0 instead of NaN for tiny inputs.

  y: variable to calculate the variance of. It should be a Pandas Series.

  Returns 0 when y holds fewer than two observations, otherwise y.var().
  '''
  # y.var() is NaN for a single observation and also for an empty series;
  # the original only covered the single-observation case.
  if len(y) <= 1:
    return 0
  return y.var()

def information_gain(y, mask, func=entropy):
  '''
  Gain obtained by splitting y with a boolean mask.

  y: target variable as a Pandas Series.
  mask: boolean mask (Pandas Series or NumPy array) selecting the rows of y
    that go to one child node; the remaining rows go to the other.
  func: impurity function used when y has object dtype. For any other dtype
    the variance helper is used instead and this argument is ignored.

  Returns the measure on y minus the size-weighted measures of the two
  children, or 0 when the mask leaves one side empty.
  '''
  # Vectorised count of True entries; the builtin sum() walks the mask one
  # element at a time in Python.
  a = mask.sum()
  b = mask.shape[0] - a

  # A split that leaves one side empty separates nothing
  if a == 0 or b == 0:
    return 0

  # `~` is the boolean complement. Unary minus on a boolean mask is rejected
  # by NumPy and only works for a Series through pandas special-casing.
  if y.dtypes != 'O':
    return variance(y) - (a/(a+b)*variance(y[mask])) - (b/(a+b)*variance(y[~mask]))

  return func(y) - a/(a+b)*func(y[mask]) - b/(a+b)*func(y[~mask])

# Gain of splitting the obese flag at Weight >= 100, shown as the cell output.
information_gain(y=data['obese'], mask=data['Weight'] >= 100)


0.10145545873057096

In [49]:

# Menentukan Pemisahan Terbaik dari Sebuah Variabel

import itertools

def categorical_options(a):
  '''
  List every proper, non-empty subset of the distinct values of a Series.

  a: Pandas Series from where to get all possible combinations.

  Returns a list of lists ordered by subset size; the empty subset and the
  subset holding every distinct value are left out.
  '''
  categories = a.unique()

  # Sizes 1 .. n-1 skip the empty subset (size 0) and the full subset (size n)
  return [
    list(combo)
    for size in range(1, len(categories))
    for combo in itertools.combinations(categories, size)
  ]

def max_information_gain_split(x, y, func=entropy):
  '''
  Search one predictor for the split with the largest information gain.

  x: predictor variable as Pandas Series.
  y: target variable as Pandas Series.
  func: function forwarded to information_gain for every candidate split.

  Returns a 4-tuple (best_ig, best_split, is_numeric, found):
    best_ig: highest gain among the candidate splits.
    best_split: threshold (numeric predictor) or list of categories
      (object predictor) that produced it; the first one wins on ties.
    is_numeric: True when x does not have object dtype.
    found: True when at least one candidate split existed.
  With no candidate split the tuple is (None, None, None, False).
  '''
  # Any dtype other than object is handled as numeric
  is_numeric = bool(x.dtypes != 'O')

  if is_numeric:
    # Every distinct value except the smallest is a threshold for "x < value"
    candidates = x.sort_values().unique()[1:]
    masks = (x < threshold for threshold in candidates)
  else:
    # Each proper subset of categories is a candidate for "x in subset"
    candidates = categorical_options(x)
    masks = (x.isin(group) for group in candidates)

  gains = [information_gain(y, candidate_mask, func) for candidate_mask in masks]
  splits = list(candidates)

  # Nothing to split on
  if not gains:
    return (None, None, None, False)

  best_gain = max(gains)
  return (best_gain, splits[gains.index(best_gain)], is_numeric, True)

# Best split of the obese flag on Weight; the last two tuple entries
# (variable type and success flag) are not needed here.
weight_ig, weight_split, _, _ = max_information_gain_split(x=data['Weight'], y=data['obese'])

print(
  "The best split for Weight is when the variable is less than ",
  weight_split,
  "\nInformation Gain for that split is:",
  weight_ig
)


The best split for Weight is when the variable is less than  103 
Information Gain for that split is: 0.10625190497954967


In [50]:

# Cara Memilih Pemisahan Terbaik

# Run the best-split search on every predictor column against the obese flag.
data.drop(columns='obese').apply(max_information_gain_split, y=data['obese'])
# Determining the tree depth

def get_best_split(y, data):
  '''
  Pick, across all predictor columns, the split with the highest information gain.

  y: name of the target variable
  data: dataframe where to find the best split.

  Returns a 4-tuple (variable, value, information gain, is_numeric), or
  (None, None, None, None) when no column can be split.
  '''
  # One column per predictor; rows 0..3 hold (gain, split value, numeric flag, found flag)
  candidates = data.drop(columns=y).apply(max_information_gain_split, y=data[y])
  can_split = candidates.loc[3, :]

  if sum(can_split) == 0:
    return (None, None, None, None)

  # Keep only the predictors for which a split was found
  candidates = candidates.loc[:, can_split]

  # Predictor whose best gain (row 0) is the largest
  split_variable = candidates.iloc[0].astype(np.float32).idxmax()
  best = candidates[split_variable]
  split_ig, split_value, split_numeric = best[0], best[1], best[2]

  return (split_variable, split_value, split_ig, split_numeric)


def make_split(variable, value, data, is_numeric):
  '''
  Partition a dataframe in two according to a split condition.

  variable: name of the column used for the split.
  value: threshold (numeric split) or collection of categories (categorical split).
  data: dataframe to be partitioned.
  is_numeric: truthy when the split column is numeric.

  Returns (data_1, data_2): data_1 holds the rows where the column is
  strictly below the threshold (numeric) or is one of the categories
  (categorical); data_2 holds every other row.
  '''
  column = data[variable]
  meets_condition = column < value if is_numeric else column.isin(value)

  return (data[meets_condition], data[~meets_condition])

def make_prediction(data, target_factor):
  '''
  Turn the target values of a node into a single prediction.

  data: pandas series for target variable
  target_factor: truthy when the target is a factor (categorical)

  Returns the most frequent value for a factor target, otherwise the mean.
  '''
  if not target_factor:
    return data.mean()

  # Most frequent value of the node
  return data.value_counts().idxmax()


In [51]:

# Melatih Decision Tree dengan Python

def train_tree(data,y, target_factor, max_depth = None,min_samples_split = None, min_information_gain = 1e-20, counter=0, max_categories = 20):
  '''
  Recursively train a decision tree.

  data: dataframe used to train the tree.
  y: target variable column name.
  target_factor: boolean, True when the target is a factor rather than numeric.
  max_depth: maximum depth to stop splitting (None means no limit).
  min_samples_split: a node is split only when it has more rows than this
    (None means no limit).
  min_information_gain: minimum gain for a split to be accepted.
  counter: current depth; callers leave it at 0 and the recursion increments it.
  max_categories: maximum number of distinct values accepted in an object
    column; checked on the root call only.

  Returns a nested dict {question: [yes_subtree, no_subtree]}, or a bare
  prediction when the node is not split or both branches give the same answer.

  Raises ValueError when an object column exceeds max_categories.
  '''
  # Validate categorical cardinality once, on the root call
  if counter == 0:
    column_types = data.dtypes
    for column in column_types[column_types == "object"].index:
      var_length = len(data[column].value_counts())
      if var_length > max_categories:
        raise ValueError('The variable ' + column + ' has '+ str(var_length) + ' unique values, which is more than the accepted ones: ' +  str(max_categories))

  # Stopping rules: a missing limit always passes
  depth_ok = max_depth is None or counter < max_depth
  samples_ok = min_samples_split is None or data.shape[0] > min_samples_split

  if not (depth_ok and samples_ok):
    return make_prediction(data[y], target_factor)

  var, val, ig, var_type = get_best_split(y, data)

  # No usable split, or the gain is not at least the minimum: emit a leaf
  if ig is None or not (ig >= min_information_gain):
    return make_prediction(data[y], target_factor)

  counter += 1
  left, right = make_split(var, val, data, var_type)

  # NOTE(review): the numeric label reads "<=" although make_split partitions
  # with a strict "<". The text is left unchanged because classifier_data
  # parses this exact format.
  split_type = "<=" if var_type else "in"
  question = "{} {}  {}".format(var,split_type,val)

  # Grow both branches one level deeper
  yes_answer = train_tree(left,y, target_factor, max_depth,min_samples_split,min_information_gain, counter)
  no_answer = train_tree(right,y, target_factor, max_depth,min_samples_split,min_information_gain, counter)

  # Identical branches make the question redundant, so collapse the node
  if yes_answer == no_answer:
    return yes_answer

  return {question: [yes_answer, no_answer]}

# Stopping parameters for the demonstration tree
max_depth = 5
min_samples_split = 20
min_information_gain = 1e-5

# Train on the whole frame with the obese flag treated as a factor target
decision = train_tree(
  data, 'obese', True,
  max_depth=max_depth,
  min_samples_split=min_samples_split,
  min_information_gain=min_information_gain,
)
decision


{'Weight <=  103': [{'Height <=  175': [{'Weight <=  74': [{'Height <=  148': [1,
        0]},
      {'Height <=  162': [1, {'Weight <=  82': [0, 1]}]}]},
    0]},
  {'Height <=  189': [{'Weight <=  116': [{'Height <=  168': [1,
        {'Height <=  169': [0, 1]}]},
      1]},
    {'Weight <=  115': [0, 1]}]}]}

In [52]:

# Prediksi Menggunakan Decision Tree dengan Python

def _parse_question(question):
  '''
  Break a tree question into (variable, is_numeric, value).

  Questions have the form "<variable> <=  <threshold>" or
  "<variable> in  <list of categories>". The variable name may contain
  spaces and the category list may hold several entries.
  '''
  import ast  # local import keeps the block self-contained

  # Numeric form first: everything after the last " <= " must be a number
  variable, operator_found, raw_value = question.rpartition(' <= ')
  if operator_found:
    try:
      return variable.strip(), True, float(raw_value)
    except ValueError:
      pass  # not a numeric threshold; try the categorical form

  variable, operator_found, raw_value = question.partition(' in ')
  if not operator_found:
    raise ValueError('Unrecognised tree question: ' + repr(question))

  raw_value = raw_value.strip()
  try:
    # Rebuild the real list so membership is tested per category rather than
    # as a substring of the list's text
    categories = ast.literal_eval(raw_value)
  except (ValueError, SyntaxError):
    # Not a Python literal: fall back to matching against the raw text
    categories = raw_value
  return variable.strip(), False, categories


def classifier_data(observation, arbol):
  '''
  Predict one observation by walking a tree built by train_tree.

  observation: mapping or Pandas Series indexed by variable name.
  arbol: tree as nested dicts {question: [yes_answer, no_answer]}, or a bare
    prediction when the tree was never split.

  Returns the leaf value reached by the observation.

  Numeric questions are labelled "<=" but are evaluated with a strict "<",
  because that is how make_split partitioned the rows during training.

  Raises ValueError when a question has neither of the two known forms.
  '''
  node = arbol

  # train_tree returns a bare prediction when the root cannot be split
  while isinstance(node, dict):
    question = list(node.keys())[0]
    variable, is_numeric, value = _parse_question(question)

    if is_numeric:
      take_yes = observation[variable] < value
    else:
      take_yes = observation[variable] in value

    node = node[question][0] if take_yes else node[question][1]

  return node

#Prediction
# Classify the first `num_obs` rows of the frame with the trained tree
num_obs = 50
obese_prediction = [
  classifier_data(data.iloc[row], decision)
  for row in range(num_obs)
]

print(
  "Predictions: ", obese_prediction,
  "\n\nReal values:", data['obese'][:num_obs].to_numpy()
)


Predictions:  [1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1] 

Real values: [1 0 1 0 0 0 1 1 0 1 0 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 0 0 1 1 1 0
 1 1 0 1 1 0 1 1 0 1 1 0 1]


In [53]:

# Confusion Matrix

# Calculate the confusion matrix
from sklearn import metrics
# True labels and tree predictions for the first `num_obs` rows
observation = data['obese'][:num_obs].to_numpy()
answer = obese_prediction

confusion_matrix = metrics.confusion_matrix(observation, answer)
# Accuracy
Accuracy = metrics.accuracy_score(observation, answer)
# Precision
Precision = metrics.precision_score(observation, answer)
# Sensitivity (recall of the positive class)
Sensitivity_recall = metrics.recall_score(observation, answer)
# Specificity (recall with 0 as the positive label)
Specificity = metrics.recall_score(observation, answer, pos_label=0)
# F-score
F1_score = metrics.f1_score(observation, answer)

print(dict(
  Accuracy=Accuracy,
  Precision=Precision,
  Sensitivity_recall=Sensitivity_recall,
  Specificity=Specificity,
  F1_score=F1_score,
))


{'Accuracy': 0.96, 'Precision': 0.9375, 'Sensitivity_recall': 1.0, 'Specificity': 0.9, 'F1_score': 0.967741935483871}


# TUGAS

## Nomor 2

In [54]:
def gini_impurity(y):
    '''
    Gini impurity of a Pandas Series: 1 minus the sum of squared value shares.

    y: Pandas Series whose value distribution is measured.

    Raises ValueError if y is not a Pandas Series.
    '''
    # Reject anything that is not a Series before computing
    if not isinstance(y, pd.Series):
        raise ValueError('Object must be a Pandas Series.')

    # Share of rows taken by each distinct value
    value_shares = y.value_counts() / y.shape[0]
    sum_of_squares = np.sum(value_shares ** 2)
    return 1 - sum_of_squares

def information_gain(y, mask, func=gini_impurity):
    '''
    Impurity reduction obtained by splitting y with a boolean mask.

    y: target variable as a Pandas Series.
    mask: boolean mask (Pandas Series or NumPy array) selecting the rows of y
        that go to one child node; the remaining rows go to the other.
    func: impurity function used only when y has object dtype. For any other
        dtype the Gini impurity is used regardless of this argument.

    Returns the impurity of y minus the size-weighted impurities of the two
    children, or 0 when the mask leaves one side empty.
    '''
    # Vectorised count of True entries; the builtin sum() walks the mask one
    # element at a time in Python.
    a = mask.sum()
    b = mask.shape[0] - a

    # A split that leaves one side empty separates nothing
    if a == 0 or b == 0:
        return 0

    # Non-object targets always use the Gini impurity; object targets use `func`
    impurity = gini_impurity if y.dtypes != 'O' else func

    # `~` is the boolean complement. Unary minus on a boolean mask is rejected
    # by NumPy and only works for a Series through pandas special-casing.
    return impurity(y) - a / (a + b) * impurity(y[mask]) - b / (a + b) * impurity(y[~mask])

## Nomor 3

In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Download and load the Iris data from its URL (the file has no header row,
# so the column names are supplied here)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_data = pd.read_csv(url, names=column_names)

print(iris_data.head())

# Separate the features (X) from the target (y)
y = iris_data['class']
X = iris_data.drop(columns='class')

# Split into training and test data: 30% held out, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [56]:
# Decision tree trained with the Gini criterion
gini_tree = DecisionTreeClassifier(criterion='gini', random_state=42).fit(X_train, y_train)

# Decision tree trained with the entropy criterion
entropy_tree = DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)

# Bare trailing expression so the fitted estimator is shown as the cell output
entropy_tree


DecisionTreeClassifier(criterion='entropy', random_state=42)

In [57]:
# Predict the test set with the Gini model and with the entropy model
gini_predictions, entropy_predictions = (
    tree.predict(X_test) for tree in (gini_tree, entropy_tree)
)


In [58]:
def _print_model_report(title, conf_matrix, accuracy, precision, recall):
    '''Print one model's confusion matrix and scores in the notebook's fixed layout.'''
    print(title)
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Akurasi:", accuracy)
    print("Presisi:", precision)
    print("Sensitivitas:", recall)


# Confusion matrix, accuracy, weighted precision and weighted recall
# (sensitivity) for the Gini model. Specificity is not computed here.
gini_conf_matrix = confusion_matrix(y_test, gini_predictions)
gini_accuracy = accuracy_score(y_test, gini_predictions)
gini_precision = precision_score(y_test, gini_predictions, average='weighted')
gini_recall = recall_score(y_test, gini_predictions, average='weighted')

# Same metrics for the entropy model
entropy_conf_matrix = confusion_matrix(y_test, entropy_predictions)
entropy_accuracy = accuracy_score(y_test, entropy_predictions)
entropy_precision = precision_score(y_test, entropy_predictions, average='weighted')
entropy_recall = recall_score(y_test, entropy_predictions, average='weighted')

# Show the results
_print_model_report(
    "Model dengan Kriteria Gini:",
    gini_conf_matrix, gini_accuracy, gini_precision, gini_recall,
)
_print_model_report(
    "\nModel dengan Kriteria Entropi:",
    entropy_conf_matrix, entropy_accuracy, entropy_precision, entropy_recall,
)


Model dengan Kriteria Gini:
Confusion Matrix:
[[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]
Akurasi: 1.0
Presisi: 1.0
Sensitivitas: 1.0

Model dengan Kriteria Entropi:
Confusion Matrix:
[[19  0  0]
 [ 0 13  0]
 [ 0  1 12]]
Akurasi: 0.9777777777777777
Presisi: 0.9793650793650793
Sensitivitas: 0.9777777777777777
