In [1]:
# *****************************************************************************************************
#
# Decision Tree Classifier, ID3-style (scikit-learn's optimized CART with criterion='entropy')
#
# Library: 
#   pandas, scikit-learn, matplotlib
# Dataset:
#   World Economic Forum "Readiness for the future of production report 2018"  
#   https://www.weforum.org/reports/readiness-for-the-future-of-production-report-2018
#
# Created by: Andi Sama
# NIM: 2540136324
# Date created: April 2, 2023
# Last modified: April 15, 2023
# Program: Doctor of Computer Science (DCS)
# Institution: Binus University
# Subject: ISYS90219045 Advanced Knowledge System
# Lecturer: Prof. Dr. Sfenrianto, S.Kom, M.Kom
#
# *****************************************************************************************************

In [None]:
# import necessary library
#
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
# import numpy as np
#

In [None]:
# Read the dataset into the pandas DataFrame 'df', then split it into the six
# input feature columns ('features') and the target column ('classes').
#
# The path uses a forward slash: the previous "data\WEF_FOP2.csv" relied on "\W"
# being an unrecognised escape sequence (which Python warns about) and only
# resolved on Windows. "data/WEF_FOP2.csv" works on Windows, Linux and macOS.
df = pd.read_csv("data/WEF_FOP2.csv")
features = df[["Technology & Innovation","Human Capital","Global Trade & Investment","Institutional Framework", 
               "Sustainable Resources","Demand Environment"]]
classes = df["Quartile"]

In [None]:
# Quick preview: show the first five rows of the dataset.
#
df.head(n=5)

In [None]:
# Cap the number of rows pandas renders at 10, then display the whole frame
# (pandas truncates the output to respect that cap).
#
pd.options.display.max_rows = 10
df

In [None]:
# Inspect the classifier input: the six feature columns selected from df
# (Technology & Innovation, Human Capital, Global Trade & Investment,
# Institutional Framework, Sustainable Resources, Demand Environment).
#
features

In [None]:
# Inspect the classifier target: the "Quartile" column of df, one label per row.
#
classes

In [None]:
# Create the DecisionTreeClassifier.
# criterion='entropy' makes the tree choose splits by information gain.
# random_state is fixed so that re-running the notebook yields the same tree:
# scikit-learn permutes the candidate features at every split, so equally good
# splits can otherwise be resolved differently from run to run.
#
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)

# Train the model on the full dataset (input features, target classes).
# Plain NumPy arrays are passed, so the model stores no column names.
#
clf.fit(features.values, classes.values)

In [None]:
# Draw the decision tree and save it as a high-resolution PNG.
#
# The tree drawn here is 'clf', the entropy-based model trained above and used
# for every prediction in this notebook. Previously this cell fitted a second,
# default (gini) DecisionTreeClassifier and plotted that one, so the saved
# figure did not show the model that actually makes the predictions.
#
from sklearn.tree import plot_tree

fig, ax = plt.subplots()
# Short labels, in the same order as the columns of 'features'.
featurenames = ["TechInn", "HmCap", "GlbTrdInv", "InstFrm", "SusRsc", "DmdEnv"]
# NOTE(review): plot_tree expects class names in ascending order of the fitted
# labels; this assumes clf.classes_ sorts to Q1..Q4 - TODO confirm.
classnames = ["Q1", "Q2", "Q3", "Q4"]
trained = clf  # name kept for compatibility; refers to the already-fitted model
plot_tree(trained, feature_names=featurenames, class_names=classnames, filled=True, ax=ax)
ax.set_title("Decision tree trained on all features")
# plt.show()
resolution = 1200  # dots per inch for the saved image
fig.savefig("ASM_decision_tree_ID3.png", format="png", dpi=resolution)


In [None]:
# Display the DataFrame 'df' that supplies the features and classes used for
# training (rendering is limited by the display.max_rows option set earlier).
#
df

In [None]:
# Inspect the shape of the training input: (number of rows, 6 feature columns).
#
features.shape

In [None]:
# # prediction from a single sample data, just to make sure for validation
# new_data=([4.5,5.21,5.41,6.19,5,6.19])    # saudi arabia
# new_features=pd.DataFrame(new_data).transpose()
# # n=new_features[0].to_frame()
# n=new_features
# clf.predict(n)
# n.shape

In [None]:
# Hand-picked samples used to sanity-check the trained model.
# Each inner list holds six scores for one sample; the trailing comment gives
# the sample's origin and, where stated, its quartile.
my_data = (
    [4.5, 5.21, 5.41, 6.19, 5, 6.19],       # saudi arabia, Q1
    [4, 4.99, 5.06, 4.59, 4.09, 6.38],      # indonesia, Q2
    [3.18, 3.65, 2.72, 3.67, 2.88, 5.37],   # pakistan, Q4
    [4.01, 4.23, 4.11, 4.17, 3.97, 4.68],   # with one new data - just a fixed randomized data
)
# Stored transposed: one column per sample, one row per feature.
samples_by_row = pd.DataFrame(my_data)
new_features = samples_by_row.T

In [None]:
# Inspect the shape of the validation samples. The frame was transposed when it
# was built, so for the four hand-picked samples this is (6 features, 4 samples).
#
new_features.shape

In [None]:
# Display the validation samples (one column per sample, one row per feature).
#
new_features

In [None]:
# Predict the quartile of each validation sample.
# new_features is stored one-column-per-sample, so it is flipped back to
# one-row-per-sample before being passed to the model.
#
clf.predict(new_features.T)

In [None]:
# Generate eight random samples of six feature scores each.
#
# Every score is drawn uniformly from one shared interval:
#   lower bound = mean of the six per-feature minimums in the training data
#   upper bound = mean of the six per-feature maximums in the training data
# Because the interval is shared, a generated score can fall outside the range
# actually observed for an individual feature.
#
import statistics
import random

# Series min()/max() ignore missing values; the builtin min()/max() give
# order-dependent results if a NaN is present. tolist() yields plain Python
# floats in the column order of 'features'.
min_features = features.min().tolist()
max_features = features.max().tolist()

score_min_features = statistics.mean(min_features)
score_max_features = statistics.mean(max_features)

# A dedicated, seeded generator makes the samples identical on every run
# without touching the global state of the 'random' module.
RANDOM_SEED = 42
rng = random.Random(RANDOM_SEED)

rows, cols = 8, 6
new_data = [
    [round(rng.uniform(score_min_features, score_max_features), 2) for _ in range(cols)]
    for _ in range(rows)
]

In [None]:
# Inspect the generated samples: a list of 8 rows, each holding 6 scores
# rounded to two decimals.
#
new_data

In [None]:
# Predict the quartile (class) of every generated sample with the model trained
# earlier. As with the validation samples, new_features is kept in
# one-column-per-sample layout and flipped back to rows for prediction.
#
generated_by_row = pd.DataFrame(new_data)
new_features = generated_by_row.T
clf.predict(new_features.T)