In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,precision_score, accuracy_score, f1_score
from google.colab import drive
from sklearn.tree import DecisionTreeClassifier,export_graphviz
# Mount Google Drive so the CSV stored under MyDrive is readable from this runtime.
drive.mount('/content/drive')
dataset = pd.read_csv("/content/drive/MyDrive/normalized_dataset.csv")
# An explicit shuffle (dataset.sample(frac=1, replace=False)) was tried here and
# left disabled; the rows are used in file order.
# Keep the target as a one-column DataFrame: later cells index it as labels['label'].
labels = dataset["label"].to_frame()
# Drop the target and the index column written by a previous to_csv() so that
# only feature columns remain in `dataset`.
dataset = dataset.drop(columns=["label", "Unnamed: 0"])
import graphviz

Mounted at /content/drive


In [2]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(dataset, labels, test_size=0.3, random_state=42)
# Feature column names, reused as feature_names when exporting the trees.
keys=list(dataset.keys())
print (keys)
# export_graphviz expects class names in ascending order (the order of the
# fitted estimator's classes_). unique() returns them in order of first
# appearance, which would mislabel the tree nodes, so sort them explicitly.
labels_=np.sort(labels['label'].unique())


['brightness', 'contrast', 'pattern', 'color_difference', 'edge_density', 'fractional_dimension', 'b', 'g', 'r', 'var_b', 'var_g', 'var_r']


Here we classify our data points into 6 classes using the entropy criterion.

In [3]:
# random_state pins the random feature permutation used when choosing splits,
# so re-running the notebook rebuilds the same tree.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=12, min_samples_split=5,
                              min_samples_leaf=2, random_state=42)
# Series.ravel() is deprecated in recent pandas; to_numpy() gives the same 1-D array.
tree.fit(X_train, y_train['label'].to_numpy())


In [4]:
# Score the tree fitted in the previous cell on the held-out rows.
y_pred = tree.predict(X_test)
y_true = y_test['label']  # 1-D target instead of a one-column DataFrame

cm = confusion_matrix(y_true, y_pred)
# 'weighted' averages the per-class scores by class support.
precision = precision_score(y_true, y_pred, average='weighted')
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted')

print("Precision: ", precision)
print("Accuracy: ", accuracy)
print("F1 Score: ", f1)
print (cm)

# class_names must follow the order of tree.classes_ (sorted). Taking the names
# from the fitted estimator guarantees each node gets the right label, unlike
# labels_, which is in order of first appearance.
dot_data = export_graphviz(tree, out_file=None,
                           feature_names=keys,
                           class_names=[str(c) for c in tree.classes_],
                           filled=True, rounded=True,
                           special_characters=True)

# Render the DOT source to a PDF on Drive; render() returns the output path.
graph = graphviz.Source(dot_data)
graph.format = 'pdf'
graph.render("/content/drive/MyDrive/decision_tree_graph_entropy")

Precision:  0.4063976870307444
Accuracy:  0.4015594541910331
F1 Score:  0.4020973970354214
[[73 13  7 49 24  0]
 [14 62 23 10 40 16]
 [ 6 40 48  2 31 32]
 [42  9  6 91 17  4]
 [12 44 24 19 73 22]
 [ 7 31 32 10 28 65]]


'/content/drive/MyDrive/decision_tree_graph_entropy.pdf'

Here we classify our data points into 6 classes using the Gini criterion.

In [5]:

# Same hyper-parameters as the entropy tree, but with Gini impurity as the split criterion.
# random_state pins the random feature permutation so re-runs rebuild the same tree.
tree = DecisionTreeClassifier(criterion='gini', max_depth=12, min_samples_split=5,
                              min_samples_leaf=2, random_state=42)
# Series.ravel() is deprecated in recent pandas; to_numpy() gives the same 1-D array.
tree.fit(X_train, y_train['label'].to_numpy())

y_pred = tree.predict(X_test)
y_true = y_test['label']  # 1-D target instead of a one-column DataFrame

cm = confusion_matrix(y_true, y_pred)
# 'weighted' averages the per-class scores by class support.
precision = precision_score(y_true, y_pred, average='weighted')
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted')

print("Precision: ", precision)
print("Accuracy: ", accuracy)
print("F1 Score: ", f1)
print (cm)

# class_names must follow the order of tree.classes_ (sorted). Taking the names
# from the fitted estimator guarantees each node gets the right label, unlike
# labels_, which is in order of first appearance.
dot_data = export_graphviz(tree, out_file=None,
                           feature_names=keys,
                           class_names=[str(c) for c in tree.classes_],
                           filled=True, rounded=True,
                           special_characters=True)

# Render the DOT source to a PDF on Drive; render() returns the output path.
graph = graphviz.Source(dot_data)
graph.format = 'pdf'
graph.render("/content/drive/MyDrive/decision_tree_graph_gini")

Precision:  0.4222660830953833
Accuracy:  0.4152046783625731
F1 Score:  0.4156819876353758
[[82 18  3 49  9  5]
 [11 65 40 10 28 11]
 [ 5 36 66  5 14 33]
 [43 14  7 85 15  5]
 [20 54 19 17 64 20]
 [12 24 45  6 22 64]]


'/content/drive/MyDrive/decision_tree_graph_gini.pdf'

then we convert labels to binary

In [6]:
# Collapse the original labels into two classes: any label containing the
# substring "fake" becomes "fake", every other label becomes "real".
# The previous version wrote to the row objects yielded by iterrows(); those may
# be copies, in which case the DataFrame is silently left unchanged. Assigning
# the whole column is reliable and vectorized.
# Fail loudly on missing labels, as the original substring test would have.
assert labels["label"].notna().all(), "labels contain missing values"
# assumes every label is a string — TODO confirm against the CSV
is_fake = labels["label"].str.contains("fake", regex=False)
labels["label"] = np.where(is_fake, "fake", "real")

In [8]:
# Split again now that `labels` holds the binary targets; the test fraction and
# seed are the same as in the first split.
X_train, X_test, y_train, y_test = train_test_split(
    dataset,
    labels,
    test_size=0.3,
    random_state=42,
)

and fit the tree using gini on binary classification

In [9]:

# Gini tree on the binary (fake / real) targets.
# random_state pins the random feature permutation so re-runs rebuild the same tree.
tree = DecisionTreeClassifier(criterion='gini', max_depth=12, min_samples_split=5,
                              min_samples_leaf=2, random_state=42)
# Series.ravel() is deprecated in recent pandas; to_numpy() gives the same 1-D array.
tree.fit(X_train, y_train['label'].to_numpy())

y_pred = tree.predict(X_test)
y_true = y_test['label']  # 1-D target instead of a one-column DataFrame

cm = confusion_matrix(y_true, y_pred)
# 'weighted' averages the per-class scores by class support.
precision = precision_score(y_true, y_pred, average='weighted')
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted')

print("Precision: ", precision)
print("Accuracy: ", accuracy)
print("F1 Score: ", f1)
print (cm)

# labels_ still holds the six original class names computed before the labels
# were collapsed, so passing it here would tag the two-class tree with the wrong
# names. Take the names from the fitted estimator instead.
dot_data = export_graphviz(tree, out_file=None,
                           feature_names=keys,
                           class_names=[str(c) for c in tree.classes_],
                           filled=True, rounded=True,
                           special_characters=True)

# Render the DOT source to a PDF on Drive; render() returns the output path.
graph = graphviz.Source(dot_data)
graph.format = 'pdf'
graph.render("/content/drive/MyDrive/decision_tree_graph_gini_binary")

Precision:  0.5973878024907742
Accuracy:  0.5896686159844055
F1 Score:  0.5872458033290111
[[332 158]
 [263 273]]


'/content/drive/MyDrive/decision_tree_graph_gini_binary.pdf'

Binary classification using entropy, with the tree limited to a single split (max_depth=1)

In [94]:
# max_depth=1 restricts the tree to a single split on one feature.
# random_state pins the random feature permutation so re-runs rebuild the same tree.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=1, min_samples_split=5,
                              min_samples_leaf=2, random_state=42)
# Series.ravel() is deprecated in recent pandas; to_numpy() gives the same 1-D array.
tree.fit(X_train, y_train['label'].to_numpy())


In [95]:
# Score the depth-1 entropy tree fitted in the previous cell on the held-out rows.
y_pred = tree.predict(X_test)
y_true = y_test['label']  # 1-D target instead of a one-column DataFrame

cm = confusion_matrix(y_true, y_pred)
# 'weighted' averages the per-class scores by class support.
precision = precision_score(y_true, y_pred, average='weighted')
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted')

print("Precision: ", precision)
print("Accuracy: ", accuracy)
print("F1 Score: ", f1)
print (cm)

# labels_ still holds the six original class names computed before the labels
# were collapsed, so passing it here would tag the two-class tree with the wrong
# names. Take the names from the fitted estimator instead.
dot_data = export_graphviz(tree, out_file=None,
                           feature_names=keys,
                           class_names=[str(c) for c in tree.classes_],
                           filled=True, rounded=True,
                           special_characters=True)

# Render the DOT source to a PDF on Drive; render() returns the output path.
graph = graphviz.Source(dot_data)
graph.format = 'pdf'
graph.render("/content/drive/MyDrive/decision_tree_graph_entropy_binary")

Precision:  0.5845299067954873
Accuracy:  0.5818713450292398
F1 Score:  0.5677539421789889
[[194 296]
 [133 403]]


'/content/drive/MyDrive/decision_tree_graph_entropy_binary.pdf'

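We see that even with a single split on the one best feature (max_depth=1), we still get about 0.58 accuracy in the run shown above, nearly the same as the depth-12 Gini tree (about 0.59).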

