# Decision Tree Code

This is taken from (https://www.kaggle.com/datasets/paradisejoy/top-hits-spotify-from-20002019). Originally this dataset had 2000 of the top songs from Spotify from the years 2000-2019 with 18 columns. The dataset we start with here is 100 of the top songs with 2 numerical attributes and class_labels with 2 values A or B.

In [1]:
import pandas as pd
import math
#sklearn if scikit-learn; details in https://scikit-learn.org/
from sklearn import tree

In [2]:
# This is my dataset used in last weeks homework.
Songs = pd.read_csv('cs654_homework4_dataset_Dembar_New.csv')

In [3]:
Songs = Songs[['duration_ms','popularity','class_label']]

In [4]:
Songs

Unnamed: 0,duration_ms,popularity,class_label
0,26,90,A
1,3,92,A
2,47,77,A
3,33,91,A
4,20,76,A
...,...,...,...
95,42,83,B
96,22,74,B
97,81,80,B
98,23,76,B


In [5]:
# I am generating 4 random datasets each with 20 rows from the original dataset of 100 rows.
df = Songs.sample(20, random_state = 8)
df2 = Songs.sample(20, random_state = 5)
df3 = Songs.sample(20, random_state = 7)
df4 = Songs.sample(20, random_state = 11)

In [6]:
# Here is a summary of my data in all 4 new datasets.
df.describe(),df2.describe(),df3.describe(),df4.describe()

(       duration_ms  popularity
 count    20.000000   20.000000
 mean     40.500000   69.450000
 std      20.466918   20.017032
 min       3.000000    0.000000
 25%      27.500000   63.750000
 50%      38.000000   69.500000
 75%      53.750000   80.750000
 max      81.000000   92.000000,
        duration_ms  popularity
 count    20.000000   20.000000
 mean     44.700000   66.400000
 std      19.841742   20.014205
 min       8.000000    1.000000
 25%      30.000000   63.000000
 50%      43.000000   67.000000
 75%      54.750000   77.750000
 max      82.000000   97.000000,
        duration_ms  popularity
 count    20.000000   20.000000
 mean     46.350000   72.100000
 std      20.538219   10.412038
 min      15.000000   60.000000
 25%      33.000000   64.000000
 50%      42.500000   71.000000
 75%      64.000000   79.250000
 max      97.000000   97.000000,
        duration_ms  popularity
 count    20.000000   20.000000
 mean     39.850000   67.200000
 std      13.623799   19.253434
 min 

In [7]:
# This is what the first dataset looks like now.
df

Unnamed: 0,duration_ms,popularity,class_label
17,28,85,A
23,44,63,A
1,3,92,A
44,28,57,A
55,50,69,B
99,66,79,B
86,15,70,B
29,26,55,A
62,20,0,B
33,43,60,A


### Functions to classify groups of 'popularity' and 'duration'
Here, I divided each group into 3 based upon max and min and that's how I decided upon the classification system. 

In [8]:
def f(x):
  if x['popularity'] > 31 and x['popularity'] < 62: return 'average'
  elif x['popularity'] > 62: return 'popular'
  else: return 'not_popular'

df['popularity_new'] = df.apply(f, axis=1)

In [9]:
def f(x):
  if x['popularity'] > 32 and x['popularity'] < 64: return 'average'
  elif x['popularity'] > 64: return 'popular'
  else: return 'not_popular'

df2['popularity_new'] = df2.apply(f, axis=1)

In [10]:
def f(x):
  if x['popularity'] > 32 and x['popularity'] < 64: return 'average'
  elif x['popularity'] > 64: return 'popular'
  else: return 'not_popular'

df3['popularity_new'] = df3.apply(f, axis=1)

In [11]:
def f(x):
  if x['popularity'] > 33 and x['popularity'] < 66: return 'average'
  elif x['popularity'] > 66: return 'popular'
  else: return 'not_popular'

df4['popularity_new'] = df4.apply(f, axis=1)

In [12]:
def g(x):
  if x['duration_ms'] > 27 and x['duration_ms'] < 54: return 'Medium'
  elif x['duration_ms'] > 54: return 'Long'
  else: return 'Short'

df['duration_ms_new'] = df.apply(g, axis=1)

In [13]:
def g(x):
  if x['duration_ms'] > 27 and x['duration_ms'] < 54: return 'Medium'
  elif x['duration_ms'] > 54: return 'Long'
  else: return 'Short'

df2['duration_ms_new'] = df2.apply(g, axis=1)

In [14]:
def g(x):
  if x['duration_ms'] > 32 and x['duration_ms'] < 64: return 'Medium'
  elif x['duration_ms'] > 64: return 'Long'
  else: return 'Short'

df3['duration_ms_new'] = df3.apply(g, axis=1)

In [15]:
def g(x):
  if x['duration_ms'] > 21 and x['duration_ms'] < 42: return 'Medium'
  elif x['duration_ms'] > 42: return 'Long'
  else: return 'Short'

df4['duration_ms_new'] = df4.apply(g, axis=1)

In [16]:
# final datasets with new labels to csv.
df.to_csv('SmallSongs.csv')
df2.to_csv('SmallSongs2.csv')
df3.to_csv('SmallSongs3.csv')
df4.to_csv('SmallSongs4.csv')
df # Here's the first one.

Unnamed: 0,duration_ms,popularity,class_label,popularity_new,duration_ms_new
17,28,85,A,popular,Medium
23,44,63,A,popular,Medium
1,3,92,A,popular,Short
44,28,57,A,average,Medium
55,50,69,B,popular,Medium
99,66,79,B,popular,Long
86,15,70,B,popular,Short
29,26,55,A,average,Short
62,20,0,B,not_popular,Short
33,43,60,A,average,Medium


In [17]:
# This is the function to help compute information Gain.
def info(x, y):
    x = float(x)
    y = float(y)
    if x == 0 or y == 0:
        return 0
    else:
        return (-1)* (x/(x+y))* math.log(x/(x+y), 2 ) + (-1)* (y/(x+y))* math.log(y/(x+y), 2 )

In [18]:
# Here is the first dataset with all labels included. The _new columns are what the functions above classified. 
df

Unnamed: 0,duration_ms,popularity,class_label,popularity_new,duration_ms_new
17,28,85,A,popular,Medium
23,44,63,A,popular,Medium
1,3,92,A,popular,Short
44,28,57,A,average,Medium
55,50,69,B,popular,Medium
99,66,79,B,popular,Long
86,15,70,B,popular,Short
29,26,55,A,average,Short
62,20,0,B,not_popular,Short
33,43,60,A,average,Medium


In [19]:
# this is a quick tally so that I could easily figure out my math below.
df['duration_ms_new'].value_counts(),df['popularity_new'].value_counts(), df['class_label'].value_counts()

(Medium    10
 Long       5
 Short      5
 Name: duration_ms_new, dtype: int64,
 popular        16
 average         3
 not_popular     1
 Name: popularity_new, dtype: int64,
 A    13
 B     7
 Name: class_label, dtype: int64)

# Dataset 1
In this dataset and all datasets I randomly generated 20 objects from my original dataset of Songs with 100 objects. In dataset number 1 (df) I have the following: Class Labels (A - 13, B - 7), Duration ( Medium - 10, Long - 5, and Short - 5), and Popularity (Popular - 16, Average - 3, and Not Popular - 1). Below is my math to find decision tree. 

In [20]:
# Information for whole decision on 'A' label_class for df
-(13.0/20)*math.log(13.0/20, 2) - (7.0/20)*math.log(7.0/20, 2)

0.934068055375491

In [21]:
# Check Function to see if it working. It is! Same result as above

info(13.0,7) # Info D df

0.934068055375491

In [22]:
# info D 'duration' Attribute dataset 1 (df)
(10.0/20) * info(8,2) + (5.0/20) * info(2,3) + (5.0/20) * info(3,2)

0.8464393446710154

In [23]:
# Gain(duration) = Info(D) - Info_duration(D) dataset 1(df)
# 0.934068055375491 - 0.8464393446710154 just check if it is right! It is!
info(13.0,7) - ((10.0/20) * info(8,2) + (5.0/20) * info(2,3) + (5.0/20) * info(3,2))

0.08762871070447553

In [24]:
# info D 'popularity' Attribute (df)
(16.0/20) * info(10,6) + (3.0/20) * info(3,0) + (1/20) * info(0,1)

0.763547202339972

In [25]:
# Gain (popularity) = Info(D) - Info_popularity(D) dataset 1 (df)
info(13.0,7) - ((16.0/20) * info(10,6) + (3.0/20) * info(3,0) + (1/20) * info(0,1))

0.170520853035519

# Decision Tree for dataset 1 (df)

# Using sklearn
Below I am getting numerical values for X and Y. X is pairs of duration and popularity, and Y is the class labels. I then use sklearn to fit the data and then to predict the data. 

In [26]:
X_list = df['duration_ms'].to_list()
Y_list = df['popularity'].to_list()
Z_list = df['class_label'].to_list()
X = list(zip(X_list, Y_list))
Y = Z_list

In [27]:
# I commented out to save room. But I wanted to see what this looked like.
# X, Y

In [28]:
# Train the decision tree; construct the decision tree; learn the decision tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)

In [29]:
# predict a sample
clf.predict([[21, 50]])

array(['B'], dtype='<U1')

In [30]:
# predict another sample
clf.predict([[22, 85]])

array(['A'], dtype='<U1')

# Dataset 2

In [31]:
# Here is Dataset Number 2.
df2

Unnamed: 0,duration_ms,popularity,class_label,popularity_new,duration_ms_new
66,49,91,B,popular,Medium
32,34,67,A,popular,Medium
46,21,50,A,average,Short
28,82,63,A,average,Long
74,35,65,B,popular,Medium
23,44,63,A,average,Medium
10,41,1,A,not_popular,Medium
20,29,97,A,popular,Medium
17,28,85,A,popular,Medium
35,60,69,A,popular,Long


In [32]:
# Tally for reference for math below of dataset 2.
df2['duration_ms_new'].value_counts(),df2['popularity_new'].value_counts(), df2['class_label'].value_counts()

(Medium    13
 Long       5
 Short      2
 Name: duration_ms_new, dtype: int64,
 popular        12
 average         5
 not_popular     3
 Name: popularity_new, dtype: int64,
 A    13
 B     7
 Name: class_label, dtype: int64)

# Dataset 2 (df2)
In this dataset and all datasets I randomly generated 20 objects from my original dataset of Songs with 100 objects. In dataset number 2 (df2) I have the following: Class Labels (A - 13, B - 7), Duration ( Medium - 13, Long - 5, and Short - 2), and Popularity (Popular - 12, Average - 5, and Not Popular - 3). Below is my math to find decision tree. 

In [33]:
# Information for whole decision on 'A' label_class for dataset 2 (df2)
info(13.0,7) # Info D df2

0.934068055375491

In [34]:
# info D 'duration' Attribute (df2)
(13.0/20) * info(7,6) + (2.0/20) * info(2,0) + (5.0/20) * info(4,1)

0.8277048675770422

In [35]:
# Gain(duration) = Info(D) - Info_duration(D) dataset 2 (df2)
info(13.0,7) - ((13.0/20) * info(7,6) + (2.0/20) * info(2,0) + (5.0/20) * info(4,1))

0.10636318779844878

In [36]:
# info D 'popularity' Attribute (df2)
(12.0/20) * info(6,6) + (5.0/20) * info(5,0) + (3.0/20) * info(2,1)

0.7377443751081734

In [37]:
# Gain(popularity) = Info(D) - Info_popularity(D) dataset 2 (df2)
info(13.0,7) - ((12.0/20) * info(6,6) + (5.0/20) * info(5,0) + (3.0/20) * info(2,1))

0.1963236802673176

# Decision Tree for dataset 2 (df2)

# Using sklearn for (df2)
Now, I get numerical values for X and Y. X is pairs of duration and popularity, and Y is the class labels. I then use sklearn to fit the data and then to predict the data. 

In [38]:
X_list = df2['duration_ms'].to_list()
Y_list = df2['popularity'].to_list()
Z_list = df2['class_label'].to_list()
X = list(zip(X_list, Y_list))
Y = Z_list

In [39]:
# commented out to save room like above.
# X, Y

In [40]:
# Train the decision tree; construct the decision tree; learn the decision tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)

In [41]:
# predict a sample
clf.predict([[8, 15]])

array(['A'], dtype='<U1')

In [42]:
# predict another sample
clf.predict([[14, 98]])

array(['A'], dtype='<U1')

# Dataset 3

In [43]:
# Here's dataset 3.
df3

Unnamed: 0,duration_ms,popularity,class_label,popularity_new,duration_ms_new
37,64,80,A,popular,Short
26,34,80,A,popular,Medium
78,34,60,B,average,Medium
91,36,81,B,popular,Medium
49,43,60,A,average,Medium
15,64,62,A,average,Short
93,24,79,B,popular,Short
71,22,64,B,not_popular,Short
86,15,70,B,popular,Short
22,42,72,A,popular,Medium


In [44]:
# Tally of dataset 3 for math below.
df3['duration_ms_new'].value_counts(),df3['popularity_new'].value_counts(), df3['class_label'].value_counts()

(Medium    9
 Short     7
 Long      4
 Name: duration_ms_new, dtype: int64,
 popular        13
 average         4
 not_popular     3
 Name: popularity_new, dtype: int64,
 A    12
 B     8
 Name: class_label, dtype: int64)

# Dataset 3 (df3)
In this dataset and all datasets I randomly generated 20 objects from my original dataset of Songs with 100 objects. In dataset number 3 (df3) I have the following: Class Labels (A - 12, B - 8), Duration ( Medium - 9, Long - 4, and Short - 7), and Popularity (Popular - 13, Average - 4, and Not Popular - 3). Below is my math to find decision tree.

In [45]:
# Information for whole decision on 'A' label_class for dataset 3 (df3)
info(12.0,8) # Info D df3

0.9709505944546686

In [46]:
# Info D for 'duration' Attribute for (df3)
(9.0/20) * info(5,4) + (4.0/20) * info(3,1) + (7.0/20) * info(4,3)

0.9530696994310146

In [47]:
# Gain(duration) = info(D) - info_'duration'(D) for dataset 3 (df3)
info(12.0,8) - ((9.0/20) * info(5,4) + (4.0/20) * info(3,1) + (7.0/20) * info(4,3))

0.017880895023653975

In [48]:
# Info D_popularity for (df3)
(13.0/20) * info(8,5) + (4.0/20) * info(3,1) + (3.0/20) * info(1,2)

0.9248037930698694

In [49]:
# Gain(popularity) = info(D) - info_'popularity'(D) for dataset 3 (df3)
info(12.0,8) - ((13.0/20) * info(8,5) + (4.0/20) * info(3,1) + (3.0/20) * info(1,2))

0.04614680138479921

# Decision Tree for dataset 3 (df3)

# Using sklearn for (df3)
Now, I get numerical values for X and Y. X is pairs of duration and popularity, and Y is the class labels. I then use sklearn to fit the data and then to predict the data. 

In [50]:
X_list = df3['duration_ms'].to_list()
Y_list = df3['popularity'].to_list()
Z_list = df3['class_label'].to_list()
X = list(zip(X_list, Y_list))
Y = Z_list

In [51]:
# commented out to save room
# X, Y

In [52]:
# Train the decision tree; construct the decision tree; learn the decision tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)

In [53]:
# predict a sample
clf.predict([[7, 9]])

array(['B'], dtype='<U1')

In [54]:
# predict another sample
clf.predict([[11, 91]])

array(['B'], dtype='<U1')

# Dataset 4

In [55]:
# Here's dataset 4.
df4

Unnamed: 0,duration_ms,popularity,class_label,popularity_new,duration_ms_new
46,21,50,A,average,Short
49,43,60,A,average,Long
22,42,72,A,popular,Short
58,28,76,B,popular,Medium
41,53,77,A,popular,Long
98,23,76,B,popular,Medium
62,20,0,B,not_popular,Short
29,26,55,A,average,Medium
30,56,83,A,popular,Long
51,62,70,B,popular,Long


In [56]:
# Tally of dataset 4 to help with math below.
df4['duration_ms_new'].value_counts(),df4['popularity_new'].value_counts(), df4['class_label'].value_counts()

(Long      9
 Medium    8
 Short     3
 Name: duration_ms_new, dtype: int64,
 popular        12
 average         6
 not_popular     2
 Name: popularity_new, dtype: int64,
 B    10
 A    10
 Name: class_label, dtype: int64)

# Dataset 4 (df4)
In this dataset and all datasets I randomly generated 20 objects from my original dataset of Songs with 100 objects. In dataset number 4 (df4) I have the following: Class Labels (A - 10, B - 10), Duration ( Medium - 8, Long - 9, and Short - 3), and Popularity (Popular - 12, Average - 6, and Not Popular - 2). Below is my math to find decision tree.

In [57]:
# Information for whole decision on 'A' label_class for dataset 4 (df4)
info(10.0,10) # Info D df4

1.0

In [58]:
# Info D for 'duration' Attribute for (df4)
(9.0/20) * info(6,3) + (8.0/20) * info(2,6) + (3.0/20) * info(2,1)

0.8754887502163469

In [59]:
# Gain('duration') for d4 = Info(D) - Info_'duration'(D) for dataset 4 (df4)
info(10.0,10) - ((9.0/20) * info(6,3) + (8.0/20) * info(2,6) + (3.0/20) * info(2,1))

0.12451124978365313

In [60]:
# Info D for 'popularity' Attribute for (d4)
(12.0/20) * info(6,6) + (6.0/20) * info(4,2) + (2.0/20) * info(0,2)

0.8754887502163469

In [61]:
# Gain ('popularity') for d4 = Info(D) - Info_'popularity'(D) for dataset 4 (df4)
info(10.0,10) - ((12.0/20) * info(6,6) + (6.0/20) * info(4,2) + (2.0/20) * info(0,2))

0.12451124978365313

# Decision Tree dataset 4 (df4)
Because the score for both 'duration' and 'popularity' is the same the result here could be incorrect.

# Using sklearn for (df4)
Now, I get numerical values for X and Y. X is pairs of duration and popularity, and Y is the class labels. I then use sklearn to fit the data and then to predict the data. 

In [62]:
X_list = df4['duration_ms'].to_list()
Y_list = df4['popularity'].to_list()
Z_list = df4['class_label'].to_list()
X = list(zip(X_list, Y_list))
Y = Z_list

In [63]:
# commented out to save room
# X, Y

In [64]:
# Train the decision tree; construct the decision tree; learn the decision tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)

In [65]:
# predict a sample
clf.predict([[5, 2]])

array(['B'], dtype='<U1')

In [66]:
# predict another sample
clf.predict([[10, 90]])

array(['B'], dtype='<U1')