# **Training a maximum entropy classifier**

This code bit predicts music preference based on age and gender and returns dataset entropy and average prediction accuracy over 30 runs.<br>
Run the code by clicking <b>Run All</b>.

In [61]:
#import pandas, scipy and sklearn packages

import pandas as pd
import scipy.stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from itertools import repeat
import numpy as np

**1. Read in the dataset**

In [64]:
df = pd.read_csv('cleanedfile.csv')
df = df[df.name != '^IXIC']
df = df.drop(columns = ['Unnamed: 0', 'name'])
df
df = df.tail(19994)
df.describe()



Unnamed: 0,open,close,high,low,volume
count,19994.0,19994.0,19994.0,19994.0,19994.0
mean,205.911975,205.876058,207.916333,203.701407,23241050.0
std,332.316771,332.165818,335.255867,328.828999,26349790.0
min,0.92,0.94,0.94,0.91,100.0
25%,27.78,27.7825,28.11,27.4525,4559628.0
50%,74.15,74.225,74.98,73.35,13429770.0
75%,214.3,214.305,216.9675,211.615,34089090.0
max,2038.11,2039.51,2050.5,2013.0,591078600.0


**2. Split the dataset**

In [65]:
# Run this section to inspect X
X = df.drop(columns = ['volume', 'open', 'close'])
X

Unnamed: 0,high,low
16868,32.25,30.13
16870,35.22,34.25
16871,3.57,3.36
16873,32.00,30.22
16874,31.75,29.31
...,...,...
41655,138.35,132.80
41656,212.05,201.00
41657,1195.67,1150.00
41658,1804.90,1745.23


In [66]:
# Uncomment this section to inpect y
y = df['volume']
y

16868     8073700.0
16870    24082300.0
16871     2898400.0
16873     4934600.0
16874     5143700.0
            ...    
41655    38515386.0
41656    46882843.0
41657     1813141.0
41658     5277898.0
41659    17331221.0
Name: volume, Length: 19994, dtype: float64

**Turning floats into 'categories':**

In [67]:
co25 = 4.559628e+06
co50 = 1.342977e+07
co75 = 3.408909e+07
#dict = y.to_dict()
#print(dict) 
cat = []
for value in y:
    if value < co25: 
        cat.append('0-25')
    elif value < co50:
        cat.append('25-50')
    elif value < co75:
        cat.append('50-75') 
    else:
        cat.append('75-100')
dict = {
    'volume': cat, 'nonsense': cat
}
df2 = pd.DataFrame(dict)
y = df2['volume']

  

**3. Compute entropy of data set**

In [68]:
# Compute the maximum entropy value
k = y.unique().size
maxE = np.log2(k)
p_data = y.value_counts(normalize=True)           # counts occurrence of each value
entropy = scipy.stats.entropy(p_data)  # get entropy from counts

# normalize the value to be between 0 and 1.
normalizedE = entropy/maxE

**4. Testing: entropy-based decision tree classifier averaged over 30 runs**

In [69]:
avg_score = 0.0
ntimes = 30

for i in repeat(None, ntimes):

    # train model with 80% of the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # prediction using entropy
    # Note: You can replace 'entropy' by 'gini' to get the classifier to use the gini index criterion.
    model = DecisionTreeClassifier(criterion='entropy')
    model.fit(X_train,y_train)
    predictions = model.predict(X_test)

    # compute model accuracy
    avg_score += accuracy_score(y_test, predictions)

avg_score /= ntimes

**5. print outputs**

In [70]:
print('normalized entropy value: %.3f'% normalizedE)
print('average accuracy score: %.3f' % avg_score)

# output visual (can be visualized with visual code)
tree.export_graphviz(model, out_file='SupervisedModel.dot',
                    feature_names=['high', 'low'],
                    class_names=sorted(y.unique()),
                    label='all',
                    rounded=True,
                    filled=True) 
                    

normalized entropy value: 0.693
average accuracy score: 0.571
