# **Maximum Entropy Classifier**

This program predicts music preference based on age and gender and returns data entropy and average prediction accuracy over 30 runs.<br>
Run the code by clicking <b>Run All</b>.

In [1]:
#import pandas, scipy and sklearn packages

import pandas as pd
import scipy.stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from itertools import repeat
import numpy as np

**1. Read in the dataset**

In [2]:
df = pd.read_csv('SupervisedLearning/supervised_test.csv')

#TODO: Write code below to inspect the first five rows of the data frame
df.head()

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz


**2. Split the dataset**

In [3]:
# Run this section to inspect X
X = df.drop(columns = ['genre'])

#TODO: Write code to inspect X
X

Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1
5,30,1
6,31,1
7,33,1
8,37,1
9,20,0


In [4]:
# Uncomment this section to inpect y
y = df['genre']

#TODO: Write code to inspect y
y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

**3. Compute entropy of data set**

In [5]:
# Compute the maximum entropy value
k = y.unique().size
maxE = np.log2(k)
p_data = y.value_counts(normalize=True)           # counts occurrence of each value
entropy = scipy.stats.entropy(p_data)  # get entropy from counts

# normalize the value to be between 0 and 1.
normalizedE = entropy/maxE

#TODO: Write code to display the entropy value
entropy

1.5607104090414068

**4. Testing: entropy-based decision tree classifier averaged over 30 runs**

In [6]:
avg_score = 0.0
ntimes = 30

for i in repeat(None, ntimes):

    # train model with 80% of the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # prediction using entropy
    # Note: You can replace 'entropy' by 'gini' to get the classifier to use the gini index criterion.
    model = DecisionTreeClassifier(criterion='entropy')
    model.fit(X_train,y_train)
    predictions = model.predict(X_test)

    # compute model accuracy
    avg_score += accuracy_score(y_test, predictions)

avg_score /= ntimes

**5. print outputs**

In [7]:
print('normalized entropy value: %.3f'% normalizedE)
print('average accuracy score: %.3f' % avg_score)


normalized entropy value: 0.672
average accuracy score: 0.817


**Output the tree dot file**

In [8]:
# output visual (can be visualized with visual code)
tree.export_graphviz(model, out_file='SupervisedLearning/EntropySupervisedModel.dot',
                    feature_names=['age', 'gender'],
                    class_names=sorted(y.unique()),
                    label='all',
                    rounded=True,
                    filled=True)

**6. Earn Your Wings: Do it yourself & Gini index test**

Test this algorithm on your own data set.
Repeat steps 4 and 5 with the gini index criterion, and save the visualization with *gini*. 

Open the dot files in VS Code, take screenshots of both trees and insert them in the text cell below.

In [9]:
# Imports necessary packages
from sklearn import preprocessing

# Create new data frame
df = pd.read_csv('SupervisedLearning/waste water treatment.csv')

# #Write code below to inspect the first five rows of the data frame
df.head()

Unnamed: 0,Variable,VariableDescription,Country,Year,PercentageValue
0,TOTPUBSEW,Total public sewerage (% of resident populatio...,Australia,2010,92.79
1,TOTPUBSEW,Total public sewerage (% of resident populatio...,Australia,2011,93.84
2,TOTPUBSEW,Total public sewerage (% of resident populatio...,Australia,2012,94.1
3,TOTPUBSEW,Total public sewerage (% of resident populatio...,Australia,2013,94.08
4,TOTPUBSEW,Total public sewerage (% of resident populatio...,Australia,2014,92.57


In [10]:
# Create input dataset
X = df

In [11]:
# Write code to inspect the shape of X
X.shape

(3048, 5)

In [12]:
# Clean input dataset
for i in range(1900, 3048): 
    X = X.drop(i)

for i in range(0, 1900):
    if X.VariableDescription[i] == "Unspecified (other) treatment ": 
        X = X.drop(i)

In [13]:
# Write code to inspect the shape of X
X.shape

(1900, 5)

In [14]:
# Create output dataset
y = X['Country']
# y = X.drop(columns = ['VariableDescription','Variable','Year','PercentageValue'])

# Write code to inspect y
y

0       Australia
1       Australia
2       Australia
3       Australia
4       Australia
          ...    
1895      Austria
1896      Austria
1897      Austria
1898      Austria
1899      Austria
Name: Country, Length: 1900, dtype: object

In [15]:
# Create input and output datasets
X = X.drop(columns = ['Country'])
X = X.drop(columns = ['Variable'])

# Write code to inspect X
X

Unnamed: 0,VariableDescription,Year,PercentageValue
0,Total public sewerage (% of resident populatio...,2010,92.7900
1,Total public sewerage (% of resident populatio...,2011,93.8400
2,Total public sewerage (% of resident populatio...,2012,94.1000
3,Total public sewerage (% of resident populatio...,2013,94.0800
4,Total public sewerage (% of resident populatio...,2014,92.5700
...,...,...,...
1895,Public total treatment (connected to a wastewa...,2010,93.8627
1896,Public total treatment (connected to a wastewa...,2012,94.5000
1897,Public total treatment (connected to a wastewa...,2014,94.9676
1898,Public total treatment (connected to a wastewa...,2016,95.2000


In [16]:
# Write code to inspect the shape of X
X.shape

(1900, 3)

In [17]:
# Write code to inspect the shape of y
y.shape

(1900,)

In [18]:
# X is expected to be an array here. If it's a dataframe, get the array version by running:
X = X.values

# Write code to inspect X
X

array([['Total public sewerage (% of resident population connected to urban wastewater collecting system = PUBTOTTR + PUBNOTR)',
        2010, 92.79],
       ['Total public sewerage (% of resident population connected to urban wastewater collecting system = PUBTOTTR + PUBNOTR)',
        2011, 93.84],
       ['Total public sewerage (% of resident population connected to urban wastewater collecting system = PUBTOTTR + PUBNOTR)',
        2012, 94.1],
       ...,
       ['Public total treatment (connected to a wastewater treatment plant = PUBMECTR + PUBBIOTR + PUBADVTR + OTHERTR)',
        2014, 94.9676],
       ['Public total treatment (connected to a wastewater treatment plant = PUBMECTR + PUBBIOTR + PUBADVTR + OTHERTR)',
        2016, 95.2],
       ['Public total treatment (connected to a wastewater treatment plant = PUBMECTR + PUBBIOTR + PUBADVTR + OTHERTR)',
        2018, 95.94]], dtype=object)

In [19]:
# Turns strings under "VariableDescription" column into integers
le_df = preprocessing.LabelEncoder()
# le_df.fit(['Primary treatment','Secondary treatment'])
le_df.fit(['Total public sewerage (% of resident population connected to urban wastewater collecting system = PUBTOTTR + PUBNOTR)','Connected to a wastewater treatment plant without treatment','Primary treatment','Secondary treatment','Tertiary treatment','Population connected to independent treatment','Total treatment ( = PUBTOTTR + INDEPDTR)','Unspecified (other) treatment','Not connected to public sewerage or independent treatment','Public total treatment (connected to a wastewater treatment plant = PUBMECTR + PUBBIOTR + PUBADVTR + OTHERTR)'])
X[:,0] = le_df.transform(X[:,0]) 

In [20]:
# Compute the maximum entropy value
k = y.unique().size
maxE = np.log2(k)
p_data = y.value_counts(normalize=True)           # counts occurrence of each value
entropy = scipy.stats.entropy(p_data)  # get entropy from counts

# normalize the value to be between 0 and 1.
normalizedE = entropy/maxE

# Write code to display the entropy value
entropy

3.3644574543396373

In [21]:
avg_score = 0.0
ntimes = 30

for i in repeat(None, ntimes):

    # train model with 80% of the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # prediction using entropy
    # Note: You can replace 'entropy' by 'gini' to get the classifier to use the gini index criterion.
    model = DecisionTreeClassifier(criterion='entropy')
    model.fit(X_train,y_train)
    predictions = model.predict(X_test)

    # compute model accuracy
    avg_score += accuracy_score(y_test, predictions)

avg_score /= ntimes

In [22]:
print('normalized entropy value: %.3f'% normalizedE)
print('average accuracy score: %.3f' % avg_score)

normalized entropy value: 0.656
average accuracy score: 0.371


In [23]:
# output visual (can be visualized with visual code)
tree.export_graphviz(model, out_file='SupervisedLearning/waste_water_treatment_entropy.dot',
                    feature_names=['VariableDescription', 'PercentageValue', 'Year'],
                    class_names=sorted(y.unique()),
                    label='all',
                    rounded=True,
                    filled=True)

![Screenshot1](/home/SupervisedLearning/zoom2.png)
![Screenshot2](/home/SupervisedLearning/zoom1.png)
![screenshot3](/home/SupervisedLearning/full_view.png)