In this notebook, we present various use cases for categorical values.

In [None]:
# Preamble to run the notebook in the context of the source package.
# The marker below is read by nbval: the output of this cell is not compared
# when the notebook is validated.
# NBVAL_IGNORE_OUTPUT
import sys
# Put the parent directory first on the module search path, so that modules
# located there take precedence when the later cells import them.
sys.path.insert(0, '../')

In [None]:
from IPython.core.display import display, HTML
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcdefaults()
import numpy as np
import pandas as pd
# import pprint

from ptype.Column import Column2ARFF
from ptype.Ptype import Ptype
from ptype.utils import evaluate_types
from utils import *

We discuss two use cases:
- misclassifying categorical values as anomalies due to unsupported characters (Eucalyptus Soil Conservation Dataset),
- merging categorical values when we have string variability issues (USP05 Dataset).


### Eucalyptus Soil Conservation Dataset

In [None]:
# Load the Eucalyptus data, letting pandas infer the dtype of every column.
df = pd.read_csv('../data/eucalyptus.csv')
# Preview the first rows.
df.head()

### The Analytical Task

We use this dataset for an analytical task where we predict the "Utility" of given conditions.

### A Solution using Standard Python Libraries
Let's now develop a simple solution for this problem.

In [None]:
# Columns used as model inputs, and the column to be predicted.
features = ['Sp', 'Locality', 'Altitude', 'Rainfall', 'Ht', 'Surv', 'Vig']
target = ['Utility']

# `target` is a list here, so `y` is a one-column DataFrame, not a Series.
X, y = df[features], df[target]

# Restrict the working frame to the selected columns and preview it.
df = df[[*features, *target]]
df.head()

Both Sp and Locality are categorical columns. However, pandas does not automatically infer categorical types; it labels both columns with the generic `object` dtype instead.

In [None]:
# Show the dtype that pandas assigned to each column.
df.dtypes

In [None]:
# To see the error message, uncomment the `clf.fit(X, y)` line at the end of
# this cell.
# NOTE(review): X still contains the raw string columns 'Sp' and 'Locality'
# at this point, which is presumably what makes the fit fail — confirm.

# Alternative classifier (disabled):
# clf = DecisionTreeClassifier(min_samples_leaf=3, max_leaf_nodes=5, max_depth=4)

clf = LogisticRegression(multi_class="multinomial", max_iter=10000, penalty="l2")

# clf.fit(X, y)

We notice that we need to encode the categorical variables using one-hot encoding.

In [None]:
# One-hot encode the two categorical columns so that every model input is
# numeric.
df = pd.get_dummies(df, columns=['Sp', 'Locality'])

target = 'Utility'
# Keep the DataFrame's own column order. Building this list from a set
# difference made the order depend on string hashing, which differs between
# kernel restarts, so the displayed frame (and the column order seen by the
# model) changed from run to run.
features = [column for column in df.columns if column != target]

X = df[features]
y = df[target].values

# Put the target in the last column and preview the encoded frame.
df = df[features+[target]]
df.head()

In [None]:
# Re-create the classifier for the one-hot encoded data.
clf = LogisticRegression(multi_class="multinomial", max_iter=10000, penalty="l2")
# Alternative classifier (disabled):
# clf = DecisionTreeClassifier(min_samples_leaf=3, max_leaf_nodes=40, max_depth=20)

# Fitting is left disabled here.
# NOTE(review): presumably it still fails because some columns contain
# missing values (the next cells count and drop them) — confirm.
# clf.fit(X, y)

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Remove the rows that lack a value in any of Ht, Surv or Vig, then renumber
# the index so it is contiguous again. `df` is rebound to the result instead
# of being modified with inplace=True: the frame was obtained by column
# selection, and mutating such a frame in place can raise pandas'
# SettingWithCopyWarning.
n = df.shape[0]
df = df.dropna(subset=["Ht", "Surv", "Vig"], axis=0).reset_index(drop=True)
print("# rows deleted = " + str(n-df.shape[0]))

In [None]:
target = 'Utility'
# Rebuild the feature list from the current frame, preserving its column
# order. A set difference would yield a hash-dependent order that changes
# between kernel restarts.
features = [column for column in df.columns if column != target]

X = df[features]
y = df[target].values

clf = LogisticRegression(multi_class="multinomial", max_iter=20000, penalty="l2")
clf.fit(X, y)
# Predictions are made on the same rows the model was fitted on, so the
# figure printed below is a training accuracy.
y_hat = clf.predict(X)

print('Overall Accuracy:', round(metrics.accuracy_score(y, y_hat), 2))

In [None]:
# Class order used for both axes of the confusion matrix.
labels = ['none', 'low', 'average', 'good', 'best']
# `labels` is passed by keyword because current scikit-learn releases accept
# it as a keyword-only argument; the function is reached through the
# explicitly imported `metrics` module rather than a star import.
cm = metrics.confusion_matrix(y, y_hat, labels=labels)

plot_confusion_matrix(cm, labels)
# note that none is not missing data (none<0.1, low<1.48, average<2.65, good<3.83, best<5.00)
# (see https://www.cs.waikato.ac.nz/ml/publications/1996/Thomson-McQueen-96.pdf)

Let's now reproduce the error and see how we can use ptype to resolve the issue.

In [None]:
# Read the file again with every cell kept as a raw string: dtype='str'
# disables type inference and keep_default_na=False stops pandas from turning
# its default missing-value markers into NaN.
df = pd.read_csv('../data/eucalyptus.csv', dtype='str', keep_default_na=False)

# Columns used as model inputs, and the column to be predicted.
features = ['Sp', 'Locality', 'Altitude', 'Rainfall', 'Ht', 'Surv', 'Vig']
target = ['Utility']

# `target` is a list here, so `y` is a one-column DataFrame, not a Series.
X, y = df[features], df[target]

# Restrict the working frame to the selected columns and preview it.
df = df[[*features, *target]]
df.head()

In [None]:
# Create a Ptype instance and let it infer a schema for the string-typed frame.
ptype = Ptype()

schema = ptype.fit_schema(df)
# Display the inferred schema as the cell output.
schema
# pprint.pprint(schema)

In [None]:
# Look at the categorical values stored in the schema entry for 'Locality'.
schema['Locality'].categorical_values

In [None]:
# Print what the schema entry for 'Locality' reports as missing values ...
print(schema['Locality'].get_missing_values())

# ... and what it reports as anomalous values.
print(schema['Locality'].get_anomalous_values())

In [None]:
# Ask the Ptype instance to display its schema (the rendering is done by ptype).
ptype.show_schema()

In [None]:
# Apply the inferred schema to the frame; `df` is rebound to the returned result.
df = ptype.transform_schema(df, schema)

In [None]:
# Preview the first rows of the frame returned by the schema transformation.
df.head()

In [None]:
# Show the dtype of each column after the schema transformation.
df.dtypes

In [None]:
# Every column other than the target whose schema entry has the ARFF type
# 'nominal' is treated as categorical and one-hot encoded.
cat_columns = [column for column in schema if column != 'Utility' and schema[column].arff_type == 'nominal' ]
df = pd.get_dummies(df, columns=cat_columns)

target = 'Utility'
# Keep the DataFrame's own column order. Building this list from a set
# difference made the order depend on string hashing, which differs between
# kernel restarts, so the displayed frame (and the column order seen by the
# model) changed from run to run.
features = [column for column in df.columns if column != target]

X = df[features]
y = df[target].values

# Put the target in the last column and preview the encoded frame.
df = df[features+[target]]
df.head()

In [None]:
# Re-create the classifier for the ptype-based pipeline.
clf = LogisticRegression(multi_class="multinomial", max_iter=10000, penalty="l2")
# Fitting is left disabled here.
# NOTE(review): presumably it fails while missing values are still present
# (the next cells count and drop them) — confirm.
# clf.fit(X, y)

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Remove the rows that lack a value in any of Ht, Surv or Vig, then renumber
# the index so it is contiguous again. `df` is rebound to the result instead
# of being modified with inplace=True: the frame was obtained by column
# selection, and mutating such a frame in place can raise pandas'
# SettingWithCopyWarning.
n = df.shape[0]
df = df.dropna(subset=["Ht", "Surv", "Vig"], axis=0).reset_index(drop=True)
print("# rows deleted = " + str(n-df.shape[0]))

In [None]:
target = 'Utility'
# Rebuild the feature list from the current frame, preserving its column
# order. A set difference would yield a hash-dependent order that changes
# between kernel restarts.
features = [column for column in df.columns if column != target]

X = df[features]
y = df[target].values

clf = LogisticRegression(multi_class="multinomial", max_iter=20000, penalty="l2")
clf.fit(X, y)
# Predictions are made on the same rows the model was fitted on, so the
# figure printed below is a training accuracy.
y_hat = clf.predict(X)

print('Overall Accuracy:', round(metrics.accuracy_score(y, y_hat), 2))

In [None]:
# Class order used for both axes of the confusion matrix.
labels = ['none', 'low', 'average', 'good', 'best']
# `labels` is passed by keyword because current scikit-learn releases accept
# it as a keyword-only argument; the function is reached through the
# explicitly imported `metrics` module rather than a star import.
cm = metrics.confusion_matrix(y, y_hat, labels=labels)

plot_confusion_matrix(cm, labels)
# note that none is not missing data (none<0.1, low<1.48, average<2.65, good<3.83, best<5.00)
# (see https://www.cs.waikato.ac.nz/ml/publications/1996/Thomson-McQueen-96.pdf)

### USP05 Dataset

In [None]:
# Load the USP05 data, letting pandas infer the dtype of every column.
df = pd.read_csv('../data/usp05.csv')
# Preview the first rows.
df.head()