In [6]:
# Custom libraries
from datascienceutils import plotter
from datascienceutils import analyze
from datascienceutils import predictiveModels as pm
from datascienceutils import sklearnUtils as sku

from IPython.display import Image
# Standard libraries
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random

from sklearn import cross_validation
from sklearn import metrics

from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
output_notebook(bokeh.resources.INLINE)

# Set pandas display options
# `pd.util.terminal` no longer exists in pandas; the standard library offers the
# same (columns, lines) query and falls back to 80x24 when no terminal is attached.
import shutil
pd.set_option('display.width', shutil.get_terminal_size()[0])
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', 800)

ModuleNotFoundError: No module named 'datascienceutils'

In [2]:
# Source: the UCI machine-learning repository,
# https://archive.ics.uci.edu/ml/machine-learning-databases/audiology/

# Keep the raw records as a list of lines (each still carrying its newline).
with open('./data/audiology.data', 'r') as data_file:
    data = list(data_file)

In [3]:
from pprint import pprint

# Print the dataset description file that accompanies the data, line by line.
with open('./data/audiology.names', 'r') as names_file:
    pprint(list(names_file))

 '         used for any publication whatsoever.\n',
 '\n',
 '1. Title: Audiology Database\n',
 '\n',
 '2. Sources:\n',
 '    (a) Original Owner: Professor Jergen at Baylor College of Medicine\n',
 '    (b) Donor: Bruce Porter (porter@fall.cs.utexas.EDU)\n',
 '    (c) Date Received: 12/3/1987\n',
 '\n',
 '3. Past Usage: \n',
 '   -- See: Bareiss, E. Ray, & Porter, Bruce (1987).  Protos: An '
 'Exemplar-Based\n',
 '      Learning Apprentice.  In the Proceedings of the 4th International\n',
 '      Workshop on Machine Learning, 12-23, Irvine, CA: Morgan Kaufmann.\n',
 '\n',
 '4. Relevant Information:\n',
 '   -- Contact Ray Bareiss (rbareiss@uunet.uucp ??), now at Vanderbilt \n',
 '      University, for more information.\n',
 '   -- Domain expert: Professor Craig Wier of the University of Texas, '
 'Austin.\n',
 '\n',
 '5. Number of instances: 200 training cases, 26 test cases\n',
 '\n',
 '6. Number of attributes: ???\n',
 '\n',
 '7. Attribute information: (all attributes are nominally va

In [4]:

# Every distinct observation label seen so far; parse_line() adds to it.
all_obs = set()

def parse_line(line):
    """Split one raw record into ``[case_id, classification, observations]``.

    The record is treated as comma-separated text: the first field is the
    case id, the second the classification, and everything after that is the
    list of observations, optionally wrapped in square brackets.

    Side effect: every non-empty observation is added to the module-level
    ``all_obs`` set.

    Args:
        line: One line of the data file (a trailing newline is fine).

    Returns:
        A three-element list ``[case_id, classification, observations]``
        where ``observations`` is the comma-joined observation labels, or an
        empty string when the record carries none.

    Raises:
        IndexError: If the line has fewer than two comma-separated fields
            (for example a blank line).
    """
    # Trim whitespace first so a trailing space cannot shield the closing
    # bracket from being stripped.
    record = line.strip().strip(']').strip('[')
    fields = [field.strip() for field in record.split(',')]
    caseid = fields[0]
    classif = fields[1]
    descs = fields[2:]
    if descs:
        # The opening bracket of the observation list sticks to its first entry.
        descs[0] = descs[0].strip('[').strip()
    # An empty list ("[]") would otherwise register '' as an observation.
    descs = [obs for obs in descs if obs]
    # Mutating the set needs no `global` declaration.
    all_obs.update(descs)
    return [caseid, classif, ','.join(descs)]

In [5]:
# Parse every non-blank record, then build the frame in a single step
# (growing a DataFrame one `.loc` row at a time re-allocates on every insert).
parsed_rows = []
line_numbers = []
for idx, each in enumerate(data):
    # Lines produced by readlines() keep their trailing newline, so a plain
    # truthiness check never skips anything; test the stripped text instead.
    if each.strip():
        parsed_rows.append(parse_line(each))
        line_numbers.append(idx)

# The index keeps the original line position of each record in the file.
audiology_df = pd.DataFrame(
    parsed_rows,
    index=line_numbers,
    columns=['case_id', 'classification', 'case_features'],
)
    

NameError: name 'pd' is not defined

In [None]:
# Preview the first rows of the parsed cases.
audiology_df.head()

## It looks like `case_features` holds text labels/observations recorded by doctors. Let's split them into individual features and make each one boolean.

In [None]:
# Rows per classification; count() reports the non-null entries of every other column.
print(audiology_df.groupby('classification').count())

In [None]:
# Turn the comma-joined observations into one boolean column per label.
# Each row is split into a set once so that membership is an exact match;
# a substring test on the joined text would also fire for any label that
# happens to be contained in a longer one.
observed_sets = audiology_df['case_features'].apply(lambda feats: set(feats.split(',')))
# sorted() fixes the column order, which otherwise follows set iteration
# order and can differ between kernel sessions.
for ea in sorted(all_obs):
    audiology_df[ea] = observed_sets.apply(lambda observed, name=ea: name in observed)
# `axis` must be passed by keyword: pandas >= 2.0 rejects it positionally.
audiology_df.drop('case_features', axis=1, inplace=True)

In [None]:
# Preview the frame now that each observation has its own boolean column.
audiology_df.head()

## Okay, based on the data sample above, the only meaningful thing we can try is to predict the case classification from the observed features.

## We have 87 features (I'm assuming these are labels that came out of human judgment), and most of them are False, i.e. this is a sparsely populated dataset in these dimensions, and most likely the dimensions are not orthogonal to (i.e. independent of) each other.

## For these reasons:
* a tree-based prediction is best (since all the features are boolean)
* XGBoost, since the features are mostly False/empty (i.e. sparse features)


In [1]:
# NOTE(review): this repeats the preview a few cells up and was last run as the
# first cell of a fresh kernel, before audiology_df existed — consider removing it.
audiology_df.head()

NameError: name 'audiology_df' is not defined

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode the class names as integer ids; `le` is kept so the ids can later be
# mapped back with le.inverse_transform().
le = LabelEncoder()
audiology_df['classification'] = le.fit_transform(audiology_df['classification'])
target = audiology_df['classification']

# Leave only the feature columns in the frame. `axis` must be passed by
# keyword: pandas >= 2.0 rejects it as a positional argument.
audiology_df.drop(['case_id', 'classification'], axis=1, inplace=True)


In [None]:
# Preview the feature matrix after the id and label columns were dropped.
audiology_df.head()

In [None]:
# Hold out 30% of the cases for evaluation. A fixed random_state makes the
# split, and therefore every number printed below, reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    audiology_df, target, test_size=0.3, random_state=42)
tree_model = pm.train(X_train, y_train, 'tree')
# NOTE(review): pm.train() may already return a fitted model, which would make
# this second fit redundant — confirm against datascienceutils.
tree_model.fit(X_train, y_train)
# Predict once and reuse the result.
tree_predictions = tree_model.predict(X_test)
# The mean squared error
# NOTE(review): the target holds label-encoded class ids, so the squared
# distance between ids has no natural meaning — an accuracy-style metric is
# probably what is wanted here.
print("Mean squared error: %.2f"
      % np.mean((tree_predictions - y_test) ** 2))
# NOTE(review): for scikit-learn classifiers score() is mean accuracy rather
# than explained variance — confirm which kind of model pm.train() returns.
print('Variance score: %.2f' % tree_model.score(X_test, y_test))

In [None]:
# Presumably renders the fitted tree via the project's plotter helper — see datascienceutils.
plotter.show_tree_model(tree_model, model_type='tree')

In [None]:
# Fit the 'xgboost' model on the same training split
xgb_model = pm.train(X_train, y_train, 'xgboost')
xgb_model.fit(X_train, y_train)
# Squared prediction error on the held-out cases, averaged for the report
xgb_squared_errors = (xgb_model.predict(X_test) - y_test) ** 2
print("Mean squared error: %.2f" % np.mean(xgb_squared_errors))

In [None]:
# Presumably renders the fitted xgboost model via the project's plotter helper — see datascienceutils.
plotter.show_tree_model(xgb_model, model_type='xgboost')