# Looking at Mushroom dataset from UCI repository : http://archive.ics.uci.edu/ml/datasets/Mushroom                  

## Simple way to load the categorical data using Pandas

In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
print os.getcwd()
data_dir = 'C:/Users/kesj/data/miscData/UCIml/'
data_dir+='mushrooms/'
os.chdir(data_dir)

C:\Users\kesj\Documents\IPython Notebooks\GeneralDataAnalysis\ipythonNotebooks


In [3]:
infile = 'agaricus-lepiota.data'

In [4]:
#column names are in .names file
colNames = ['tgtclass', 'cap-shape','cap-surface','cap-color','bruises','odor',\
'gill-attachment','gill-spacing','gill-size','gill-color','stalk-shape','stalk-root',\
'stalk-surface-above-ring','stalk-surface-below-ring','stalk-color-above-ring',\
'stalk-color-below-ring','veil-type','veil-color','ring-number','ring-type','spore-print-color',\
'population','habitat']
len(colNames) #this is the number of attributes + 1 (corresponding to the tgtclass --> first column)
# 1st column has edible=e; poisonous=p

23

In [6]:
mraw = pd.read_csv(infile,header=None,names=colNames,na_values='?')
np.shape(mraw)

(8124, 23)

##### Replace the '-' in column names with underscore '_'

In [7]:
mraw.columns = map(lambda x: x.lower().replace('-','_'), mraw.columns)

In [8]:
mraw

Unnamed: 0,tgtclass,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


### Cope with missing data. In this example only the 11th attribute 'stalk_root' has missing values, denoted by '?'; mapped to NaN

In [9]:
pd.isnull(mraw).any()

tgtclass                    False
cap_shape                   False
cap_surface                 False
cap_color                   False
bruises                     False
odor                        False
gill_attachment             False
gill_spacing                False
gill_size                   False
gill_color                  False
stalk_shape                 False
stalk_root                   True
stalk_surface_above_ring    False
stalk_surface_below_ring    False
stalk_color_above_ring      False
stalk_color_below_ring      False
veil_type                   False
veil_color                  False
ring_number                 False
ring_type                   False
spore_print_color           False
population                  False
habitat                     False
dtype: bool

In [10]:
# how many are missing?
print mraw['stalk_root'].value_counts()

nmissing = sum(pd.isnull(mraw['stalk_root']))
ntotal = len(mraw)
print "\n\n%f percent missing for this attribute" % (nmissing/float(ntotal) * 100)

b    3776
e    1120
c     556
r     192
dtype: int64


30.526834 percent missing for this attribute


In [11]:
mraw.tgtclass.value_counts()

e    4208
p    3916
dtype: int64

## Aside on how missing values are assigned within a pipeline
There are several options
1. is to use pd.get_dummies 
    * if missing you need to fillna first as some other value
2. is to use sklearn.feature_extraction.DictVectorizer
3. is to use sklearn.*.One_hot_encoder

In [17]:
sample = pd.concat([mraw.head(),mraw.tail()])


In [18]:
pd.get_dummies(sample.stalk_root)

Unnamed: 0,c,e
0,0,1
1,1,0
2,1,0
3,0,1
4,0,1
8119,0,0
8120,0,0
8121,0,0
8122,0,0
8123,0,0


In [19]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

In [20]:
DVpipe = Pipeline([ ('dv_stalkroot', DictVectorizer())])

In [24]:
DVpipe.fit(sample.stalk_root.as_matrix())

AttributeError: 'str' object has no attribute 'iteritems'

<h1> I want to convert from characters to integers for use with Marketbasket analysis.</h1>
<p> The plan is to loop through each column, map letters to integers and add 10*column number to the value.</p>

In [None]:
colCount = 0
cdValues =[]
for col in mraw.columns:
    print len(mraw[col].unique()), colCount*10
    cdValues.append(mraw[col].unique())
    colCount+=1

In [None]:
cdValues

### since this is approx 30% of the examples; probably best to omit this column for now.

In [None]:
#Explore a couple of attributes
print colNames[5]
print mraw['odor'].value_counts()
print "--------------------------"
print colNames[22]
print mraw['habitat'].value_counts()
# note that they are already coded by 1 letter abbreviations


In [None]:
#make a little bar plot example
fig, axes = plt.subplots(2,1)

mraw['odor'].value_counts().plot(kind='bar',color='c',ax=axes[0])

mraw['habitat'].value_counts().plot(kind='bar',color='m',ax=axes[1])


In [None]:
# nifty tool to see cross-tabulation of features
pd.crosstab(mraw['habitat'],mraw['odor'])

### DROP stalk-root column

In [None]:
mm = mraw.dropna(axis=1)
cnames = mm.columns
shape(mm)

### Explicitly pull out target class and map to a binary class

In [None]:
mmY = mm['tgtclass'].values
mushY = map(lambda x: 0 if x=='e' else 1, mmY)
# map edible=e to '0', poisonous=p to 1 (only 2 classes here)

In [None]:
print "%g are 1 (poisonous) and %g are 0 (edible) \n" % (len(filter(lambda x: x == 1, mushY)), len(filter(lambda x: x==0,mushY)))

### Find out how many values are present for each attribute in this dataset

In [None]:
attributeCounts = []
for i in xrange(1,len(mm.columns)):
    colname = str(cnames[i])
    print i, colname
    #le.fit(mraw[colname])
    #b = le.transform(mraw[colname])
    aa = mm[colname].value_counts()
    print aa
    print "------------------------------------"
    attributeCounts.append([colname, len(aa), aa.keys(), aa.values])

In [None]:
len(attributeCounts)

In [None]:
total_attribute_space = 0 # find out how large of an array is required if a one-hot encoder is used; more on this later
for i in xrange(0,len(attributeCounts)):
    print i, attributeCounts[i][0],attributeCounts[i][1]
    total_attribute_space +=attributeCounts[i][1]
        
print "---------------------------------------------"
print "\n this corresponds to an expanded set of %g features from %g initial features" % (total_attribute_space,len(attributeCounts))

### This would be tedious to encode manually for each value of each attribute

#### I need to create a dictionary for each column  import the dictionary that is located in the .names file

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [None]:
Ncol = len(mm.columns)
cnames = mm.columns

In [None]:
mshape = shape(mm)
Nobserv = len(mm)

mnew = pd.DataFrame(np.arange(mshape[0]*Ncol).reshape(mshape[0],mshape[1]),index=arange(mshape[0]),columns=mm.columns)
# create a new data frame with empty values
shape(mnew)

In [None]:
# transform each column into categorical labels
for i in xrange(0,len(mm.columns)):
    colname = str(cnames[i])
    #print i, colname,mm[colname].ix[0]
    le.fit(mm[colname])
    b = le.transform(mm[colname])
    mnew[colname] = b

In [None]:
# Nifty tool to explore relationships between the elements
from pandas.tools.plotting import scatter_matrix

scatter_matrix(mnew[['odor','spore_print_color','habitat','cap_color']], figsize=(12, 12), marker='o',alpha=0.3);

# Convert to arrays for analysis using a Decision Tree

In [None]:
#mushY already created above
mushX = mnew.drop(['tgtclass'],axis=1).values

In [None]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()


In [None]:
#little script for visualizing/saving Decision Tree
from StringIO import StringIO
from IPython.core.display import HTML
dot_data = StringIO()

fullDT_clf = clf.fit(mushX,mushY)
dot_data = tree.export_graphviz(fullDT_clf,out_file=dot_data)
#print dot_data.getvalue()

import pydot
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('full_tree.png')
HTML('<img src="full_tree.png">')


In [None]:
predY = fullDT_clf.predict(mushX)

In [None]:
from sklearn import metrics 
print metrics.confusion_matrix(mushY,predY)
print metrics.accuracy_score(mushY,predY)

### now assess the predictive power using a training set

In [None]:
#simple cross_validation
from sklearn import cross_validation
# withhold 30% for testing; setting seed to a value so this set is reproducible
X_train, X_test, y_train,y_test = cross_validation.train_test_split(mushX,mushY,test_size=0.3,random_state=42)
print X_train.shape,y_train.shape
print X_test.shape, y_test.shape

dt_clf = tree.DecisionTreeClassifier().fit(X_train,y_train)
dot_data = StringIO()
dot_data = tree.export_graphviz(dt_clf,out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('dt_test30pct.png')

In [None]:
from sklearn import metrics
predY = dt_clf.predict(X_test)
score = dt_clf.score(X_test, y_test)
print score
print metrics.precision_score(y_test,predY)

In [None]:
metrics.confusion_matrix(y_test,predY)

In [None]:
len(y_train[y_train == 1])

## Explore the key Features

### features used in the splits: X[8], X[18], X[19], X[20], X[7], X[4], X[12], X[16], X[9], X[11], X[1], X[3], X[2]

In [None]:
mnew[str(cnames[9])].value_counts()

In [None]:
mnew.groupby('tgtclass')[str(cnames[19])].value_counts()

In [None]:
gg = mnew[mnew[str(cnames[9])] <= 3.5].groupby('tgtclass')
gg.gill_color.count()

In [None]:
mushX[:10,8]

In [None]:
mnew[str(cnames[9])].head(10)

In [None]:
print cnames[18], cnames[19],cnames[7],cnames[20]

In [None]:
X[:10,6]

In [None]:
len((y[X[:,6] <= 0.5] == 1).nonzero()[0])

In [None]:
len((mushX[:,8] <= 3.5).nonzero()[0])

In [None]:
len((mushY[mushX[:,8] >3.5] == 1).nonzero()[0])

In [None]:
mm.gill_color.unique()

In [None]:
print mm.gill_color.head(13), mushX[:13,8]

In [None]:
mushX[:13,8]

In [None]:
aindx = (mm.gill_color == 'h').nonzero()[0][0]
print aindx
print mushX[aindx,8]

In [None]:
aindx = (mm.gill_color == 'o').nonzero()[0][0]
print aindx
print mushX[aindx,8]

In [None]:
# list the left hand side split values in decision tree
mnew[mnew['gill_color'] <= 3.5].groupby('tgtclass')['tgtclass'].value_counts()

In [None]:
mnew['gill_color'].value_counts()

### Documentation in .names file suggests logical rules including p1 odor=Not(almond.OR.anis.OR.none) captures 98.5 % accuracy

#### Why did DT miss this?

#### odor corresponds to X[4]

In [None]:
print cnames[5]
print mm[str(cnames[5])].value_counts()
print unique(mushX[4])

#### almond = a = 0; anise = l = 3; none = n = 5 from the LabelEncoder used

In [None]:
# used to dermine above results; change name to desired key
search_char = 'n'
aindx = (mm.odor == search_char).nonzero()[0][0]
#print aindx
print search_char, mushX[aindx,4]

### So we missed this with our DT because the keys for splitting on odor are not continous (just assigned as encountered by the LabelEncoder

In [None]:
#define groups using this rule
p1odor = mm[(mm.odor == 'n') | (mm.odor == 'l') | (mm.odor == 'a')]
notp1odor = mm[(mm.odor != 'n') & (mm.odor != 'l') & (mm.odor != 'a')]

In [None]:
print p1odor.groupby('tgtclass')['tgtclass'].count()
print "_______________________________"
notp1odor.groupby('tgtclass')['tgtclass'].count()

### Indeed this only misses 120 out of 8124 observations (i.e. fp = 120) so accuracy is 98.52%

# If we use an expanded attribute set such as one-hot (i.e. map each value a given attribute can take to a new dimension) can we identify this type of rule?

In [None]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit(mushX)
mushX2 = enc.transform(mushX).toarray()
#mushX2 is the extended attribute space now
shape(mushX2)

## Repeat DT on these 112 attributes

In [None]:
#little script for visualizing/saving Decision Tree
fullDT_X_clf = clf.fit(mushX2,mushY)
dot_data = StringIO()
dot_data = tree.export_graphviz(fullDT_X_clf,out_file=dot_data)

graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('full_X_tree.png')
#HTML('<img src="full_X_tree.png">')

### Now the split involves 9 attributes: 27, 20, 95, 24, 58, 28, 1, 35, 7

In [None]:
splitAttrib = [27,20,95,24,58,28,1,35,7]

In [None]:
print len(enc.feature_indices_)
enc.feature_indices_

In [None]:
enc.n_values_

In [None]:
splitAttribOrigIND = [4,3, 18,4,11,4,0,7,1]


In [None]:
[cnames[x+1] for x in splitAttribOrigIND]

In [None]:
# Breakdown of rhs of first split in full_DT_X
print mnew[mnew.odor == 5].groupby('tgtclass')['tgtclass'].count()
print "________________________"
# Breakdown of lhs of first split in full_DT_X
print mnew[mnew.odor != 5].groupby('tgtclass')['tgtclass'].count()

In [None]:
# used to dermine above results; change name to desired key
search_char = 'p'
aindx = (mm.odor == search_char).nonzero()[0][0]
#print aindx
print search_char, mushX[aindx,4]

## Look at the accuracy as it depends upon the depth (i.e. complexity of the Decision Tree)

In [None]:
dmax=3
three_clf = tree.DecisionTreeClassifier(max_depth=dmax)
fullDT_three= three_clf.fit(mushX2,mushY)
dot_data = StringIO()
dot_data = tree.export_graphviz(fullDT_three,out_file=dot_data)

graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('full_three_tree.png')

In [None]:
#check the accuracy and precision 
predY = fullDT_three.predict(mushX2)
score = fullDT_three.score(mushX2,mushY)
print score
#print metrics.precision_score(mushY,predY) gives same result as 'score' for categorical assignment
print metrics.precision_score(mushY,predY)

#### Note that if we do the same level of complexity using the original 21 attributes we get the follwoing

In [None]:
fullDT_three_oa= three_clf.fit(mushX,mushY)
dot_data = StringIO()
dot_data = tree.export_graphviz(fullDT_three_oa,out_file=dot_data)

graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('full_three_OrigAttributes_tree.png')
#check the accuracy and precision 
predYoa = fullDT_three_oa.predict(mushX)
score = fullDT_three_oa.score(mushX,mushY)
print score
#print metrics.precision_score(mushY,predY) gives same result as 'score' for categorical assignment
print metrics.precision_score(mushY,predYoa)

#### Consider a max_depth of 3: dmax = 3 and a training sample that is 70% of the data

In [None]:
#simple cross_validation
from sklearn import cross_validation
# withhold 30% for testing; setting seed to a value so this set is reproducible
X2_train, X2_test, y2_train,y2_test = cross_validation.train_test_split(mushX2,mushY,test_size=0.3,random_state=42)
X_train, X_test, y_train,y_test = cross_validation.train_test_split(mushX,mushY,test_size=0.3,random_state=42)
print X_train.shape,y_train.shape,X2_train.shape,y2_train.shape
print X_test.shape, y_test.shape,X2_test.shape,y2_test.shape

clf_train7_3_oa = three_clf.fit(X_train,y_train)
dot_data = StringIO()
dot_data = tree.export_graphviz(clf_train7_3_oa,out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('dt_train70pct_dmax3_oa.png')
oaYpred = clf_train7_3_oa.predict(X_test)

clf_train7_3_x = three_clf.fit(X2_train,y2_train)
dot_data = StringIO()
dot_data = tree.export_graphviz(clf_train7_3_x,out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('dt_train70pct_dmax3_x.png')

#check the accuracy and precision 

xYpred = clf_train7_3_x.predict(X2_test)

print "\nUsing original 21 attributes accuracy is %.3f" % metrics.accuracy_score(y_test,oaYpred)
print "Using extended 112 attributes accuracy is %.3f" % metrics.accuracy_score(y2_test,xYpred)


# What about Random Forests?

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=10)
# use above splits to test & train upon
rf_clf.fit(X_train,y_train)
print "Accuracy using original 21 attributes is %.3f" % rf_clf.score(X_test,y_test)

rf_clf.fit(X2_train,y2_train)
print "Accuracy using extended 112 attributes is %.3f" % rf_clf.score(X2_test,y2_test)

In [None]:
rf2 = RandomForestClassifier(n_estimators=10,compute_importances=True,max_depth=3)

In [None]:
rf2.fit(X_train,y_train)
print "Accuracy using original 21 attributes is %.3f" % rf2.score(X_test,y_test)
bar(arange(21),rf2.feature_importances_,color='r')

In [None]:
rf2.fit(X2_train,y2_train)
print "Accuracy using extended 112 attributes is %.3f" % rf2.score(X2_test,y2_test)

bar(arange(112),rf2.feature_importances_)


In [None]:
rf2 = RandomForestClassifier(n_estimators=100,compute_importances=True,max_depth=3)
rf2.fit(X2_train,y2_train)
print "Accuracy using extended 112 attributes is %.3f" % rf2.score(X2_test,y2_test)

bar(arange(112),rf2.feature_importances_)


In [None]:
rf2 = RandomForestClassifier(n_estimators=100,compute_importances=True,max_depth=3)
rf2.fit(X_train,y_train)
print "Accuracy using original 21 attributes is %.3f" % rf2.score(X_test,y_test)

bar(arange(21),rf2.feature_importances_,color='r')

## Define another rule from .names

In [None]:
#dfine groups using rule P4 habitat=leaves.and.cap_color=white
ruleP4 = mm[(mm.habitat =='l') & (mm.cap_color == 'w')]
notruleP4 = mm[(mm.habitat !='l') & (mm.cap_color != 'w')]
#output resuls
print ruleP4.groupby('tgtclass')['tgtclass'].count()
print "_______________________________"
notruleP4.groupby('tgtclass')['tgtclass'].count()

In [None]:
#create a series of training sizes from 0 to 1
trnsize = 0.05*arange(21)
trnsize

In [None]:
i=0
for t in trnsize:
    X_train, X_test, y_train,y_test = cross_validation.train_test_split(mushX,mushY,test_size=1-t)
    
    i+=1
    

# Create a sample market basket file (csv format) 

In [None]:
ntrans = 1000
maxitemcode = 50
maxitems = 20 # maximum number of items in a transaction
itemsetcodes = arange(1,maxitemcode)
marketbasket = []
for i in xrange(0,ntrans):
    kitems = random.randint(1,maxitems)
    #print i
    transaction = []
    for k in range(kitems):
        jitem = random.randint(1,maxitemcode)
        print jitem,
        transaction.append(jitem)
    print
    marketbasket.append(transaction)
    

In [None]:
transaction, marketbasket


### output to csv file.

In [None]:
import csv
with open('marketbask1.csv','w') as csvfile:
    fwriter = csv.writer(csvfile, delimiter=',')
    for i in range(len(marketbasket)):
        fwriter.writerow(marketbasket[i])
csvfile.close()

In [None]:
key = 47
count = 0
for transaction in marketbasket:
    for x in transaction:
        if x == key: 
            count += 1

print key, count