# Data Processing with Scikit Learn
## A continuation of Week 13 data with machine learning
### Anthony Paveglio
---
This project will use the machine learning Python library _scikit-learn_ to study the data presented in Week 13 regarding mushroom attributes. These attributes include physical attributes as well as the classification of whether the mushroom is poisonous or edible. In Week 13 I made comparisons of the data between mushroom odor, color, habitats and their classification (poisonous or edible) to attempt to discover any trends. For example, mushrooms with a specific odor may always be poisonous.

In [56]:
import sklearn
import sklearn.preprocessing
import sklearn.linear_model
import sklearn.model_selection
import pandas
import numpy
import seaborn
import matplotlib

# 1. Importing mushroom data
We must first import the mushroom data file as well as append the correct headers.

In [57]:
# Column names for the data set, in file order, as listed in agaricus-lepiota.names.
agaricusLepiotaHeaders = [
    'class',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises',
    'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color',
    'population',
    'habitat',
]

# Despite the generic .data extension the file is comma separated, so it can be
# read as a CSV; the column names are supplied here rather than taken from the file.
agaricusLepiotaData = pandas.read_csv(
    'agaricus-lepiota.data',
    names=agaricusLepiotaHeaders,
)

# Preview the first ten rows.
agaricusLepiotaData.head(10)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


# 2. Converting to numeric
## More efficient approach than my week 13 submission

In [58]:
# One-hot encode the categorical columns: each (column, value) pair becomes its
# own indicator column named "<column>_<value>" (e.g. class_e, class_p).
binaryCodedData = pandas.get_dummies(data=agaricusLepiotaData)

# Preview the first ten encoded rows.
binaryCodedData.head(n=10)

Unnamed: 0,class_e,class_p,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
5,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
6,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,1,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
8,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
9,1,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


# 3. Linear Regression

In [59]:
# The first two indicator columns (class_e, class_p) are the targets;
# every column after them is used as a feature.
y = binaryCodedData.iloc[:, :2]
x = binaryCodedData.iloc[:, 2:]

# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x, y, test_size=0.33, random_state=42)

linearRegModel = sklearn.linear_model.LinearRegression()

# fit() hands back the fitted estimator, which is kept under its own name
# for use when inspecting the coefficients.
trainingResults = linearRegModel.fit(x_train, y_train)

# Predict both target columns for the held-out rows.
learningResults = linearRegModel.predict(x_test)

columnNames = ['Prediction: Edible', 'Prediction: Poisonous']
learningDataStructured = pandas.DataFrame(learningResults, columns=columnNames)

# Put the true labels next to the predictions. `.values` is used so the
# labels are attached by position rather than aligned on y_test's index.
learningDataStructured['Actual Value: Edible'] = y_test['class_e'].values
learningDataStructured['Actual Value: Poisonous'] = y_test['class_p'].values

learningDataStructured

Unnamed: 0,Prediction: Edible,Prediction: Poisonous,Actual Value: Edible,Actual Value: Poisonous
0,1.000000e+00,3.330669e-16,1,0
1,1.998401e-15,1.000000e+00,0,1
2,2.442491e-15,1.000000e+00,0,1
3,1.000000e+00,-9.992007e-16,1,0
4,1.998401e-15,1.000000e+00,0,1
5,1.776357e-15,1.000000e+00,0,1
6,-2.220446e-16,1.000000e+00,0,1
7,-2.220446e-16,1.000000e+00,0,1
8,1.000000e+00,3.330669e-16,1,0
9,1.000000e+00,6.661338e-16,1,0


In [60]:
# Tabulate the fitted coefficients: one row per target (edible, poisonous),
# one column per feature, labelled with the feature column names.
coefficentData = pandas.DataFrame(
    data=trainingResults.coef_,
    index=['coefficients_e', 'coefficients_p'],
    columns=x.columns,
)

display(coefficentData)

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
coefficients_e,-3.391873e-15,1.31839e-14,-2.969847e-15,-1.44329e-15,-3.774758e-15,-2.164935e-15,-0.010582,-0.010582,-0.010582,-0.010582,...,0.058787,0.058787,0.058787,-0.115716,-0.115716,-0.115716,-0.115716,-0.115716,-0.115716,0.384279
coefficients_p,2.421824e-15,-9.65894e-15,1.776357e-15,1.137979e-15,3.587408e-15,1.44329e-15,0.009724,0.009724,0.009724,0.009724,...,-0.054019,-0.054019,-0.054019,0.11201,0.11201,0.11201,0.11201,0.11201,0.11201,-0.387187
