In [1]:
# Import dependencies 
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load data (combined and clean)
import csv
data = pd.read_csv('C:/Users/julia/OneDrive/Desktop/UNC_Class_Folder/Team1_FinalProject/Resources/mushrooms_combined_df_clean.csv')
data.head(6)


Unnamed: 0.1,Unnamed: 0,Poisonous or Edible,Cap-Shape,Cap-Surface,Cap-Color,Bruises,Odor,Gill-attachment,gill-spacing,Gill-size,...,stalk-surface-below-ring,stalk-color-above-ring,stalk color below ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,p,x,s,n,t,p,f,c,n,...,s,w,w,p,w,o,p,k,s,u
1,1,e,x,f,n,f,n,f,w,b,...,f,w,w,p,w,o,e,k,a,g
2,2,e,x,s,w,f,n,f,w,b,...,f,w,w,p,w,o,e,k,s,g
3,3,e,x,f,g,f,n,f,w,b,...,f,w,w,p,w,o,e,k,s,g
4,4,e,f,f,w,f,n,f,w,b,...,f,w,w,p,w,o,e,n,a,g
5,5,e,x,f,w,f,n,f,w,b,...,f,w,w,p,w,o,e,k,s,g


In [3]:
# Checking null values
data.isnull().sum()

Unnamed: 0                  0
Poisonous or Edible         0
Cap-Shape                   0
Cap-Surface                 0
Cap-Color                   0
Bruises                     0
Odor                        0
Gill-attachment             0
gill-spacing                0
Gill-size                   0
Gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk color below ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [7]:
# Checking # of Unique Values 
data['Poisonous or Edible'].unique()

array(['p', 'e'], dtype=object)

^ We have two classifications: Poisonous or Edible 

In [8]:
data.shape

(5644, 24)

^ We have 23 features (row 1 = title) and 5644 instances 

In [9]:
# Convert all unique values to integers 
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
for col in data.columns:
    data[col] = labelencoder.fit_transform(data[col])
 
data.head()

Unnamed: 0.1,Unnamed: 0,Poisonous or Edible,Cap-Shape,Cap-Surface,Cap-Color,Bruises,Odor,Gill-attachment,gill-spacing,Gill-size,...,stalk-surface-below-ring,stalk-color-above-ring,stalk color below ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,1,5,2,4,1,6,1,0,1,...,2,5,5,0,0,1,3,1,3,5
1,1,0,5,0,4,0,5,1,1,0,...,0,5,5,0,0,1,0,1,0,1
2,2,0,5,2,6,0,5,1,1,0,...,0,5,5,0,0,1,0,1,3,1
3,3,0,5,0,3,0,5,1,1,0,...,0,5,5,0,0,1,0,1,3,1
4,4,0,2,0,6,0,5,1,1,0,...,0,5,5,0,0,1,0,2,0,1


In [12]:
#Check the encoded values 
data['stalk-color-above-ring'].unique()

array([5, 3, 0, 4, 2, 1, 6])

In [14]:
print(data.groupby('Poisonous or Edible').size())

Poisonous or Edible
0    3488
1    2156
dtype: int64


In [16]:
#Separating features and labels 
X = data.iloc[:,1:24]  # all rows, all the features and no labels
y = data.iloc[:, 0]  # all rows, label only
X.head()
y.head()

0    0
1    1
2    2
3    3
4    4
Name: Unnamed: 0, dtype: int64

In [17]:
X.describe()

Unnamed: 0,Poisonous or Edible,Cap-Shape,Cap-Surface,Cap-Color,Bruises,Odor,Gill-attachment,gill-spacing,Gill-size,Gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk color below ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,5644.0,5644.0,5644.0,5644.0,5644.0,5644.0,5644.0,5644.0,5644.0,5644.0,...,5644.0,5644.0,5644.0,5644.0,5644.0,5644.0,5644.0,5644.0,5644.0,5644.0
mean,0.381999,3.420269,1.627215,4.272856,0.564139,3.564848,0.996811,0.181432,0.124734,3.536853,...,1.619419,3.949681,3.932672,0.0,0.001417,1.014883,2.096386,1.21545,3.71297,1.236003
std,0.485919,1.659641,1.336497,1.838018,0.495913,1.765806,0.056388,0.38541,0.330447,2.281428,...,0.72162,1.526058,1.525402,0.0,0.037625,0.1656,1.192716,1.059125,1.328741,1.597981
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,3.0,0.0,2.0,1.0,0.0,0.0,2.0,...,1.0,3.0,3.0,0.0,0.0,1.0,1.0,0.0,3.0,0.0
50%,0.0,5.0,2.0,4.0,1.0,5.0,1.0,0.0,0.0,4.0,...,2.0,5.0,5.0,0.0,0.0,1.0,3.0,1.0,4.0,1.0
75%,1.0,5.0,3.0,6.0,1.0,5.0,1.0,0.0,0.0,6.0,...,2.0,5.0,5.0,0.0,0.0,1.0,3.0,2.0,5.0,1.0
max,1.0,5.0,3.0,7.0,1.0,6.0,1.0,1.0,1.0,8.0,...,3.0,6.0,6.0,0.0,1.0,2.0,3.0,5.0,5.0,5.0


In [18]:
y.head()

0    0
1    1
2    2
3    3
4    4
Name: Unnamed: 0, dtype: int64

In [19]:
#Correlation of columns in dataframe
data.corr()

Unnamed: 0.1,Unnamed: 0,Poisonous or Edible,Cap-Shape,Cap-Surface,Cap-Color,Bruises,Odor,Gill-attachment,gill-spacing,Gill-size,...,stalk-surface-below-ring,stalk-color-above-ring,stalk color below ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
Unnamed: 0,1.0,-0.326163,-0.06217,0.160532,-0.258951,0.571745,0.246979,-0.096958,-0.227305,0.087383,...,0.84246,0.164383,0.165644,,0.064548,0.132033,0.633978,0.439505,0.1916,-0.215836
Poisonous or Edible,-0.326163,1.0,0.053155,0.046859,0.150741,-0.435562,-0.455566,-0.071945,-0.26416,0.215289,...,-0.363604,-0.317244,-0.308613,,0.047921,0.008615,-0.2152,-0.507034,0.203882,0.297412
Cap-Shape,-0.06217,0.053155,1.0,-0.068688,-0.056421,-0.097782,1.1e-05,0.002963,0.051432,0.103443,...,-0.02978,-0.030272,-0.0297,,-0.043596,-0.118191,-0.02942,-0.062007,0.048283,-0.057451
Cap-Surface,0.160532,0.046859,-0.068688,1.0,0.093355,0.22859,-0.108697,-0.058104,-0.204448,-0.042356,...,0.162293,0.045018,0.046099,,0.038702,0.044289,0.190188,0.054747,-0.00039,0.106918
Cap-Color,-0.258951,0.150741,-0.056421,0.093355,1.0,-0.165794,-0.400652,0.059693,0.062189,0.102676,...,-0.047284,0.046088,0.056865,,0.055906,-0.006358,-0.142629,-0.105214,-0.171096,0.196001
Bruises,0.571745,-0.435562,-0.097782,0.22859,-0.165794,1.0,0.235055,0.064351,-0.439183,-0.040178,...,0.552523,0.243577,0.256369,,-0.042863,0.122162,0.861993,0.292185,0.196296,-0.114367
Odor,0.246979,-0.455566,1.1e-05,-0.108697,-0.400652,0.235055,1.0,-0.01394,0.195187,0.049305,...,0.133688,0.131815,0.122807,,0.030623,0.094874,0.041458,0.397319,-0.059285,-0.223349
Gill-attachment,-0.096958,-0.071945,0.002963,-0.058104,0.059693,0.064351,-0.01394,1.0,0.02663,0.021353,...,-0.108225,0.10934,0.108756,,0.002131,0.346682,0.004571,-0.202135,0.115499,0.043755
gill-spacing,-0.227305,-0.26416,0.051432,-0.204448,0.062189,-0.439183,0.195187,0.02663,1.0,0.178484,...,-0.266521,0.326465,0.302918,,0.080026,-0.042316,-0.596261,0.286257,-0.59591,-0.106367
Gill-size,0.087383,0.215289,0.103443,-0.042356,0.102676,-0.040178,0.049305,0.021353,0.178484,1.0,...,0.169387,0.262655,0.233227,,0.099801,-0.033931,0.21049,0.287764,-0.008851,0.169763


In [20]:
#Standardize the data - Scale the data to be between -1 and 1
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X=scaler.fit_transform(X)
X

array([[ 1.2719319 ,  0.95193532,  0.27895188, ..., -0.20344067,
        -0.53662286,  2.35567928],
       [-0.78620561,  0.95193532, -1.21763027, ..., -0.20344067,
        -2.79459959, -0.14770122],
       [-0.78620561,  0.95193532,  0.27895188, ..., -0.20344067,
        -0.53662286, -0.14770122],
       ...,
       [-0.78620561,  0.95193532,  1.02724296, ...,  3.573596  ,
         0.96869495,  1.72983415],
       [ 1.2719319 ,  0.95193532,  1.02724296, ...,  3.573596  ,
        -2.04194068, -0.77354635],
       [ 1.2719319 , -0.85584494,  1.02724296, ...,  3.573596  ,
        -2.04194068, -0.77354635]])