In [1]:
import numpy as np
import pandas as pd

wine_all = pd.read_csv('winequality.csv')
wine_all.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [2]:
dropped_wine = ['type']
wine         = wine_all.drop(dropped_wine, axis = 1)
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
wine.isnull().sum()

fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

In [4]:
wine = wine.dropna()
wine.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [5]:
# factorize using astype
# wine['quality'] = wine['quality'].astype('category')

In [6]:
#Creating test and train data
wine['is_train'] = np.random.uniform(0, 1, len(wine)) <= .80
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,is_train
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,True
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,True
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,False
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,True
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,True


In [7]:
#Creating dataframes with test rows and training rows
train, test = wine[wine['is_train'] == True], wine[wine['is_train'] == False]

#Show the number of observations for the test and training dataframes
print('Number of observations in the training data : ', len(train))
print('Number of observations in the test data : ', len(test))

Number of observations in the training data :  5163
Number of observations in the test data :  1300


In [8]:
#Create a list of the feature column's names
features = wine.columns[:-2]
features

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')

In [9]:
#Converting each species name into digits
y = train['quality']
y

0       6
1       6
3       6
4       6
5       6
       ..
6489    6
6490    6
6492    5
6494    6
6496    6
Name: quality, Length: 5163, dtype: int64

In [10]:
#Loading scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

#Creating a random forest classifier
clf = RandomForestClassifier(n_jobs = 2, random_state = 0, n_estimators= 100)
clf.fit(train[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [11]:
clf.fit(train[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [12]:
#Viewing the predicted probabilities of the first 10 observations
clf.predict_proba(test[features])[0:10]

array([[0.  , 0.03, 0.12, 0.79, 0.04, 0.02, 0.  ],
       [0.  , 0.  , 0.23, 0.74, 0.03, 0.  , 0.  ],
       [0.01, 0.03, 0.21, 0.74, 0.01, 0.  , 0.  ],
       [0.  , 0.06, 0.34, 0.4 , 0.19, 0.01, 0.  ],
       [0.  , 0.04, 0.1 , 0.69, 0.15, 0.02, 0.  ],
       [0.02, 0.07, 0.31, 0.49, 0.1 , 0.01, 0.  ],
       [0.  , 0.  , 0.12, 0.58, 0.24, 0.06, 0.  ],
       [0.  , 0.18, 0.57, 0.24, 0.01, 0.  , 0.  ],
       [0.  , 0.15, 0.1 , 0.45, 0.22, 0.07, 0.01],
       [0.01, 0.24, 0.66, 0.09, 0.  , 0.  , 0.  ]])

In [13]:
#mapping names for the plants for each predicted plant class
preds = clf.predict(test[features])
preds[0:5]

array([6, 6, 6, 6, 6])

In [14]:
test['quality'][0:5]

2     6
7     6
8     6
9     6
10    5
Name: quality, dtype: int64

In [15]:
#creating confusion matrix
pd.crosstab(test['quality'], preds, rownames = ['Actual Quality'], colnames = ['Predicted Quality'])

Predicted Quality,4,5,6,7,8
Actual Quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,0,5,3,0,0
4,5,28,9,0,0
5,1,298,110,1,0
6,0,93,447,33,0
7,0,2,109,112,0
8,0,0,17,14,11
9,0,0,1,1,0


In [16]:
errors = abs(preds - test['quality'])
errors

2       0
7       0
8       0
9       0
10      1
       ..
6470    0
6472    0
6487    0
6491    1
6495    1
Name: quality, Length: 1300, dtype: int64

In [17]:
print('Metrics for Random Forest Trained on Expanded Data')
print('Average absolute error:', round(np.mean(errors), 2), 'degrees.')

Metrics for Random Forest Trained on Expanded Data
Average absolute error: 0.36 degrees.


In [18]:
# Calculate mean absolute percentage error (MAPE)
mape = np.mean(100 * (errors / test['quality']))
mape

6.471672771672797

In [19]:
# Calculate and display accuracy
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 93.53 %.
