## Bethany's Code for Decision Trees (Random Forests)

In [25]:
# import lines
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split

In [2]:
# load 'winequality-red.csv' (semicolon-separated) into the `data` DataFrame
data = pd.read_csv(
    'winequality-red.csv',
    sep=';',
)

In [3]:
# preview the first five rows of the dataframe
data.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# summary statistics of the 'quality' column
data.loc[:, 'quality'].describe()

count    1599.000000
mean        5.636023
std         0.807569
min         3.000000
25%         5.000000
50%         6.000000
75%         6.000000
max         8.000000
Name: quality, dtype: float64

In [5]:
# features (X): every column except the 'quality' target
X = data.drop('quality', axis=1)
print(X)

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.4             0.700         0.00             1.9      0.076   
1               7.8             0.880         0.00             2.6      0.098   
2               7.8             0.760         0.04             2.3      0.092   
3              11.2             0.280         0.56             1.9      0.075   
4               7.4             0.700         0.00             1.9      0.076   
...             ...               ...          ...             ...        ...   
1594            6.2             0.600         0.08             2.0      0.090   
1595            5.9             0.550         0.10             2.2      0.062   
1596            6.3             0.510         0.13             2.3      0.076   
1597            5.9             0.645         0.12             2.0      0.075   
1598            6.0             0.310         0.47             3.6      0.067   

      free sulfur dioxide  

In [6]:
# target (y): the 'quality' column
y = data.loc[:, 'quality']
print(y)

0       5
1       5
2       5
3       6
4       5
       ..
1594    5
1595    6
1596    6
1597    5
1598    6
Name: quality, Length: 1599, dtype: int64


In [7]:
# hold out 30% of the rows for testing (a 70/30 split is a common rule of thumb);
# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# confirm the dimensions of each train/test piece, in the same order as the split
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(1119, 11)
(480, 11)
(1119,)
(480,)


In [9]:
# FIRST MODEL
# Random Forest classifier with only two trees (n_estimators=2)
forest = RandomForestClassifier(n_estimators=2, random_state=42)
# fit() returns the estimator itself; reassigning keeps the cell output empty
forest = forest.fit(X_train, y_train)

In [10]:
# score the first model on the data it was fit on and on the held-out data
train_accuracy = forest.score(X_train, y_train)
test_accuracy = forest.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
print("Accuracy on test set: {:.3f}".format(test_accuracy))

# the gap between training and test accuracy is already a sign of overfitting in the first model

Accuracy on training set: 0.855
Accuracy on test set: 0.546


In [11]:
# SECOND MODEL
# Random Forest classifier grown to 50 trees (n_estimators=50)
forest = RandomForestClassifier(n_estimators=50, random_state=42)
# fit() returns the estimator itself; reassigning keeps the cell output empty
forest = forest.fit(X_train, y_train)

In [12]:
# accuracy of the second model on the training and test sets
train_accuracy = forest.score(X_train, y_train)
test_accuracy = forest.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 1.000
Accuracy on test set: 0.652


In [13]:
# THIRD MODEL
# Random Forest classifier with n_estimators=12, a value between the
# 2-tree and 50-tree models so it can be compared against both
forest = RandomForestClassifier(n_estimators=12, random_state=42)
# fit() returns the estimator itself; reassigning keeps the cell output empty
forest = forest.fit(X_train, y_train)

In [14]:
# accuracy of the third model on the training and test sets
train_accuracy = forest.score(X_train, y_train)
test_accuracy = forest.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.991
Accuracy on test set: 0.633


In [15]:
# FOURTH MODEL
# Random Forest classifier with n_estimators=12 and max_features=11
# trying a high max_features value (X has 11 feature columns)
forest = RandomForestClassifier(
    n_estimators=12,
    max_features=11,
    random_state=42,
)
# fit() returns the estimator itself; reassigning keeps the cell output empty
forest = forest.fit(X_train, y_train)

In [16]:
# accuracy of the fourth model on the training and test sets
train_accuracy = forest.score(X_train, y_train)
test_accuracy = forest.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.991
Accuracy on test set: 0.662


In [17]:
# Predicted-probability matrix for the best performing model so far - FOURTH MODEL
# one row per test sample; the second dimension shows that there are 6 classes
proba_shape = forest.predict_proba(X_test).shape
print("Shape of probabilities for wine dataset: {}".format(proba_shape))

Shape of probabilities for wine dataset: (480, 6)


In [18]:
# per-class probabilities for the first six rows of the test set
first_six_proba = forest.predict_proba(X_test.iloc[:6])
print("Predicted probabilities: \n{}".format(first_six_proba))

Predicted probabilities: 
[[0.         0.         0.66666667 0.25       0.08333333 0.        ]
 [0.         0.16666667 0.5        0.33333333 0.         0.        ]
 [0.08333333 0.         0.33333333 0.58333333 0.         0.        ]
 [0.         0.16666667 0.58333333 0.25       0.         0.        ]
 [0.         0.         0.         1.         0.         0.        ]
 [0.         0.         0.75       0.25       0.         0.        ]]


In [19]:
# FIFTH MODEL
# Random Forest classifier with n_estimators=12 and max_features=1
# trying a low max_features value
forest = RandomForestClassifier(
    n_estimators=12,
    max_features=1,
    random_state=42,
)
# fit() returns the estimator itself; reassigning keeps the cell output empty
forest = forest.fit(X_train, y_train)

In [20]:
# accuracy of the fifth model on the training and test sets
train_accuracy = forest.score(X_train, y_train)
test_accuracy = forest.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.989
Accuracy on test set: 0.640


In [21]:
# SIXTH MODEL
# Random Forest classifier with n_estimators=12, max_features=1, max_depth=10
# capping tree depth at 10 to try to reduce overfitting
forest = RandomForestClassifier(
    n_estimators=12,
    max_features=1,
    max_depth=10,
    random_state=42,
)
# fit() returns the estimator itself; reassigning keeps the cell output empty
forest = forest.fit(X_train, y_train)

In [22]:
# accuracy of the sixth model on the training and test sets
train_accuracy = forest.score(X_train, y_train)
test_accuracy = forest.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.895
Accuracy on test set: 0.635


In [23]:
# SEVENTH MODEL
# instantiation and fitting Random Forest using RandomForestClassifier
# Parameters: n_estimators=12, max_features=1, max_depth=5
# trying to reduce overfitting further with a shallower max_depth=5
forest = RandomForestClassifier(n_estimators=12, max_features=1, max_depth=5, random_state=42).fit(X_train, y_train)

In [24]:
# accuracy of the seventh model on the training and test sets
train_accuracy = forest.score(X_train, y_train)
test_accuracy = forest.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
print("Accuracy on test set: {:.3f}".format(test_accuracy))

Accuracy on training set: 0.686
Accuracy on test set: 0.571
