In [45]:
import pandas as pd
from pandas import DataFrame

#make the necessary imports and load the data and answer the following Questions
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier as DTree

from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, accuracy_score,classification_report
from sklearn.naive_bayes import MultinomialNB

## Classification using Decision Trees and Multinomial Naive Bayes
This question requires you to classify wine based on the quality ('quality' is your target variable) using the features available to you in winequality-white.csv 

In [46]:
#Load 'winequality-white.csv'; use separator as ';'

wine = pd.read_csv('winequality-white.csv', delimiter=';')

print(wine.head(3))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.0              0.27         0.36            20.7      0.045   
1            6.3              0.30         0.34             1.6      0.049   
2            8.1              0.28         0.40             6.9      0.050   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 45.0                 170.0   1.0010  3.00       0.45   
1                 14.0                 132.0   0.9940  3.30       0.49   
2                 30.0                  97.0   0.9951  3.26       0.44   

   alcohol  quality  
0      8.8        6  
1      9.5        6  
2     10.1        6  


### Data Exploration & Wrangling (5 questions * 4 points)
1.	What is the structure of your dataset? (shape)
2.	What is/are the main feature(s) in your dataset? (column names)
3.	List the features as Categorical or Continuous. (head/tail)
4.	Describe the statistical features (viz. mean, median, standard deviation) of these features?
6.	Are there missing values in your dataset?

In [47]:
#1 shape
wine.shape

(4898, 12)

In [48]:
#2 features
wine.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [49]:
#3 categorical vs continuous
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.2 KB


In [50]:
#4 statistical information for the continuous variables
wine.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [51]:
#5 missing values
wine.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

### Feature Selection and Conditioning
1. Instead of considering all the features at once, take the following 3 subsets of these features:

A = ['fixed acidity', 'free sulfur dioxide', 'citric acid', 'residual sugar', 'alcohol']

B = ['chlorides', 'sulphates', 'total sulfur dioxide', 'density']

C = ['citric acid', 'volatile acidity', 'alcohol','density','chlorides']

For each feature set, train a classifier and report the Scores


### For Treatment A

- get the feature set for Treatment A

In [52]:
target = wine[['quality']]

feat_a = wine[['fixed acidity', 'free sulfur dioxide', 'citric acid', 'residual sugar', 'alcohol']]
print('Shape of the feature set A:', feat_a.shape)

Shape of the feature set A: (4898, 5)


### Data Preparation (2 questions * 2.5 points)

1.	Report the dimensions and type after separating the dataset to predictors(X) and target(y)

2.	Perform train-test split. Set test_size to 0.1 and random_state to 42 and report the dimensions of X_test and y_test



In [53]:
#1
X = feat_a
y = target

print('Shape of X:', X.shape)
print('Shape of y:', y.shape)

Shape of X: (4898, 5)
Shape of y: (4898, 1)


In [54]:
#2 train/test data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print('Shape of X_test:', X_test.shape)
print('Shape of y_test:', y_test.shape)

Shape of X_test: (490, 5)
Shape of y_test: (490, 1)


### Modeling (2 questions * 2.5 points)
1. Devise 2 models for our problem. 

a.	Decision Tree Classifier

b.	Multinomial NB Classifier **(Hint: This is designed to handle multiclass target values; has the same function signature as Bernoulli NB; use ‘from sklearn.naive_bayes import MultinomialNB’)**

(Both of these classifiers are inherently multiclass i.e. can handle >=2 and you don't need any extra parameters)

2. Give the f1_score, precision_score, recall_score for both the classifiers. Also print the classification report. Which of the 2 classifiers performs better?

In [55]:
#1 Decision Tree classifier
model_dt = DTree(criterion='entropy')
model_dt.fit(X_train, y_train)

y_pred_dt = model_dt.predict(X_test)
print('Shape of y_pred:', y_pred_dt.shape)

Shape of y_pred: (490,)


In [56]:
#2 DT Scores

print("Precision: %0.2f" %precision_score(y_test, y_pred_dt , average="macro"))
print("Recall:  %0.2f" %recall_score(y_test, y_pred_dt , average="macro"))
print("F1-score:  %0.2f" %f1_score(y_test, y_pred_dt , average="macro"))
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

Precision: 0.43
Recall:  0.43
F1-score:  0.43
[[  0   0   2   1   0   0]
 [  0   6   8   4   0   0]
 [  0   7  86  40   9   2]
 [  0   3  41 140  28   3]
 [  0   3   9  23  56   3]
 [  0   0   0   4   6   6]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.32      0.33      0.32        18
           5       0.59      0.60      0.59       144
           6       0.66      0.65      0.66       215
           7       0.57      0.60      0.58        94
           8       0.43      0.38      0.40        16

    accuracy                           0.60       490
   macro avg       0.43      0.43      0.43       490
weighted avg       0.60      0.60      0.60       490



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [57]:
#1 Multinomial NB classifier
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)

y_pred_nb = model_nb.predict(X_test)
print('Shape of y_pred:', y_pred_nb.shape)

Shape of y_pred: (490,)


  y = column_or_1d(y, warn=True)


In [58]:
#2 Multinomial NB Scores
print("Precision: %0.2f" %precision_score(y_test, y_pred_nb , average="macro"))
print("Recall:  %0.2f" %recall_score(y_test, y_pred_nb , average="macro"))
print("F1-score:  %0.2f" %f1_score(y_test, y_pred_nb , average="macro"))
print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

Precision: 0.23
Recall:  0.23
F1-score:  0.22
[[  0   0   2   1   0   0]
 [  0   6   3   8   1   0]
 [  5  13  41  82   3   0]
 [  2   8  44 142  19   0]
 [  2   4  11  65  12   0]
 [  0   0   4  12   0   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.19      0.33      0.24        18
           5       0.39      0.28      0.33       144
           6       0.46      0.66      0.54       215
           7       0.34      0.13      0.19        94
           8       0.00      0.00      0.00        16

    accuracy                           0.41       490
   macro avg       0.23      0.23      0.22       490
weighted avg       0.39      0.41      0.38       490



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


#### Comparison of the 2 models

- The Precision, Recall & F1 scores for DecisionTree model is 0.41, whereas the same for Multinomial NB model is around 0.23.
- Based on this analysis, the DecisionTree model performs better for this dataset.

### For Treatment B

### Data Preparation (2 questions * 2.5 points)

1.	Report the dimensions and type after separating the dataset to predictors(X) and target(y)

2.	Perform train-test split. Set test_size to 0.1 and random_state to 42 and report the dimensions of X_test and y_test



In [59]:
feat_b = wine[['chlorides', 'sulphates', 'total sulfur dioxide', 'density']]
print('Shape of the feature set B:', feat_b.shape)

#1
X = feat_b
y = target

print('Shape of X:', X.shape)
print('Shape of y:', y.shape)

Shape of the feature set B: (4898, 4)
Shape of X: (4898, 4)
Shape of y: (4898, 1)


In [60]:
#2 train/test data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print('Shape of X_test:', X_test.shape)
print('Shape of y_test:', y_test.shape)

Shape of X_test: (490, 4)
Shape of y_test: (490, 1)


### Modeling (2 questions * 2.5 points)
1. Devise 2 models for our problem. 

a.	Decision Tree Classifier

b.	Multinomial NB Classifier **(Hint: This is designed to handle multiclass target values; has the same function signature as Bernoulli NB; use ‘from sklearn.naive_bayes import MultinomialNB’)**

(Both of these classifiers are inherently multiclass i.e. can handle >=2 and you don't need any extra parameters)

2. Give the f1_score, precision_score, recall_score for both the classifiers. Also print the classification report. Which of the 2 classifiers performs better?

In [61]:
#1 Decision Tree classifier
model_dt = DTree(criterion='entropy')
model_dt.fit(X_train, y_train)

y_pred_dt = model_dt.predict(X_test)
print('Shape of y_pred:', y_pred_dt.shape)

Shape of y_pred: (490,)


In [62]:
#2 DT Scores
print("Precision: %0.2f" %precision_score(y_test, y_pred_dt , average="macro"))
print("Recall:  %0.2f" %recall_score(y_test, y_pred_dt , average="macro"))
print("F1-score:  %0.2f" %f1_score(y_test, y_pred_dt , average="macro"))
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

Precision: 0.40
Recall:  0.43
F1-score:  0.41
[[  0   0   2   1   0   0]
 [  0   3   5   8   1   1]
 [  1   4  93  33   9   4]
 [  0   4  32 143  27   9]
 [  0   2   9  29  50   4]
 [  0   0   0   2   5   9]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.23      0.17      0.19        18
           5       0.66      0.65      0.65       144
           6       0.66      0.67      0.66       215
           7       0.54      0.53      0.54        94
           8       0.33      0.56      0.42        16

    accuracy                           0.61       490
   macro avg       0.40      0.43      0.41       490
weighted avg       0.61      0.61      0.61       490



In [63]:
#1 Multinomial NB classifier
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)

y_pred_nb = model_nb.predict(X_test)
print('Shape of y_pred:', y_pred_nb.shape)

Shape of y_pred: (490,)


  y = column_or_1d(y, warn=True)


In [64]:
#2 Multinomial NB Scores
print("Precision: %0.2f" %precision_score(y_test, y_pred_nb , average="macro"))
print("Recall:  %0.2f" %recall_score(y_test, y_pred_nb , average="macro"))
print("F1-score:  %0.2f" %f1_score(y_test, y_pred_nb , average="macro"))
print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

Precision: 0.07
Recall:  0.17
F1-score:  0.10
[[  0   0   0   3   0   0]
 [  0   0   0  18   0   0]
 [  0   0   0 144   0   0]
 [  0   0   0 215   0   0]
 [  0   0   0  94   0   0]
 [  0   0   0  16   0   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        18
           5       0.00      0.00      0.00       144
           6       0.44      1.00      0.61       215
           7       0.00      0.00      0.00        94
           8       0.00      0.00      0.00        16

    accuracy                           0.44       490
   macro avg       0.07      0.17      0.10       490
weighted avg       0.19      0.44      0.27       490



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


#### Comparison of the 2 models

- The Precision, Recall & F1 scores for DecisionTree model is around 0.43, whereas the same for Multinomial NB model is between 0.7 - 0.17.
- Based on this analysis, the DecisionTree model performs better for this dataset.

### For Treatment C

### Data Preparation (2 questions * 2.5 points)

1.	Report the dimensions and type after separating the dataset to predictors(X) and target(y)

2.	Perform train-test split. Set test_size to 0.1 and random_state to 42 and report the dimensions of X_test and y_test

In [65]:
feat_c = wine[['citric acid', 'volatile acidity', 'alcohol','density','chlorides']]
print('Shape of the feature set C:', feat_c.shape)

#1
X = feat_c
y = target

print('Shape of X:', X.shape)
print('Shape of y:', y.shape)

Shape of the feature set C: (4898, 5)
Shape of X: (4898, 5)
Shape of y: (4898, 1)


In [66]:
#2 train/test data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print('Shape of X_test:', X_test.shape)
print('Shape of y_test:', y_test.shape)

Shape of X_test: (490, 5)
Shape of y_test: (490, 1)


### Modeling (2 questions * 2.5 points)

1. Devise 2 models for our problem. 

a.	Decision Tree Classifier

b.	Multinomial NB Classifier **(Hint: This is designed to handle multiclass target values; has the same function signature as Bernoulli NB; use ‘from sklearn.naive_bayes import MultinomialNB’)**

(Both of these classifiers are inherently multiclass i.e. can handle >=2 and you don't need any extra parameters)

2. Give the f1_score, precision_score, recall_score for both the classifiers. Also print the classification report. Which of the 2 classifiers performs better?


In [67]:
#1 Decision Tree classifier
model_dt = DTree(criterion='entropy')
model_dt.fit(X_train, y_train)

y_pred_dt = model_dt.predict(X_test)
print('Shape of y_pred:', y_pred_dt.shape)

Shape of y_pred: (490,)


In [68]:
#2 DT Scores
print("Precision: %0.2f" %precision_score(y_test, y_pred_dt , average="macro"))
print("Recall:  %0.2f" %recall_score(y_test, y_pred_dt , average="macro"))
print("F1-score:  %0.2f" %f1_score(y_test, y_pred_dt , average="macro"))
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

Precision: 0.38
Recall:  0.40
F1-score:  0.39
[[  0   0   2   1   0   0   0]
 [  0   7   7   4   0   0   0]
 [  0   2  90  36  12   4   0]
 [  3   9  36 138  25   3   1]
 [  0   2   8  16  62   6   0]
 [  0   0   1   3   3   8   1]
 [  0   0   0   0   0   0   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.35      0.39      0.37        18
           5       0.62      0.62      0.62       144
           6       0.70      0.64      0.67       215
           7       0.61      0.66      0.63        94
           8       0.38      0.50      0.43        16
           9       0.00      0.00      0.00         0

    accuracy                           0.62       490
   macro avg       0.38      0.40      0.39       490
weighted avg       0.63      0.62      0.63       490



  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [69]:
#1 Multinomial NB classifier
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)

y_pred_nb = model_nb.predict(X_test)
print('Shape of y_pred:', y_pred_nb.shape)

Shape of y_pred: (490,)


  y = column_or_1d(y, warn=True)


In [70]:
#2 Multinomial NB Scores
print("Precision: %0.2f" %precision_score(y_test, y_pred_nb , average="macro"))
print("Recall:  %0.2f" %recall_score(y_test, y_pred_nb , average="macro"))
print("F1-score:  %0.2f" %f1_score(y_test, y_pred_nb , average="macro"))
print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

Precision: 0.07
Recall:  0.17
F1-score:  0.10
[[  0   0   0   3   0   0]
 [  0   0   0  18   0   0]
 [  0   0   0 144   0   0]
 [  0   0   0 215   0   0]
 [  0   0   0  94   0   0]
 [  0   0   0  16   0   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        18
           5       0.00      0.00      0.00       144
           6       0.44      1.00      0.61       215
           7       0.00      0.00      0.00        94
           8       0.00      0.00      0.00        16

    accuracy                           0.44       490
   macro avg       0.07      0.17      0.10       490
weighted avg       0.19      0.44      0.27       490



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


#### Comparison of the 2 models

- The Precision, Recall & F1 scores for DecisionTree model is around 0.37, whereas the same for Multinomial NB model is between 0.7 - 0.17.
- Based on this analysis, the DecisionTree model performs better for this dataset.

### Which Feature set (A,B,C) is good?  (10 points)
Hint: Take Precision, Recall and F1-Score into account.

- Feature set A performs better than B & C since the Precision, Recall & F1 scores are higher for feature set A compared to the other 2 feature sets.
- For feature set A, Decision Tree model performs better compared to Multinomial NB.
