In [1]:
# initial imports
import pandas as pd
import numpy as np
from path import Path
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load data
# Read the wine CSV (path is relative to the notebook's working directory)
# into a DataFrame, then preview the first rows.
file_path = Path("Resources/winemag-data_cleaned.csv")
wine_df = pd.read_csv(file_path)
wine_df.head()

Unnamed: 0,points,price,country,designation,province,region_1,variety,winery,tokens,filtered_tokens,1,2,3,4,5,6,7,8,9,10
0,96.0,235.0,12,20,6,24,2,15,60,36,2701.0,16004.0,23236.0,28401.0,33079.0,39068.0,45330.0,46582.0,63420.0,66669.0
1,96.0,110.0,11,20,26,26,9,15,51,31,11481.0,33140.0,41643.0,46758.0,55709.0,59455.0,69650.0,79653.0,104662.0,106432.0
2,96.0,90.0,12,20,6,26,19,15,47,30,10077.0,26468.0,32151.0,49402.0,61157.0,64289.0,76272.0,88398.0,99749.0,119775.0
3,96.0,65.0,12,22,27,46,12,15,62,43,1546.0,4238.0,13171.0,15324.0,16356.0,25381.0,33140.0,46758.0,56257.0,67144.0
4,95.0,66.0,4,20,31,26,9,15,66,37,1546.0,17161.0,18176.0,31536.0,33140.0,48479.0,49526.0,65848.0,88378.0,90392.0


In [3]:
# Data types
# Show the dtype pandas inferred for every column of wine_df.
wine_df.dtypes

points             float64
price              float64
country              int64
designation          int64
province             int64
region_1             int64
variety              int64
winery               int64
tokens               int64
filtered_tokens      int64
1                  float64
2                  float64
3                  float64
4                  float64
5                  float64
6                  float64
7                  float64
8                  float64
9                  float64
10                 float64
dtype: object

In [4]:
# Report how many missing values each column contains.
# The per-column counts are computed once, then printed one line per column.
null_counts = wine_df.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

Column points has 0 null values
Column price has 0 null values
Column country has 0 null values
Column designation has 0 null values
Column province has 0 null values
Column region_1 has 0 null values
Column variety has 0 null values
Column winery has 0 null values
Column tokens has 0 null values
Column filtered_tokens has 0 null values
Column 1 has 0 null values
Column 2 has 0 null values
Column 3 has 0 null values
Column 4 has 0 null values
Column 5 has 0 null values
Column 6 has 0 null values
Column 7 has 0 null values
Column 8 has 0 null values
Column 9 has 0 null values
Column 10 has 0 null values


In [5]:
# Transform features
# Divide the ten columns named "1" .. "10" by 100 in one vectorized
# assignment instead of ten copy-pasted statements.
# NOTE: this overwrites wine_df in place, so re-running the cell divides the
# values again; run it exactly once per kernel session (Restart & Run All).
token_columns = [str(i) for i in range(1, 11)]
wine_df[token_columns] = wine_df[token_columns] / 100
wine_df.head()

Unnamed: 0,points,price,country,designation,province,region_1,variety,winery,tokens,filtered_tokens,1,2,3,4,5,6,7,8,9,10
0,96.0,235.0,12,20,6,24,2,15,60,36,27.01,160.04,232.36,284.01,330.79,390.68,453.3,465.82,634.2,666.69
1,96.0,110.0,11,20,26,26,9,15,51,31,114.81,331.4,416.43,467.58,557.09,594.55,696.5,796.53,1046.62,1064.32
2,96.0,90.0,12,20,6,26,19,15,47,30,100.77,264.68,321.51,494.02,611.57,642.89,762.72,883.98,997.49,1197.75
3,96.0,65.0,12,22,27,46,12,15,62,43,15.46,42.38,131.71,153.24,163.56,253.81,331.4,467.58,562.57,671.44
4,95.0,66.0,4,20,31,26,9,15,66,37,15.46,171.61,181.76,315.36,331.4,484.79,495.26,658.48,883.78,903.92


In [6]:
# Define the features set.
# Every column except the target ("variety"); drop() returns a new frame,
# so wine_df itself is left untouched.
X = wine_df.drop(columns=["variety"])
X.head()

Unnamed: 0,points,price,country,designation,province,region_1,winery,tokens,filtered_tokens,1,2,3,4,5,6,7,8,9,10
0,96.0,235.0,12,20,6,24,15,60,36,27.01,160.04,232.36,284.01,330.79,390.68,453.3,465.82,634.2,666.69
1,96.0,110.0,11,20,26,26,15,51,31,114.81,331.4,416.43,467.58,557.09,594.55,696.5,796.53,1046.62,1064.32
2,96.0,90.0,12,20,6,26,15,47,30,100.77,264.68,321.51,494.02,611.57,642.89,762.72,883.98,997.49,1197.75
3,96.0,65.0,12,22,27,46,15,62,43,15.46,42.38,131.71,153.24,163.56,253.81,331.4,467.58,562.57,671.44
4,95.0,66.0,4,20,31,26,15,66,37,15.46,171.61,181.76,315.36,331.4,484.79,495.26,658.48,883.78,903.92


In [7]:
# Define the target set.
# The "variety" column as a NumPy array; peek at the first five labels.
y = wine_df["variety"].to_numpy()
y[:5]

array([ 2,  9, 19, 12,  9], dtype=int64)

In [8]:
# Splitting into Train and Test sets
# 80% of the rows go to training; the fixed random_state keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=78,
)

In [9]:
# Sanity-check the split: print the shape of each train/test piece, in order.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(109737, 19)
(27435, 19)
(109737,)
(27435,)


In [10]:
# Creating StandardScaler instance (not fitted yet; fitting happens in the next step)
scaler = StandardScaler()

In [11]:
# Fitting the StandardScaler on the training features only, so no statistics
# from the test set are used when scaling.
X_scaler = scaler.fit(X_train)

In [12]:
# Scaling data
# Apply the scaler fitted on X_train to both the training and testing features.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Fitting the Decision Tree Model

In [13]:
# Creating the decision tree classifier instance
# Fix the seed so the fitted tree (and every metric below) is reproducible:
# scikit-learn randomly permutes the features at each split, so an unseeded
# tree can differ between runs. 78 matches the random_state passed to
# train_test_split.
model = tree.DecisionTreeClassifier(random_state=78)

In [14]:
# Fitting the model on the scaled training features and their variety labels
model = model.fit(X_train_scaled, y_train)

In [15]:
# Show the estimator's repr to confirm which hyper-parameters are in effect
print(model)

DecisionTreeClassifier()


### Making Predictions Using the Tree Model

In [16]:
# Making predictions using the testing data
# The scaled test features are used, matching the scaled features the model was fitted on.
predictions = model.predict(X_test_scaled)

### Model Evaluation

In [17]:
# Calculating the confusion matrix
# Pin the label order explicitly so the DataFrame axes can carry the real
# class ids. This is the same sorted order confusion_matrix uses by default,
# so the values in `cm` are unchanged.
class_labels = np.unique(np.concatenate((y_test, predictions)))
cm = confusion_matrix(y_test, predictions, labels=class_labels)
cm_df = pd.DataFrame(
    cm,
    index=pd.Index(class_labels, name="Actual"),
    columns=pd.Index(class_labels, name="Predicted"),
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [18]:
# Print the raw confusion-matrix array (plain text; the labelled cm_df is displayed below)
print(cm)

[[ 597    3   47    0   15    0    7    7    0   69    0    0   10    0
    46    4    1    8    2    6    2    1   15    1    1    3    7]
 [   6  151   13    0    8    0    5    4    0   15    0    0   10    0
    11    0    6    0    1    6    0    0    8    1    2    0    8]
 [  52   18 1874    0   88    0   30   84    0  147   11    2   56    0
    58    6    9    4    6   31   21    8   60    6   10    8   44]
 [   0    0    1  166   18    0    0    1    0    4    0    0    6    0
     1    0    0    0    0    0    0    2    0    0    0    2    0]
 [  13   12  100   11 1916    0    9   47    2  124   15   17  215    1
    29    5   33   10    8   62   15   16   54    3   10   13   39]
 [   0    2    0    0    0  217    0    1    0   21    0    0    0    0
    21    0    0    1    0    0    0    2    0    0    0    4    0]
 [   5    1   36    0   24    0  429    9    0   38    0    0    7    0
    15    0    1    4    0    6    0    0   10    1    0    1    0]
 [   8   11   79    

In [19]:
# Displaying results
# Confusion matrix as a rich table, then overall accuracy, then the
# per-class precision / recall / f1 report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
report = classification_report(y_test, predictions)
print(report)

Confusion Matrix


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,597,3,47,0,15,0,7,7,0,69,...,8,2,6,2,1,15,1,1,3,7
1,6,151,13,0,8,0,5,4,0,15,...,0,1,6,0,0,8,1,2,0,8
2,52,18,1874,0,88,0,30,84,0,147,...,4,6,31,21,8,60,6,10,8,44
3,0,0,1,166,18,0,0,1,0,4,...,0,0,0,0,2,0,0,0,2,0
4,13,12,100,11,1916,0,9,47,2,124,...,10,8,62,15,16,54,3,10,13,39
5,0,2,0,0,0,217,0,1,0,21,...,1,0,0,0,2,0,0,0,4,0
6,5,1,36,0,24,0,429,9,0,38,...,4,0,6,0,0,10,1,0,1,0
7,8,11,79,0,65,3,10,604,0,58,...,5,6,21,4,0,25,1,6,8,18
8,0,1,1,0,0,0,0,0,272,26,...,0,0,0,0,0,2,0,0,1,2
9,72,19,118,2,148,31,38,75,11,4108,...,44,65,80,22,32,77,96,18,75,41


Accuracy Score : 0.707417532349189
Classification Report
              precision    recall  f1-score   support

           0       0.70      0.70      0.70       852
           1       0.58      0.59      0.58       255
           2       0.71      0.71      0.71      2633
           3       0.88      0.83      0.85       201
           4       0.69      0.69      0.69      2779
           5       0.77      0.81      0.79       269
           6       0.75      0.73      0.74       587
           7       0.58      0.61      0.60       990
           8       0.95      0.88      0.91       309
           9       0.74      0.73      0.73      5657
          10       0.63      0.69      0.66       246
          11       0.59      0.58      0.58       265
          12       0.78      0.79      0.79      2769
          13       0.82      0.83      0.83       357
          14       0.69      0.68      0.68      1892
          15       0.64      0.64      0.64       292
          16       0.76 