In [1]:
# initial imports
import pandas as pd
import numpy as np
from path import Path
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the compiled wine dataset from the Resources folder and preview it
file_path = Path("Resources/compiled_wine.csv")
wine_df = pd.read_csv(filepath_or_buffer=file_path)
wine_df.head(n=5)

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,Variety
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011,Red
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017,Red
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015,Red
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019,Red
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016,Red


### Encode Labels

In [3]:
# LabelEncoder instance (not used in this cell: Variety is encoded with the
# explicit dictionary below, which keeps the integer codes fixed and readable)
label_encoder = LabelEncoder()

# Fixed integer code for each wine variety
Variety_le = {
    "Red": 1,
    "White": 2,
    "Sparkling": 3,
    "Rose": 4,
}

# Look up every row's variety in the dictionary; a variety that is not listed
# above raises KeyError instead of being silently encoded
wine_df["Variety_le"] = wine_df["Variety"].map(lambda variety: Variety_le[variety])
wine_df.head()

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,Variety,Variety_le
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011,Red,1
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017,Red,1
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015,Red,1
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019,Red,1
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016,Red,1


In [4]:
# Number of rows per encoded variety (codes come from the Variety_le dictionary)
# NOTE(review): in the recorded run codes 1 (Red) and 2 (White) have exactly the
# same count, and the recorded confusion matrix is dominated by Red/White swaps.
# Presumably the compiled CSV contains the same rows under both labels - verify
# the upstream compilation step before trusting any model trained on this frame.
wine_type = wine_df["Variety_le"].value_counts()
wine_type

2    8666
1    8666
3    1007
4     397
Name: Variety_le, dtype: int64

In [5]:
# Encode the vintage as an integer feature.
# The lookup is built from the values actually present in the Year column
# instead of being enumerated by hand, so a vintage that was never listed
# (e.g. 1994 or 2020) no longer raises KeyError.
# Year values are presumably strings such as "2011" plus the "N.V." marker
# (presumably "non-vintage") - TODO confirm against the CSV.
NON_VINTAGE = "N.V."

# wine year dictionary: raw Year value -> integer year, with "N.V." encoded as 0.
# int() raises ValueError for any other non-numeric value, so bad data fails loudly.
Year_le = {
    year: 0 if year == NON_VINTAGE else int(year)
    for year in wine_df["Year"].unique()
}

# Wine years encoded using the dictionary values
wine_df["Year_le"] = wine_df["Year"].map(Year_le)
wine_df.head()

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,Variety,Variety_le,Year_le
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011,Red,1,2011
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017,Red,1,2017
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015,Red,1,2015
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019,Red,1,2019
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016,Red,1,2016


In [6]:
# Integer-encode the free-text categorical columns on a copy of the frame.
# Each column gets its OWN fitted encoder, stored in `encoders`, so the codes
# can later be mapped back with encoders[col].inverse_transform(...).
# Refitting a single shared encoder would keep only the last column's classes.
# NOTE(review): scikit-learn documents LabelEncoder for target labels and
# OrdinalEncoder for input features; consider switching if this is reworked.
categorical_cols = ["Name", "Country", "Region", "Winery"]

wine_df2 = wine_df.copy()
encoders = {}
for col in categorical_cols:
    # `le` is rebound on every pass; after the loop it is the Winery encoder
    le = LabelEncoder()
    wine_df2[col] = le.fit_transform(wine_df2[col])
    encoders[col] = le
wine_df2.head()

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,Variety,Variety_le,Year_le
0,5397,9,460,719,4.2,100,95.0,2011,Red,1,2011
1,4166,9,331,817,4.3,100,15.5,2017,Red,1,2017
2,2639,15,604,2441,3.9,100,7.45,2015,Red,1,2015
3,595,15,45,495,3.5,100,8.72,2019,Red,1,2019
4,5898,2,114,2003,3.9,100,29.15,2016,Red,1,2016


### Preprocessing Encoded Data

In [7]:
# Feature matrix: every column of wine_df2 except the raw and encoded target
# (Variety, Variety_le) and the raw Year (its integer form, Year_le, is kept)
X = wine_df2.drop(columns=["Variety", "Variety_le", "Year"])
X.head()

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year_le
0,5397,9,460,719,4.2,100,95.0,2011
1,4166,9,331,817,4.3,100,15.5,2017
2,2639,15,604,2441,3.9,100,7.45,2015
3,595,15,45,495,3.5,100,8.72,2019
4,5898,2,114,2003,3.9,100,29.15,2016


In [8]:
# Target vector: the integer-encoded variety as a NumPy array
y = wine_df2["Variety_le"].to_numpy()
y[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [9]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=78,
)

In [10]:
# Shapes of the four splits, printed in the order X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(14988, 8)
(3748, 8)
(14988,)
(3748,)


In [11]:
# Creating StandardScaler instance (nothing is fitted yet; the fit happens
# in the next cell)
scaler = StandardScaler()

In [12]:
# Fitting the StandardScaler on the training features only, so the test set
# does not influence the scaling statistics.
# fit() returns the scaler itself, so X_scaler and scaler are the same object.
X_scaler = scaler.fit(X_train)

In [13]:
# Apply the scaler fitted on the training set to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

### Fitting the Decision Tree Model

In [14]:
# Creating the decision tree classifier instance.
# random_state is pinned (same seed as the train/test split) because the tree
# builder permutes the features at every split; without a seed, ties between
# equally good splits can be broken differently and results change run to run.
model = tree.DecisionTreeClassifier(random_state=78)

In [15]:
# Fitting the model on the scaled training features and their variety codes.
# fit() returns the estimator itself; keeping the assignment also stops the
# cell from echoing the estimator repr as output.
model = model.fit(X_train_scaled, y_train)

### Making Predictions Using the Tree Model

In [18]:
# Making predictions using the scaled testing data; these are compared against
# y_test in the evaluation cells
predictions = model.predict(X_test_scaled)

### Model Evaluation

In [20]:
# Calculating the confusion matrix.
# The row/column labels are derived from the classes actually present in
# y_test and predictions (the same sorted set confusion_matrix uses by default)
# and passed explicitly. A hard-coded list of four labels would make the
# DataFrame constructor fail as soon as a class is absent from this split.
class_labels = np.unique(np.concatenate([y_test, predictions]))
cm = confusion_matrix(y_test, predictions, labels=class_labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [21]:
# Build the text report first, then show everything in a fixed order:
# confusion matrix table, accuracy, per-class precision/recall/F1
# NOTE(review): the recorded output is far below chance and almost all errors
# are classes 1 and 2 predicted as each other - presumably a labelling problem
# in the input data rather than in the model; verify before drawing conclusions.
report = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}", "Classification Report", report, sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4
Actual 1,268,1438,7,11
Actual 2,1670,70,7,11
Actual 3,14,6,159,3
Actual 4,39,10,5,30


Accuracy Score : 0.14060832443970117
Classification Report
              precision    recall  f1-score   support

           1       0.13      0.16      0.14      1724
           2       0.05      0.04      0.04      1758
           3       0.89      0.87      0.88       182
           4       0.55      0.36      0.43        84

    accuracy                           0.14      3748
   macro avg       0.40      0.36      0.38      3748
weighted avg       0.14      0.14      0.14      3748

