In [1]:
# initial imports
import pandas as pd
import numpy as np
from path import Path
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load data
# Read the cleaned wine-review CSV into a DataFrame. The path is relative, so
# the notebook must be run from the directory that contains "Resources/".
file_path = Path("Resources/winemag-data_cleaned.csv")
wine_df = pd.read_csv(file_path)
# Preview the first rows to sanity-check the load.
wine_df.head()

Unnamed: 0,points,price,country,designation,province,region_1,variety,winery,tokens,filtered_tokens,1,2,3,4,5,6,7,8,9,10
0,96.0,235.0,12,20,6,24,2,15,60,36,2701.0,16004.0,23236.0,28401.0,33079.0,39068.0,45330.0,46582.0,63420.0,66669.0
1,96.0,110.0,11,20,26,26,9,15,51,31,11481.0,33140.0,41643.0,46758.0,55709.0,59455.0,69650.0,79653.0,104662.0,106432.0
2,96.0,90.0,12,20,6,26,19,15,47,30,10077.0,26468.0,32151.0,49402.0,61157.0,64289.0,76272.0,88398.0,99749.0,119775.0
3,96.0,65.0,12,22,27,46,12,15,62,43,1546.0,4238.0,13171.0,15324.0,16356.0,25381.0,33140.0,46758.0,56257.0,67144.0
4,95.0,66.0,4,20,31,26,9,15,66,37,1546.0,17161.0,18176.0,31536.0,33140.0,48479.0,49526.0,65848.0,88378.0,90392.0


In [3]:
# Data types
# Show the dtype of every column to confirm that all of them are already
# numeric before any modelling is attempted.
wine_df.dtypes

points             float64
price              float64
country              int64
designation          int64
province             int64
region_1             int64
variety              int64
winery               int64
tokens               int64
filtered_tokens      int64
1                  float64
2                  float64
3                  float64
4                  float64
5                  float64
6                  float64
7                  float64
8                  float64
9                  float64
10                 float64
dtype: object

In [4]:
# Report how many missing entries each column contains.
# The per-column counts are computed in a single pass over the frame and then
# printed one line per column.
null_counts = wine_df.isnull().sum()
for column, null_count in null_counts.items():
    print(f"Column {column} has {null_count} null values")

Column points has 0 null values
Column price has 0 null values
Column country has 0 null values
Column designation has 0 null values
Column province has 0 null values
Column region_1 has 0 null values
Column variety has 0 null values
Column winery has 0 null values
Column tokens has 0 null values
Column filtered_tokens has 0 null values
Column 1 has 0 null values
Column 2 has 0 null values
Column 3 has 0 null values
Column 4 has 0 null values
Column 5 has 0 null values
Column 6 has 0 null values
Column 7 has 0 null values
Column 8 has 0 null values
Column 9 has 0 null values
Column 10 has 0 null values


In [5]:
# Transform features
# Divide the ten columns named "1" through "10" by 100 in one vectorized
# assignment instead of ten copy-pasted statements.
# NOTE: this overwrites wine_df in place, so running the cell a second time
# without reloading the data divides the values by 100 again.
numbered_columns = [str(i) for i in range(1, 11)]
wine_df[numbered_columns] = wine_df[numbered_columns] / 100
wine_df.head()

Unnamed: 0,points,price,country,designation,province,region_1,variety,winery,tokens,filtered_tokens,1,2,3,4,5,6,7,8,9,10
0,96.0,235.0,12,20,6,24,2,15,60,36,27.01,160.04,232.36,284.01,330.79,390.68,453.3,465.82,634.2,666.69
1,96.0,110.0,11,20,26,26,9,15,51,31,114.81,331.4,416.43,467.58,557.09,594.55,696.5,796.53,1046.62,1064.32
2,96.0,90.0,12,20,6,26,19,15,47,30,100.77,264.68,321.51,494.02,611.57,642.89,762.72,883.98,997.49,1197.75
3,96.0,65.0,12,22,27,46,12,15,62,43,15.46,42.38,131.71,153.24,163.56,253.81,331.4,467.58,562.57,671.44
4,95.0,66.0,4,20,31,26,9,15,66,37,15.46,171.61,181.76,315.36,331.4,484.79,495.26,658.48,883.78,903.92


In [6]:
# Build the feature matrix: every column except the target, "country".
# drop() returns a new DataFrame, so wine_df itself is left untouched.
X = wine_df.drop(columns=["country"])
X.head()

Unnamed: 0,points,price,designation,province,region_1,variety,winery,tokens,filtered_tokens,1,2,3,4,5,6,7,8,9,10
0,96.0,235.0,20,6,24,2,15,60,36,27.01,160.04,232.36,284.01,330.79,390.68,453.3,465.82,634.2,666.69
1,96.0,110.0,20,26,26,9,15,51,31,114.81,331.4,416.43,467.58,557.09,594.55,696.5,796.53,1046.62,1064.32
2,96.0,90.0,20,6,26,19,15,47,30,100.77,264.68,321.51,494.02,611.57,642.89,762.72,883.98,997.49,1197.75
3,96.0,65.0,22,27,46,12,15,62,43,15.46,42.38,131.71,153.24,163.56,253.81,331.4,467.58,562.57,671.44
4,95.0,66.0,20,31,26,9,15,66,37,15.46,171.61,181.76,315.36,331.4,484.79,495.26,658.48,883.78,903.92


In [7]:
# Build the target vector: the integer-encoded "country" column as a NumPy array.
y = wine_df["country"].to_numpy()
# Peek at the first five labels.
y[:5]

array([12, 11, 12, 12,  4], dtype=int64)

In [8]:
# Partition features and target into training and testing subsets.
# 80% of the rows are used for training; the fixed random_state keeps the
# shuffle identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=78,
)

In [9]:
# Print the shape of each split, in the order:
# X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(109737, 19)
(27435, 19)
(109737,)
(27435,)


In [10]:
# Creating StandardScaler instance
# The scaler is only constructed here; it learns nothing until fit() is called.
scaler = StandardScaler()

In [11]:
# Fitting Standard Scaler
# Fit on the training split only, so the test split has no influence on the
# scaling statistics. fit() returns the scaler itself, so X_scaler and scaler
# refer to the same fitted object.
X_scaler = scaler.fit(X_train)

In [12]:
# Apply the scaler fitted on the training data to both splits, so the train
# and test features are transformed with the same statistics.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split_features) for split_features in (X_train, X_test)
)

### Fitting the Decision Tree Model

In [13]:
# Creating the decision tree classifier instance
# Seed the estimator so that the fitted tree, and every metric derived from
# it, is reproducible across kernel restarts. 78 is the same seed already
# used for train_test_split in this notebook.
model = tree.DecisionTreeClassifier(random_state=78)

In [14]:
# Fitting the model
# Train on the scaled training features and their country labels. The result
# of fit() is assigned back to `model`, so later cells use the fitted estimator.
model = model.fit(X_train_scaled, y_train)

In [15]:
# Show the estimator's repr to confirm which classifier was fitted.
print(model)

DecisionTreeClassifier()


### Making Predictions Using the Tree Model

In [16]:
# Making predictions using the testing data
# The test features must be the scaled version, because the model was
# trained on scaled inputs.
predictions = model.predict(X_test_scaled)

### Model Evaluation

In [17]:
# Calculating the confusion matrix
# Country names listed in the order of the integer-encoded `country` labels
# (0-12). NOTE(review): the name/code pairing is carried over from the
# original hard-coded labels -- confirm it against the encoder used upstream.
country_names = [
    "Argentina", "Australia", "Austria", "Chile", "France", "Germany",
    "Italy", "New Zealand", "Other", "Portugal", "South Africa", "Spain", "US",
]

# Pass the label set explicitly so the matrix always has one row and column
# per name above, even if a class happens to be absent from this test split.
cm = confusion_matrix(
    y_test, predictions, labels=list(range(len(country_names)))
)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {name}" for name in country_names],
    columns=[f"Predicted {name}" for name in country_names],
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [18]:
# Print the raw confusion-matrix array (rows: actual class, columns: predicted class).
print(cm)

[[ 1071     7     0     0     3     0     4     0     1     0     0    14
      5]
 [    3   941     0     0     7     0     3     0     6     0     0    11
      3]
 [    0     0   448     9     0    19     0    12    32    26     8     0
      2]
 [    0     0     9  1088     0     1     0    10    37     8    11     0
      3]
 [    3     6     0     0  2838     0     4     0     4     0     0     4
      1]
 [    0     0    20     5     0   424     0     7     8     2     2     0
      0]
 [    2    12     0     0     2     0  3784     0     3     0     0     6
      4]
 [    0     0     5     9     0    10     0   559     7     1    10     0
      2]
 [    0     9    36    31     1    11     1     9   418    19    37     2
     11]
 [    0     0    28     6     0     1     0     2    17   758    13     0
      1]
 [    0     0     5    17     0     3     0     5    32     2   387     0
      3]
 [   12     3     0     0     3     0     5     0     2     0     0  1563
      9]
 [  

In [19]:
# Summarise model quality: labelled confusion matrix, overall accuracy and
# the per-class precision/recall/F1 report.
report_text = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted Argentina,Predicted Austrailia,Predicted Austria,Predicted Chile,Predicted France,Predicted Germany,Predicted Italy,Predicted New Zealand,Predicted Other,Predicted Portugal,Predicted South Africa,Predicted Spain,Predicted US
Actual Argentina,1071,7,0,0,3,0,4,0,1,0,0,14,5
Actual Austrailia,3,941,0,0,7,0,3,0,6,0,0,11,3
Actual Austria,0,0,448,9,0,19,0,12,32,26,8,0,2
Actual Chile,0,0,9,1088,0,1,0,10,37,8,11,0,3
Actual France,3,6,0,0,2838,0,4,0,4,0,0,4,1
Actual Germany,0,0,20,5,0,424,0,7,8,2,2,0,0
Actual Italy,2,12,0,0,2,0,3784,0,3,0,0,6,4
Actual New Zealand,0,0,5,9,0,10,0,559,7,1,10,0,2
Actual Other,0,9,36,31,1,11,1,9,418,19,37,2,11
Actual Portugal,0,0,28,6,0,1,0,2,17,758,13,0,1


Accuracy Score : 0.9719701111718607
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1105
           1       0.96      0.97      0.96       974
           2       0.81      0.81      0.81       556
           3       0.93      0.93      0.93      1167
           4       0.99      0.99      0.99      2860
           5       0.90      0.91      0.90       468
           6       0.99      0.99      0.99      3813
           7       0.92      0.93      0.93       603
           8       0.72      0.71      0.72       585
           9       0.93      0.92      0.92       826
          10       0.83      0.85      0.84       454
          11       0.98      0.98      0.98      1597
          12       1.00      1.00      1.00     12427

    accuracy                           0.97     27435
   macro avg       0.92      0.92      0.92     27435
weighted avg       0.97      0.97      0.97     27435



In [22]:
# Rank the input columns by how much the fitted tree relied on each one.
# NOTE(review): geographic columns such as `province` and `region_1` are part
# of X; if they largely determine the country, the ranking (and the accuracy)
# may mostly reflect that relationship -- worth confirming this is intended.

features = model.feature_importances_
headings = X.columns

print("Feature Importances: Wine trends by Country")
# Pair each importance with its column name, then order from most to least
# important.
ranked_features = list(zip(features, headings))
ranked_features.sort(reverse=True)
ranked_features

Feature Importances: Wine trends by Country


[(0.7380825554222149, 'province'),
 (0.11372218327508468, 'region_1'),
 (0.041920561043604115, 'variety'),
 (0.015603951301377436, 'price'),
 (0.010562683531152006, 'filtered_tokens'),
 (0.010360695018062879, 'points'),
 (0.007150749227220838, '1'),
 (0.006792989227137234, 'tokens'),
 (0.006765824876017454, '10'),
 (0.005901312658340584, '2'),
 (0.005809710648253039, 'designation'),
 (0.005695970983826007, '5'),
 (0.005658377269939286, '9'),
 (0.0053771183016563286, '3'),
 (0.005199102695502414, '4'),
 (0.004954946990048882, '7'),
 (0.004948862542148139, '6'),
 (0.004938930832081142, '8'),
 (0.0005534741563325492, 'winery')]