In [1]:
# initial imports
import pandas as pd
import numpy as np
from path import Path
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load data
file_path = Path("Resources/winemag-data_cleaned.csv")
wine_df = pd.read_csv(file_path)
wine_df.head()

Unnamed: 0,points,price,country,designation,province,region_1,variety,winery,tokens,filtered_tokens,1,2,3,4,5,6,7,8,9,10
0,96.0,235.0,12,20,6,24,2,15,60,36,2701.0,16004.0,23236.0,28401.0,33079.0,39068.0,45330.0,46582.0,63420.0,66669.0
1,96.0,110.0,11,20,26,26,9,15,51,31,11481.0,33140.0,41643.0,46758.0,55709.0,59455.0,69650.0,79653.0,104662.0,106432.0
2,96.0,90.0,12,20,6,26,19,15,47,30,10077.0,26468.0,32151.0,49402.0,61157.0,64289.0,76272.0,88398.0,99749.0,119775.0
3,96.0,65.0,12,22,27,46,12,15,62,43,1546.0,4238.0,13171.0,15324.0,16356.0,25381.0,33140.0,46758.0,56257.0,67144.0
4,95.0,66.0,4,20,31,26,9,15,66,37,1546.0,17161.0,18176.0,31536.0,33140.0,48479.0,49526.0,65848.0,88378.0,90392.0


In [3]:
# Data types
wine_df.dtypes

points             float64
price              float64
country              int64
designation          int64
province             int64
region_1             int64
variety              int64
winery               int64
tokens               int64
filtered_tokens      int64
1                  float64
2                  float64
3                  float64
4                  float64
5                  float64
6                  float64
7                  float64
8                  float64
9                  float64
10                 float64
dtype: object

In [4]:
# find null values
for column in wine_df.columns:
    print(f"Column {column} has {wine_df[column].isnull().sum()} null values")

Column points has 0 null values
Column price has 0 null values
Column country has 0 null values
Column designation has 0 null values
Column province has 0 null values
Column region_1 has 0 null values
Column variety has 0 null values
Column winery has 0 null values
Column tokens has 0 null values
Column filtered_tokens has 0 null values
Column 1 has 0 null values
Column 2 has 0 null values
Column 3 has 0 null values
Column 4 has 0 null values
Column 5 has 0 null values
Column 6 has 0 null values
Column 7 has 0 null values
Column 8 has 0 null values
Column 9 has 0 null values
Column 10 has 0 null values


In [5]:
#Transform features
wine_df["1"] = wine_df["1"] / 100
wine_df["2"] = wine_df["2"] / 100
wine_df["3"] = wine_df["3"] / 100
wine_df["4"] = wine_df["4"] / 100
wine_df["5"] = wine_df["5"] / 100
wine_df["6"] = wine_df["6"] / 100
wine_df["7"] = wine_df["7"] / 100
wine_df["8"] = wine_df["8"] / 100
wine_df["9"] = wine_df["9"] / 100
wine_df["10"] = wine_df["10"] / 100
wine_df.head()

Unnamed: 0,points,price,country,designation,province,region_1,variety,winery,tokens,filtered_tokens,1,2,3,4,5,6,7,8,9,10
0,96.0,235.0,12,20,6,24,2,15,60,36,27.01,160.04,232.36,284.01,330.79,390.68,453.3,465.82,634.2,666.69
1,96.0,110.0,11,20,26,26,9,15,51,31,114.81,331.4,416.43,467.58,557.09,594.55,696.5,796.53,1046.62,1064.32
2,96.0,90.0,12,20,6,26,19,15,47,30,100.77,264.68,321.51,494.02,611.57,642.89,762.72,883.98,997.49,1197.75
3,96.0,65.0,12,22,27,46,12,15,62,43,15.46,42.38,131.71,153.24,163.56,253.81,331.4,467.58,562.57,671.44
4,95.0,66.0,4,20,31,26,9,15,66,37,15.46,171.61,181.76,315.36,331.4,484.79,495.26,658.48,883.78,903.92


In [6]:
wine_df.insert(1, "points_ranked", '')
wine_df.insert(3, "price_ranked", '')
wine_df.head()

Unnamed: 0,points,points_ranked,price,price_ranked,country,designation,province,region_1,variety,winery,...,1,2,3,4,5,6,7,8,9,10
0,96.0,,235.0,,12,20,6,24,2,15,...,27.01,160.04,232.36,284.01,330.79,390.68,453.3,465.82,634.2,666.69
1,96.0,,110.0,,11,20,26,26,9,15,...,114.81,331.4,416.43,467.58,557.09,594.55,696.5,796.53,1046.62,1064.32
2,96.0,,90.0,,12,20,6,26,19,15,...,100.77,264.68,321.51,494.02,611.57,642.89,762.72,883.98,997.49,1197.75
3,96.0,,65.0,,12,22,27,46,12,15,...,15.46,42.38,131.71,153.24,163.56,253.81,331.4,467.58,562.57,671.44
4,95.0,,66.0,,4,20,31,26,9,15,...,15.46,171.61,181.76,315.36,331.4,484.79,495.26,658.48,883.78,903.92


In [7]:
# points statistics
wine_df['points'].describe()

count    137172.000000
mean         87.789192
std           3.220398
min          80.000000
25%          86.000000
50%          88.000000
75%          90.000000
max         100.000000
Name: points, dtype: float64

In [8]:
# price statistics
wine_df['price'].describe()

count    137172.000000
mean         33.133693
std          36.327952
min           4.000000
25%          16.000000
50%          24.000000
75%          40.000000
max        2300.000000
Name: price, dtype: float64

In [9]:
# excellent = 1 = 100-90
# good = 2 = 89.9 - 88
# average = 3 = 87.9 - 86
# ok = 4 = 85.9 - 80

wine_df.loc[(wine_df['points'] >= 90),'points_ranked'] = 'Excellent'
wine_df.loc[(wine_df['points'] <= 89.9) & (wine_df['points'] >= 88),'points_ranked'] = 'Good'
wine_df.loc[(wine_df['points'] <= 87.9) & (wine_df['points'] >= 86),'points_ranked'] = 'Average'
wine_df.loc[(wine_df['points'] < 85.9),'points_ranked'] = 'ok'
points_ranked_count = wine_df['points_ranked'].value_counts()
print(points_ranked_count)

Excellent    42439
ok           33779
Average      33370
Good         27584
Name: points_ranked, dtype: int64


In [10]:
# top_shelf = 1 = 2300 - 40
# high = 2 = 39.9 - 24
# med = 3 = 23.9 - 16
# low = 4 = 15.9 - 4

wine_df.loc[(wine_df['price'] >= 40),'price_ranked'] = 'Top Shelf'
wine_df.loc[(wine_df['price'] <= 39.9) & (wine_df['price'] >= 24),'price_ranked'] = 'High'
wine_df.loc[(wine_df['price'] <= 23.9) & (wine_df['price'] >= 16),'price_ranked'] = 'Med'
wine_df.loc[(wine_df['price'] < 15.9),'price_ranked'] = 'Low'
price_ranked_count = wine_df['price_ranked'].value_counts()
print(price_ranked_count)

High         36106
Top Shelf    35322
Low          33872
Med          31872
Name: price_ranked, dtype: int64


In [11]:
wine_df.tail(20)

Unnamed: 0,points,points_ranked,price,price_ranked,country,designation,province,region_1,variety,winery,...,1,2,3,4,5,6,7,8,9,10
137152,90.0,Excellent,65.0,Top Shelf,4,20,5,26,12,15,...,131.71,145.71,167.04,218.23,259.64,523.51,648.44,706.22,823.72,850.8
137153,89.0,Good,52.0,Top Shelf,4,20,5,26,12,15,...,4.77,24.37,258.82,285.11,336.28,341.21,362.25,424.25,582.33,648.45
137154,89.0,Good,38.0,High,4,17,5,26,12,15,...,94.92,140.79,334.37,406.28,452.94,604.17,720.83,734.89,767.64,777.51
137155,87.0,Average,37.0,High,4,20,5,26,12,15,...,230.71,258.82,515.55,784.95,997.6,998.92,1071.74,1092.08,1134.32,1180.98
137156,94.0,Excellent,30.0,High,4,17,32,26,15,15,...,57.65,110.76,156.82,209.0,285.11,355.13,363.31,413.59,467.58,661.74
137157,94.0,Excellent,25.0,High,12,20,6,3,9,15,...,38.9,125.31,151.32,326.69,362.96,443.36,467.58,512.47,589.47,617.15
137158,93.0,Excellent,30.0,High,12,20,6,26,25,15,...,13.54,122.52,192.12,243.46,283.28,319.03,333.99,349.96,431.57,467.58
137159,93.0,Excellent,65.0,Top Shelf,12,20,6,24,3,15,...,55.61,156.82,295.11,318.39,696.5,764.89,1196.42,1196.67,1261.94,1342.72
137160,92.0,Excellent,30.0,High,4,20,12,11,3,15,...,6.14,26.89,89.35,114.57,230.71,248.46,460.95,622.08,895.83,1092.08
137161,92.0,Excellent,38.0,High,4,20,12,11,3,15,...,18.92,82.97,167.49,190.94,397.04,549.39,557.09,679.12,782.69,865.77


In [12]:
# Saving cleaned data
file_path = "Resources/wine_df_cleaned.csv"
wine_df.to_csv(file_path, index=False)

In [13]:
# Define the features set.
X = wine_df.copy()
X = X.drop("points", axis=1)
X = X.drop("points_ranked", axis=1)
X = X.drop("price_ranked", axis=1)
X.head()

Unnamed: 0,price,country,designation,province,region_1,variety,winery,tokens,filtered_tokens,1,2,3,4,5,6,7,8,9,10
0,235.0,12,20,6,24,2,15,60,36,27.01,160.04,232.36,284.01,330.79,390.68,453.3,465.82,634.2,666.69
1,110.0,11,20,26,26,9,15,51,31,114.81,331.4,416.43,467.58,557.09,594.55,696.5,796.53,1046.62,1064.32
2,90.0,12,20,6,26,19,15,47,30,100.77,264.68,321.51,494.02,611.57,642.89,762.72,883.98,997.49,1197.75
3,65.0,12,22,27,46,12,15,62,43,15.46,42.38,131.71,153.24,163.56,253.81,331.4,467.58,562.57,671.44
4,66.0,4,20,31,26,9,15,66,37,15.46,171.61,181.76,315.36,331.4,484.79,495.26,658.48,883.78,903.92


In [14]:
# Define the target set.
y = wine_df["points_ranked"].values
y[:5]

array(['Excellent', 'Excellent', 'Excellent', 'Excellent', 'Excellent'],
      dtype=object)

In [15]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, train_size=0.80)

In [16]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(109737, 19)
(27435, 19)
(109737,)
(27435,)


In [17]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [18]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [19]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Fitting the Decision Tree Model

In [20]:
#y_train = y_train.astype('int')
y_train

array(['ok', 'Excellent', 'ok', ..., 'ok', 'Average', 'ok'], dtype=object)

In [21]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [22]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [23]:
print(model)

DecisionTreeClassifier()


### Making Predictions Using the Tree Model

In [24]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

### Model Evaluation

In [25]:
#y_test = y_test.astype('int')
y_test

array(['Excellent', 'Average', 'Average', ..., 'Excellent', 'Good',
       'Average'], dtype=object)

In [26]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Average", "Actual Excellent", "Actual Good", "Actual Ok"], columns=["Predicted Average", "Predicted Excellent", "Predicted Good", "Predicted Ok"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [27]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted Average,Predicted Excellent,Predicted Good,Predicted Ok
Actual Average,4661,527,631,821
Actual Excellent,608,6735,894,277
Actual Good,691,853,3512,412
Actual Ok,849,271,416,5277


Accuracy Score : 0.7357390195006379
Classification Report
              precision    recall  f1-score   support

     Average       0.68      0.70      0.69      6640
   Excellent       0.80      0.79      0.80      8514
        Good       0.64      0.64      0.64      5468
          ok       0.78      0.77      0.78      6813

    accuracy                           0.74     27435
   macro avg       0.73      0.73      0.73     27435
weighted avg       0.74      0.74      0.74     27435



In [30]:
# List the features sorted in order by feature importance

features = model.feature_importances_
headings = X.columns

print("Feature Importances: Wine trends by Points")
sorted(zip(features, headings), reverse=True)

Feature Importances: Wine trends by Points


[(0.16565167231004455, 'price'),
 (0.10369870055849721, 'filtered_tokens'),
 (0.06197953889922212, '1'),
 (0.057732702608368144, '2'),
 (0.05637344570812023, '3'),
 (0.05607380860600835, '10'),
 (0.052164615199576735, '4'),
 (0.051234110832160236, 'tokens'),
 (0.04897881608509409, '9'),
 (0.048763404819716985, '5'),
 (0.04869604259472367, '7'),
 (0.048243622528115294, '6'),
 (0.046772968556436596, '8'),
 (0.03745116093121024, 'variety'),
 (0.03526209864850603, 'country'),
 (0.03312600048601323, 'province'),
 (0.027276159416723473, 'region_1'),
 (0.017335931008442794, 'designation'),
 (0.0031852002030201255, 'winery')]