<a href="https://colab.research.google.com/github/aadi-kanwar/MLOps/blob/main/Wine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [15]:
d = pd.read_csv('/content/wines.csv')
d.head()

Unnamed: 0,winery,wine,year,rating,num_reviews,country,region,price,type,body,acidity
0,Teso La Monja,Tinto,2013,4.9,58,Espana,Toro,995.0,Toro Red,5.0,3.0
1,Artadi,Vina El Pison,2018,4.9,31,Espana,Vino de Espana,313.5,Tempranillo,4.0,2.0
2,Vega Sicilia,Unico,2009,4.8,1793,Espana,Ribera del Duero,324.95,Ribera Del Duero Red,5.0,3.0
3,Vega Sicilia,Unico,1999,4.8,1705,Espana,Ribera del Duero,692.96,Ribera Del Duero Red,5.0,3.0
4,Vega Sicilia,Unico,1996,4.8,1309,Espana,Ribera del Duero,778.06,Ribera Del Duero Red,5.0,3.0


In [16]:
d.shape

(7500, 11)

In [17]:
d.columns

Index(['winery', 'wine', 'year', 'rating', 'num_reviews', 'country', 'region',
       'price', 'type', 'body', 'acidity'],
      dtype='object')

In [18]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   winery       7500 non-null   object 
 1   wine         7500 non-null   object 
 2   year         7498 non-null   object 
 3   rating       7500 non-null   float64
 4   num_reviews  7500 non-null   int64  
 5   country      7500 non-null   object 
 6   region       7500 non-null   object 
 7   price        7500 non-null   float64
 8   type         6955 non-null   object 
 9   body         6331 non-null   float64
 10  acidity      6331 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 644.7+ KB


In [19]:
d.isnull().sum()

Unnamed: 0,0
winery,0
wine,0
year,2
rating,0
num_reviews,0
country,0
region,0
price,0
type,545
body,1169


In [20]:
d.duplicated().sum()

5452

In [21]:
d.describe()

Unnamed: 0,rating,num_reviews,price,body,acidity
count,7500.0,7500.0,7500.0,6331.0,6331.0
mean,4.254933,451.109067,60.095822,4.158427,2.946612
std,0.118029,723.001856,150.356676,0.583352,0.248202
min,4.2,25.0,4.99,2.0,1.0
25%,4.2,389.0,18.9,4.0,3.0
50%,4.2,404.0,28.53,4.0,3.0
75%,4.2,415.0,51.35,5.0,3.0
max,4.9,32624.0,3119.08,5.0,3.0


In [22]:
num_cols = ['rating', 'num_reviews', 'price', 'body', 'acidity']
cat_cols = ['country', 'winery', 'wine', 'region', 'type', 'year']

In [23]:
# Handle missing numerical Values
imputer_num = SimpleImputer(strategy='mean')
d[num_cols] = imputer_num.fit_transform(d[num_cols])

# Handle missing categorical values
imputer_cat = SimpleImputer(strategy='most_frequent')
d[cat_cols] = imputer_cat.fit_transform(d[cat_cols])

In [24]:
d.isnull().sum()

Unnamed: 0,0
winery,0
wine,0
year,0
rating,0
num_reviews,0
country,0
region,0
price,0
type,0
body,0


In [25]:
# Encode Categorical values
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    d[col] = le.fit_transform(d[col])
    label_encoders[col] = le

In [26]:
X = d.drop(columns = ['type'])
y = d['type']

In [27]:
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

In [28]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
classifier = DecisionTreeClassifier(random_state=42)

In [30]:
classifier.fit(X_train, y_train)

In [31]:
pred = classifier.predict(X_test)

In [32]:
# GridSearchCV
parameter = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10]
}

In [33]:
grid_search = GridSearchCV(estimator=classifier, param_grid=parameter, cv=10)
grid_search.fit(X_train, y_train)



In [34]:
# Best Model
best_model = grid_search.best_estimator_

In [35]:
y_pred = best_model.predict(X_test)

In [36]:
# Final Evaluation of the Best Model obtained from GridSearchCV
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy: {acc * 100:.2f}%')

prec = precision_score(y_test, y_pred, average = 'weighted')
print(f'Precision: {prec * 100:.2f}%')

rec = recall_score(y_test, y_pred, average = 'weighted')
print(f'Recall: {rec * 100:.2f}%')

f1 = f1_score(y_test, y_pred, average = 'weighted')
print(f'F1 Score: {f1 * 100:.2f}%')

Accuracy: 97.73%
Precision: 97.45%
Recall: 97.73%
F1 Score: 97.56%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
