In [8]:
import numpy as np
import seaborn as sns
import pandas as pd

from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score,classification_report

In [3]:
data = pd.read_csv('WineQuality.csv')
print(data)

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0          7.400000          0.700000     0.000000        1.900000   0.076000   
1          7.800000          0.880000     0.000000        2.600000   0.098000   
2          7.800000          0.760000     0.040000        2.300000   0.092000   
3         11.200000          0.280000     0.560000        1.900000   0.075000   
4          7.400000          0.700000     0.000000        1.900000   0.076000   
...             ...               ...          ...             ...        ...   
1694       7.801551          0.774154     0.241745        2.715859   0.079321   
1695       7.207143          0.290908     0.590842        3.962575   0.055079   
1696      10.100031          0.964682     0.540634        3.333877   0.115459   
1697       5.372036          0.510212     0.078397        0.982824   0.095933   
1698       9.277331          0.354429     0.291503        0.900000   0.059163   

      free sulfur dioxide  

In [4]:
print(data.isnull().sum())

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [5]:
print(data['quality'].value_counts())

quality
5.000000    681
6.000000    638
7.000000    199
4.000000     53
8.000000     18
           ... 
4.967976      1
4.499398      1
5.517263      1
7.325771      1
6.737661      1
Name: count, Length: 106, dtype: int64


In [18]:
X = data.drop("quality",axis=1)
y=data["quality"]

In [20]:
y=data["quality"].astype(int)

In [21]:
X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.2,random_state=42
)

In [22]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

print("Accuracy:",accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 0.5205882352941177
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        14
           5       0.60      0.67      0.63       146
           6       0.47      0.53      0.49       135
           7       0.33      0.20      0.25        40
           8       0.00      0.00      0.00         4

    accuracy                           0.52       340
   macro avg       0.23      0.23      0.23       340
weighted avg       0.48      0.52      0.50       340



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [25]:
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
print(classification_report(y_test,knn.predict(X_test)))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.25      0.07      0.11        14
           5       0.61      0.68      0.64       146
           6       0.52      0.58      0.55       135
           7       0.50      0.33      0.39        40
           8       0.00      0.00      0.00         4

    accuracy                           0.56       340
   macro avg       0.31      0.28      0.28       340
weighted avg       0.54      0.56      0.55       340



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [27]:
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
print(classification_report(y_test,dt.predict(X_test)))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.20      0.14      0.17        14
           5       0.70      0.65      0.67       146
           6       0.61      0.65      0.63       135
           7       0.40      0.45      0.42        40
           8       0.50      0.25      0.33         4

    accuracy                           0.60       340
   macro avg       0.40      0.36      0.37       340
weighted avg       0.60      0.60      0.60       340



In [28]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
print(classification_report(y_test,rf.predict(X_test)))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        14
           5       0.76      0.75      0.75       146
           6       0.59      0.73      0.66       135
           7       0.59      0.40      0.48        40
           8       0.50      0.25      0.33         4

    accuracy                           0.66       340
   macro avg       0.41      0.35      0.37       340
weighted avg       0.64      0.66      0.64       340



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [30]:
param_lr = {
    "C":[0.01,0.1,1,10],
    "penalty":["l2"],
    "solver":["lbfgs"]
}
grid_lr = GridSearchCV(LogisticRegression(max_iter=2000),
                       param_lr,cv=5)
grid_lr.fit(X_train,y_train)

print("Best Params:",grid_lr.best_params_)
print("Best CV Score:",grid_lr.best_score_)

Best Params: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best CV Score: 0.5997313870197525


In [31]:
param_knn = {"n_neighbors":[3,5,7,9]}

grid_knn = GridSearchCV(KNeighborsClassifier(),
                        param_knn,cv=5)

grid_knn.fit(X_train,y_train)

print(grid_knn.best_params_)
print(grid_knn.best_score_)

{'n_neighbors': 9}
0.572465812893423


In [33]:
param_dt = {
    "max_depth":[None,5,10,20],
    "min_samples_split":[2,5,10]
}

grid_dt = GridSearchCV(DecisionTreeClassifier(),
                       param_dt,cv=5)

grid_dt.fit(X_train,y_train)
print(grid_dt.best_params_)
print(grid_dt.best_score_)

{'max_depth': None, 'min_samples_split': 2}
0.5871798350336445


In [35]:
param_rf = {
    "n_estimators":[100,200],
    "max_depth":[None,10,20],
    "min_samples_split":[2,5]
}

grid_rf = GridSearchCV(RandomForestClassifier(),
                       param_rf,cv=5)

grid_rf.fit(X_train,y_train)
print(grid_rf.best_params_)
print(grid_rf.best_score_)

{'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
0.6784539830692424


In [37]:
param_dist_lr = {
    "C":np.logspace(-4,4,20),
    "penalty":["l2"],
    "solver":["lbfgs","liblinear"]
}

rand_lr = RandomizedSearchCV(
    LogisticRegression(max_iter=3000),
    param_distributions=param_dist_lr,
    n_iter=20,
    cv=5,
    random_state=42,
    n_jobs=-1
)
rand_lr.fit(X_train,y_train)
print("Best Parameters:",rand_lr.best_params_)
print("Best Score:",rand_lr.best_score_)

Best Parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': np.float64(0.23357214690901212)}
Best Score: 0.5997313870197525


In [40]:
param_dist_knn = {
    "n_neighbors":np.arange(3,32),
    "weights":["uniform","distance"],
    "metrics":["euclidean","manhattan","minknowski"]
}
rand_knn = RandomizedSearchCV(
    KNeighborsClassifier(),
    param_distributions = param_dist_knn,
    n_iter=20,
    cv=5,
    random_state=42,
    n_jobs=-1
)
rand_knn.fit(X_train,y_train)

print("Best Parameters:",rand_knn.best_params_)
print("Best Score:",rand_knn.best_score_)

ValueError: Invalid parameter 'metrics' for estimator KNeighborsClassifier(n_neighbors=np.int64(22), weights='distance'). Valid parameters are: ['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'].