# Predict color of wine

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats


In [2]:
red_wine_original = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
white_wine_original= pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', sep=';')

In [3]:
white_wine=white_wine_original.copy()
red_wine=red_wine_original.copy()



white_wine_to_concat=white_wine.copy()
white_wine_to_concat['color']='white'
red_wine_to_concat=red_wine.copy()
red_wine_to_concat['color']='red'

frames = [red_wine_to_concat, white_wine_to_concat]

#Over-sampling: is creating copies of the minority classes to even-up the classes.
#frames = [red_wine_to_concat, white_wine_to_concat, red_wine_to_concat,red_wine_to_concat,]


wines = pd.concat(frames)


In [4]:
wines.describe()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0
mean,7.215307,0.339666,0.318633,5.443235,0.056034,30.525319,115.744574,0.994697,3.218501,0.531268,10.491801,5.818378
std,1.296434,0.164636,0.145318,4.757804,0.035034,17.7494,56.521855,0.002999,0.160787,0.148806,1.192712,0.873255
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,3.0
25%,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5,5.0
50%,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3,6.0
75%,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,6.0
max,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,9.0


## Remove Outliers/ legally not acceptable wine

- __Volatile acidity:__    In the U.S, the legal limits of Volatile Acidity are 1.2 g/L for red table wine and 1.1 g/L for white table wine. 

- __Total sulfur dioxide:__   There are legal limits for sulfur levels in wines: in the EU, red wines can only have 160mg/L, while white and rose wines can have about 210mg/L. Sweet wines are allowed to have 400mg/L. For the US, the legal limits are set at 350mg/L, and for Australia, this is 250mg/L.

- __pH:__   Less than 7 are acidic, while solutions with a pH greater than 7 are basic. With a pH of 7, pure water is neutral. Most wines have a pH between 2.9 and 3.9 and are therefore acidic.



### Find wines out of legal limit

In [5]:
count_outliers=wines[wines['volatile acidity']>1.1].shape #8 rows
print(count_outliers)
display_outliers=wines[wines['volatile acidity']>1.1]
print(display_outliers)

count_outliers=wines[wines['total sulfur dioxide']>350].shape #2
print(count_outliers)
wines[(wines['total sulfur dioxide']>210)&(wines['color']=='red')]
wines[(wines['total sulfur dioxide']>400)]

(8, 13)
      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
38              5.7             1.130         0.09            1.50      0.172   
126             8.2             1.330         0.00            1.70      0.081   
127             8.1             1.330         0.00            1.80      0.082   
672             9.8             1.240         0.34            2.00      0.079   
690             7.4             1.185         0.00            4.25      0.097   
724             7.5             1.115         0.10            3.10      0.086   
1299            7.6             1.580         0.00            2.10      0.137   
1312            8.0             1.180         0.21            1.90      0.083   

      free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
38                    7.0                  19.0  0.99400  3.50       0.48   
126                   3.0                  12.0  0.99640  3.53       0.49   
127                   3.0      

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
4745,6.1,0.26,0.25,2.9,0.047,289.0,440.0,0.99314,3.44,0.64,10.5,3,white


### Remove non-legal and outlier

In [6]:
#get index of outliers
get_index= wines.index[wines['volatile acidity']>1.1].tolist()
print(get_index)

#drop outliers
wines_without_outliers=wines.drop(index=get_index, axis=0)

# check if any outliers remained
wines_without_outliers.index[wines_without_outliers['volatile acidity']>1.1].tolist()


[38, 126, 127, 672, 690, 724, 1299, 1312]


[]

# Label encoder

- encode ordinal data
- for the target value y.

In [7]:
from sklearn.preprocessing import LabelEncoder

#cheque for unique values in quality_label column

wines_without_outliers['color'].unique()


le = LabelEncoder()


le.fit(wines_without_outliers[['color']])

print(le.classes_)

df_transformed_color=pd.DataFrame(le.transform(wines_without_outliers[['color']]))

#print(df_transformed_color)

enc_wines_ml=wines_without_outliers.copy()

#print(enc_wines_ml['color'])

#replace color with encoded values
enc_wines_ml['color']=df_transformed_color

#print(le.inverse_transform([0,0, 1]))
#print(le.transform(["red", "red", "white"]))



enc_wines_ml.head()

['red' 'white']


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


## Split data

- X = df # --> the features we will keep to build our model
- y = target # --> what you're trying to predict
- test-size: prportion of dataset to use for test
- random_state: shuffling applied to the data before applying the split

In [8]:
from sklearn.model_selection import train_test_split

X=enc_wines_ml.copy()


# Example:
y=enc_wines_ml[['color']]
X.drop(['color'],axis=1,inplace=True)


X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
# 						    random_state=42,
# 						    stratify=y)



In [9]:
#to know the shape of the train and test dataset.
print('X_train',X_train.shape)
print('y_train',y_train.shape)
print('X_test',X_test.shape)
print('y_test',y_test.shape)

X_train (5184, 12)
y_train (5184, 1)
X_test (1297, 12)
y_test (1297, 1)


## Feature scaling

- Use normalisation techniques when you know that the distribution of your data is skewed.
- On the other hand, standardisation can be helpful in cases where the data follows a Gaussian distribution (normal distribution). Also, outliers will not be affected by standardisation.

### Normalisation

In [10]:

# data normalisation with sklearn
from sklearn.preprocessing import MinMaxScaler

# fit scaler on training data
norm = MinMaxScaler().fit(X_train)

# transform training data
X_train_norm = norm.transform(X_train)

# transform testing data
X_test_norm = norm.transform(X_test)







## Logistic regression



Fitting the model means training the model on training data using the .fit method provided in sklearn.

In [11]:
from sklearn.linear_model import LogisticRegression

# Fit the model
lr = LogisticRegression()
lr.fit(X_train_norm, y_train)

pred_lr = lr.predict(X_test_norm)
pred_lr

  y = column_or_1d(y, warn=True)


array([1, 1, 1, ..., 0, 0, 1])

## Evaluate model

class color:
   - PURPLE = '\033[95m'
   - CYAN = '\033[96m'
   - DARKCYAN = '\033[36m'
   - BLUE = '\033[94m'
   - GREEN = '\033[92m'
   -  YELLOW = '\033[93m'
   - RED = '\033[91m'
   - BOLD = '\033[1m'
   - UNDERLINE = '\033[4m'
   - END = '\033[0m'

print(color.BOLD + 'Hello, World!' + color.END)

In [12]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score

def evaluate (y_test, pred_lr, target_names):
    print(" \033[95m \033[1m Confusion matrix: \033[0m")
    cm=confusion_matrix(y_test, pred_lr)
    print(cm, '\n\n')
    print("\033[95m \033[1m Accuracy score: \033[0m", np.round(accuracy_score(y_test, pred_lr)*100, 2), '%\n\n')
    print("\033[95m \033[1m Classification report: \033[0m \n",classification_report(y_test, pred_lr, target_names=target_names), '\n\n')
    kappa = cohen_kappa_score(pred_lr, y_test)
    print("\033[95m \033[1m Cohen-Kappa score: \033[0m", kappa)
    
evaluate(y_test, pred_lr,["red","white"])

 [95m [1m Confusion matrix: [0m
[[425 230]
 [100 542]] 


[95m [1m Accuracy score: [0m 74.56 %


[95m [1m Classification report: [0m 
               precision    recall  f1-score   support

         red       0.81      0.65      0.72       655
       white       0.70      0.84      0.77       642

    accuracy                           0.75      1297
   macro avg       0.76      0.75      0.74      1297
weighted avg       0.76      0.75      0.74      1297
 


[95m [1m Cohen-Kappa score: [0m 0.492102858634643


## SVC (Support Vector classifier)

In [13]:
#We use Support Vector classifier as a classifier
from sklearn.svm import SVC

#training the classifier using X_Train and y_train 
clf = SVC(kernel = 'linear').fit(X_train,y_train)
clf.predict(X_train)

#Testing the model using X_test and storing the output in y_pred
y_pred = clf.predict(X_test)

  y = column_or_1d(y, warn=True)


In [14]:
evaluate(y_test, y_pred,["red","white"])

 [95m [1m Confusion matrix: [0m
[[401 254]
 [ 61 581]] 


[95m [1m Accuracy score: [0m 75.71 %


[95m [1m Classification report: [0m 
               precision    recall  f1-score   support

         red       0.87      0.61      0.72       655
       white       0.70      0.90      0.79       642

    accuracy                           0.76      1297
   macro avg       0.78      0.76      0.75      1297
weighted avg       0.78      0.76      0.75      1297
 


[95m [1m Cohen-Kappa score: [0m 0.5156598054127363


## AdaBoostClassifier

In [15]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=12 ,  n_informative=2, n_redundant=0, random_state=0, shuffle=False)
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X, y)
y_pred = clf.predict(X_test)

evaluate(y_test, y_pred,["red","white"])




 [95m [1m Confusion matrix: [0m
[[  0 655]
 [  0 642]] 


[95m [1m Accuracy score: [0m 49.5 %


[95m [1m Classification report: [0m 
               precision    recall  f1-score   support

         red       0.00      0.00      0.00       655
       white       0.49      1.00      0.66       642

    accuracy                           0.49      1297
   macro avg       0.25      0.50      0.33      1297
weighted avg       0.25      0.49      0.33      1297
 


[95m [1m Cohen-Kappa score: [0m 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
