## Sketch of ML models

In [503]:
print("hello world!")

hello world!


Relevant library/modules import

In [504]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn import preprocessing as sk

EDA and preparation

In [505]:
df_red = pd.read_csv('winequality-red.csv', sep= ';') # sep to identify the ";" value separation (observable in csv file) 

In [506]:
df_white = pd.read_csv('winequality-white.csv', sep= ';') # sep to identify the ";" value separation (observable in csv file) 

In [507]:
# creating a new column called "quality_label", we define a range and associate that range with a label
df_red ['quality_label'] = df_red['quality'].apply(lambda value: 'low'
if value <= 5 else 'medium'
if value <= 7 else 'high')

# transforming these labels into categrical data type (specific to pandas) instead of simple string
df_red ['quality_label'] = pd.Categorical(df_red['quality_label'],
categories=['low', 'medium', 'high'])

In [508]:
# creating a new column called "quality_label", we define a range and associate that range with a label
df_white ['quality_label'] = df_white['quality'].apply(lambda value: 'low'
if value <= 5 else 'medium'
if value <= 7 else 'high')

# transforming these labels into categrical data type (specific to pandas) instead of simple string
df_white['quality_label'] = pd.Categorical(df_white['quality_label'],
categories=['low', 'medium', 'high'])

In [509]:
# Addition of column of type of wine (white or red) before concatenating (Pandas)
df_white ['type'] = df_white.apply(lambda value: 'white')
df_white ['type'] = df_white['type'].fillna('white')


In [510]:
# Addition of column of type of wine (white or red) before concatenating (Pandas)
df_red['type'] = df_red.apply(lambda value: 'red')
df_red['type'] = df_red['type'].fillna('red')


In [511]:
# DF merge
df_mix = pd.concat([df_red, df_white]) 


In [512]:
#index re-adjustment
df_mix.reset_index(inplace=True)

In [513]:
#index re-adjustment
df_mix.pop('index')

0          0
1          1
2          2
3          3
4          4
        ... 
6492    4893
6493    4894
6494    4895
6495    4896
6496    4897
Name: index, Length: 6497, dtype: int64

ML logistic regression model prep

In [514]:
#new data drame for ML purposes
dfm=df_mix.copy()

In [515]:
df_mix.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_label,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,red


In [516]:
#data encoding to avoid having cat/obj data fed into the ML model
dfm['quality_label'].replace({'low':0, 'medium':1, 'high':2},inplace=True)
dfm['type'].replace({'red':0, 'white':1}, inplace=True) # object encoder 


In [517]:
#encoding categorical data with "pd.get_dummies"
cat_col_qlabel = dfm.select_dtypes('category')
cat_col_qlabel_encode = pd.get_dummies(cat_col_qlabel, dtype=int)


In [518]:
# category columns are deleted to be replaced with the boolean "1" or "0" option for use in the ML model as "low-to-high classification" is not suitable
dfm.drop(columns=cat_col_qlabel, axis=1, inplace=True)

In [519]:
#inserting the new new 'encoded' columns into a new labelled DF   
dfml = pd.concat([dfm, cat_col_qlabel_encode], axis=1)

In [520]:
#encoded columns are now inserted as shown below
dfml.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type,quality_label_0,quality_label_1,quality_label_2
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,1,0,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0,1,0,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0,1,0,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0,0,1,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,1,0,0


In [521]:
#splitting module
from sklearn.model_selection import train_test_split

In [522]:
#split data in x and y

X = dfml.drop('type', axis=1)
y = dfml['type']

In [523]:
#split data between test and train data (validation set is not defined with train_test_split)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.25, random_state=42)

In [524]:
# reviewing shape partition/split according with the 'test_size' described above → train/test should correspond to the share allocated to each
print('X_test accounts to:', X_test.shape) 
print('X_train accounts to:', X_train.shape)
print('y_test accounts to:', y_test.shape)
print('y_train accounts to:', y_train.shape)

X_test accounts to: (1625, 15)
X_train accounts to: (4872, 15)
y_test accounts to: (1625,)
y_train accounts to: (4872,)


In [525]:
from sklearn.linear_model import LogisticRegression 

In [526]:
# defining the ML model with a defined random state matching the one defined above in the splitted data
logreg_model = LogisticRegression(random_state=42)

In [527]:
#fitting the model with the data purposed for training 
logreg_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [528]:
#predicting with the model

predict = logreg_model.predict(X_test)

In [529]:
# wine type prediction depicted below 1 and 0 accounting to red and white, respectively 
predict

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [530]:
#accuracy function is imported to derive the classification accuracy score 

from sklearn.metrics import accuracy_score

In [531]:
#accuracy of the ML model 
acc = accuracy_score(y_test, predict)
print(acc)

0.9747692307692307


In [532]:
#testing the model with new batch of random generated data for wines without the corresponding red/white classification

# Define column names
columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 
           'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 
           'pH', 'sulphates', 'alcohol', 'quality', 'quality_label_0', 
           'quality_label_1', 'quality_label_2']

# Define ranges for each column
ranges = {
    'fixed acidity': (4, 10),
    'volatile acidity': (0, 1),
    'citric acid': (0, 1),
    'residual sugar': (2, 55),
    'chlorides': (0.001, 0.09),
    'free sulfur dioxide': (15, 50),
    'total sulfur dioxide': (110, 250),
    'density': (0, 1),
    'pH': (3.01, 3.50),
    'sulphates': (0.20, 0.70),
    'alcohol': (7, 18),
    'quality': (1, 10),
    'quality_label_0': (0, 1),
    'quality_label_1': (0, 1),
    'quality_label_2': (0, 1)
}

# Generate random data
data = {col: np.random.uniform(low, high, 100) if col not in ['quality_label_0', 'quality_label_1', 'quality_label_2']
        else np.random.randint(low, high+1, 100) for col, (low, high) in ranges.items()}

# Create DataFrame
X1_test = pd.DataFrame(data)

# Display DataFrame
X1_test.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_label_0,quality_label_1,quality_label_2
0,5.05903,0.16155,0.575435,12.73317,0.035032,26.866997,179.058482,0.833911,3.16056,0.327643,8.37927,6.150913,0,0,0
1,9.394856,0.80083,0.717624,9.294494,0.006341,32.438351,149.643809,0.715357,3.223553,0.393767,13.501923,4.523468,1,1,0
2,7.716791,0.688023,0.354803,34.428726,0.059231,46.866322,245.322785,0.040267,3.409503,0.226078,8.730254,9.981914,0,0,0
3,8.673536,0.245489,0.888049,27.780288,0.056128,24.249147,144.092755,0.026557,3.495129,0.258519,8.218014,7.451644,0,1,0
4,8.369534,0.211818,0.787215,21.166347,0.063276,32.042368,182.122689,0.831456,3.365289,0.366602,7.918185,9.719793,1,1,1


In [533]:
#prediction for the new DF of unclassified wines
predict1 = logreg_model.predict(X1_test)
print(predict1)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1]


ML random forest model prep for wine type (with duplicates)

In [534]:
#import of the random forest module to build this type of ML models 
from sklearn.ensemble import RandomForestClassifier

In [535]:
# random forest model build-up

RF_model = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=62)



In [536]:
#fitting the RF model

RF_model.fit(X_train, y_train)


In [537]:

#prediction of red/white wine classification using the RF model, note → the model returns different output than LogReg model 
RF_prediction = RF_model.predict(X_test)
RF_prediction


array([1, 0, 1, ..., 1, 1, 1], dtype=int64)

In [538]:
# obtining accuracy score of the model, note → it is higher than the obtained above
RF_accuracy_score = accuracy_score(y_test,RF_prediction)
print(RF_accuracy_score)

0.9956923076923077


In [539]:
# running the model on the random batch data, note → as above, the model returns different output than LogReg model 
RF_model.predict(X1_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

Performing Logistic Regression model for wine "quality_label" (with duplicates)

In [540]:
# obtaining clean ml model
df1 = df_mix.copy()
df1['type'].replace({'red':0, 'white':1}, inplace=True) #converting to numeric data

In [541]:
# importing the label encoder function used to encode ordinal variables for target value y (in this case 'quality')

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [542]:
#Using ordinal encoder for the 'quality_label' column

from sklearn.preprocessing import OrdinalEncoder

quality_order = [['low', 'medium', 'high']]

o_enc = OrdinalEncoder(categories=quality_order)

df2 = df_mix.copy()

encoded_data = o_enc.fit_transform(df2[['quality_label']])

df2['quality_label_encoded']= encoded_data.astype(int)


In [543]:
#review of the value for each encoded category 
encoded_q_labels =  o_enc.categories_
for i, category in enumerate(encoded_q_labels[0]):
    print(f"{category} corresponds to encoded value: {i}")

low corresponds to encoded value: 0
medium corresponds to encoded value: 1
high corresponds to encoded value: 2


In [544]:
#encoding the quality label using label encoder
df1['quality_label_encoded'] =le.fit_transform(df1['quality_label'])


In [545]:
df1.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_label,type,quality_label_encoded
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,0,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,0,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,0,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,0,2
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,0,1


In [546]:
#no longer useful as it is not numeric column
df1m= df1.drop('quality_label', axis=1)

In [547]:
df1m['quality_label_encoded'].value_counts()

quality_label_encoded
2    3915
1    2384
0     198
Name: count, dtype: int64

In [548]:
class_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Class to Encoded Value Mapping:")
print(class_mapping)

Class to Encoded Value Mapping:
{'high': 0, 'low': 1, 'medium': 2}


In [549]:
df1.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_label,type,quality_label_encoded
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,0,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,0,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,0,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,0,2
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,0,1


In [550]:
df1m['quality_label_encoded'].value_counts()

quality_label_encoded
2    3915
1    2384
0     198
Name: count, dtype: int64

In [551]:
#splitting data in x and y 

X1 = df1m.drop('quality_label_encoded', axis=1)
y1 = df1m['quality_label_encoded']

In [552]:
#Setup of the training data 
X1_train, X1_test, y1_train, y1_test = train_test_split(X1,y1,test_size=0.3, random_state=42)


In [553]:
# reviewing shape partition/split according with the 'test_size' described above → train/test should correspond to the share allocated to each
print('X1_test accounts to:', X1_test.shape) 
print('X1_train accounts to:', X1_train.shape)
print('y1_test accounts to:', y1_test.shape)
print('y1_train accounts to:', y1_train.shape)

X1_test accounts to: (1950, 13)
X1_train accounts to: (4547, 13)
y1_test accounts to: (1950,)
y1_train accounts to: (4547,)


In [554]:
#training the model with the train data
logreg_model.fit(X1_train, y1_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [555]:
predictquality = logreg_model.predict(X1_test) 

In [556]:
predictquality

array([2, 1, 2, ..., 2, 2, 2])

In [557]:
accquality = accuracy_score(y1_test, predictquality)
print(accquality)

0.9446153846153846


Performing Random Forest model for wine "quality_label" (with duplicates)

In [558]:
RF_model.fit(X1_train, y1_train)

In [559]:
RF1_prediction = RF_model.predict(X1_test)
RF1_prediction


array([0, 1, 2, ..., 2, 2, 2])

In [560]:
# obtining accuracy score of the model
RF1_accuracy_score = accuracy_score(y1_test,RF1_prediction)
print(RF1_accuracy_score)

1.0


Droping duplicates

In [561]:
df1m_NoDup = df1m.drop_duplicates()

Performing Logistic Regression model for wine "quality_label" (without duplicates)

In [562]:
X1d = df1m_NoDup.drop('quality_label_encoded', axis=1)
y1d = df1m_NoDup['quality_label_encoded']

In [563]:
X1d_train, X1d_test, y1d_train, y1d_test = train_test_split(X1d,y1d,test_size=0.2, random_state=42)

In [564]:
print('X_test accounts to:', X1d_test.shape) 
print('X_train accounts to:', X1d_train.shape)
print('y_test accounts to:', y1d_test.shape)
print('y_train accounts to:', y1d_train.shape)

X_test accounts to: (1064, 13)
X_train accounts to: (4256, 13)
y_test accounts to: (1064,)
y_train accounts to: (4256,)


In [565]:
logreg_model.fit(X1d_train, y1d_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [566]:
predict_qnd = logreg_model.predict(X1d_test)
predict_qnd

array([2, 2, 2, ..., 2, 2, 2])

In [567]:
acc_quality_nd = accuracy_score(y1d_test, predict_qnd)
print(acc_quality_nd)

0.9492481203007519


Performing Random Forest model for wine "quality_label" (without duplicates)

In [568]:
RF_model.fit(X1d_train, y1d_train)


In [569]:
RF_quality_prediction_nd = RF_model.predict(X1d_test)
RF_quality_prediction_nd

array([2, 2, 2, ..., 2, 2, 2])

In [570]:
RF_acc_nd = accuracy_score(y1d_test, RF_quality_prediction_nd) # accuracy test as high as 1 should point out to a fault in process
print(RF_acc_nd)

1.0


Logistic regression for wine type (without duplicates)

In [571]:
dfml_nd = dfml.drop_duplicates()

In [572]:
X_nd = dfml_nd.drop('type', axis=1)
y_nd = dfml_nd['type']

In [573]:
Xnd_train, Xnd_test, ynd_train, ynd_test = train_test_split( X_nd, y_nd, test_size=0.30, random_state=42)

In [574]:
print('X_test accounts to:', Xnd_test.shape) 
print('X_train accounts to:', Xnd_train.shape)
print('y_test accounts to:', ynd_test.shape)
print('y_train accounts to:', ynd_train.shape)

X_test accounts to: (1596, 15)
X_train accounts to: (3724, 15)
y_test accounts to: (1596,)
y_train accounts to: (3724,)


In [575]:
logreg_model.fit(Xnd_train, ynd_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [576]:
predict_lr_nd = logreg_model.predict(Xnd_test)
print(predict_lr_nd)

[1 1 1 ... 0 1 1]


In [577]:
acc_type_nd = accuracy_score(ynd_test, predict_lr_nd)
print(acc_type_nd)

0.9761904761904762


Random forest for wine type (without duplicates)

In [578]:
RF_model.fit(Xnd_train, ynd_train)

In [579]:
RF_pred_nd = RF_model.predict(Xnd_test)
print(RF_pred_nd)

[1 1 1 ... 0 1 1]


In [580]:
RF_accuracy_score_nd = accuracy_score(ynd_test, RF_pred_nd)
print(RF_accuracy_score_nd)

0.9949874686716792


Droping outliers

In [581]:
# outlier analysis will depend on the variable being observed, categorical data is not as prone to outlier analysis as numerical data

Implementing Support Vector Machine (VMS) ML model fro wine type (no duplicates)

In [582]:
#import of the respective svm module  
from sklearn import svm 

In [583]:
#fitting the svm model with previously used data (no duplicates) 
clf = svm.SVC(kernel='linear').fit(Xnd_train, ynd_train) # cfl is often used as short for classifier

In [584]:
#predicting the wine type
svm_predict = clf.predict(Xnd_test)

In [585]:
# the decision function will indidicate the side and distance of each point from the "hyperlane", based on that it provides a type as shown above
clf.decision_function(Xnd_test)

array([ 3.53612045,  4.06645871,  1.75273563, ..., -3.38095953,
        1.075707  ,  3.41569435])

In [586]:
#obtaining the accuracy score for SVM (wine type/without duplicates)
accuracy_score(ynd_test, svm_predict)

0.9837092731829574

In [587]:
print(dfml_nd['quality'].astype(float).skew())

print(dfml_nd['quality'].astype(float).kurt())


0.1474673665121148
0.2981001319979337


Performing "Feature Scaling" on models above

In [588]:
# Importing feature scaling functions from the preprocessing module of sklearn 
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

In [589]:
# applying StandardScaler to the train data (wine type / without duplicates)

# fit scaler on training data
norm = MinMaxScaler().fit(Xnd_train)

# transform training data
X_train_norm = norm.transform(Xnd_train)

# transform testing data
X_test_norm = norm.transform(Xnd_test)


In [594]:
#Scaler in place with data being modift in regard to the mean at 0 and standard deviaiton at 1
X_train_norm

array([[0.23076923, 0.06      , 0.21084337, ..., 0.        , 1.        ,
        0.        ],
       [0.20512821, 0.04      , 0.23493976, ..., 0.        , 1.        ,
        0.        ],
       [0.24786325, 0.08      , 0.15060241, ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.29059829, 0.18666667, 0.37349398, ..., 0.        , 1.        ,
        0.        ],
       [0.17948718, 0.16      , 0.18072289, ..., 1.        , 0.        ,
        0.        ],
       [0.42735043, 0.16      , 0.18674699, ..., 0.        , 1.        ,
        0.        ]])

In [None]:
#logistic regression ML model ran with StandardScaler in place  

logreg_model.fit(X_train_norm, ynd_train)

predict_wscale = logreg_model.predict(X_test_norm)

print(predict_wscale)

accuracy_wscale= accuracy_score(ynd_test, predict_wscale)

print(accuracy_wscale) 



[1 1 1 ... 0 1 1]
0.9843358395989975


ML Model Evaluation

In [598]:
# confusion matrix 

from sklearn.metrics import confusion_matrix

print('Confusion matrix: \n', confusion_matrix(ynd_test, predict_wscale)) # will display true positive, true negative, false positive and false negative

Confusion matrix: 
 [[ 417   16]
 [   9 1154]]


In [603]:
# classification report

from sklearn.metrics import classification_report

print(classification_report(ynd_test, predict_wscale, target_names=["red","white"]))


              precision    recall  f1-score   support

         red       0.98      0.96      0.97       433
       white       0.99      0.99      0.99      1163

    accuracy                           0.98      1596
   macro avg       0.98      0.98      0.98      1596
weighted avg       0.98      0.98      0.98      1596

