## Sketch of ML models

In [3]:
print("hello world!")

hello world!


Relevant library/modules import

In [72]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn import preprocessing as sk

EDA and preparation

In [5]:
df_red = pd.read_csv('winequality-red.csv', sep= ';')

In [6]:
df_white = pd.read_csv('winequality-white.csv', sep= ';')

In [7]:
# creating a new column called "quality_label", we define a range and associate that range with a label
df_red ['quality_label'] = df_red['quality'].apply(lambda value: 'low'
if value <= 5 else 'medium'
if value <= 7 else 'high')

# transforming these labels into categrical data type (specific to pandas) instead of simple string
df_red ['quality_label'] = pd.Categorical(df_red['quality_label'],
categories=['low', 'medium', 'high'])

In [8]:
# creating a new column called "quality_label", we define a range and associate that range with a label
df_white ['quality_label'] = df_white['quality'].apply(lambda value: 'low'
if value <= 5 else 'medium'
if value <= 7 else 'high')

# transforming these labels into categrical data type (specific to pandas) instead of simple string
df_white['quality_label'] = pd.Categorical(df_white['quality_label'],
categories=['low', 'medium', 'high'])

In [9]:
# Addition of column of type of wine (white or red) before concatenating (Pandas)
df_white ['type'] = df_white.apply(lambda value: 'white')
df_white ['type'] = df_white['type'].fillna('white')


In [10]:
# Addition of column of type of wine (white or red) before concatenating (Pandas)
df_red['type'] = df_red.apply(lambda value: 'red')
df_red['type'] = df_red['type'].fillna('red')


In [11]:
# DF merge
df_mix = pd.concat([df_red, df_white]) 


In [12]:
#index re-adjustment
df_mix.reset_index(inplace=True)

In [13]:
#index re-adjustment
df_mix.pop('index')

0          0
1          1
2          2
3          3
4          4
        ... 
6492    4893
6493    4894
6494    4895
6495    4896
6496    4897
Name: index, Length: 6497, dtype: int64

ML logistic regression model prep

In [14]:
#new data drame for ML purposes
dfm=df_mix.copy()

In [15]:
df_mix.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_label,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,red


In [16]:
#data encoding to avoid having cat/obj data fed into the ML model
dfm['quality_label'].replace({'low':0, 'medium':1, 'high':2},inplace=True)
dfm['type'].replace({'red':0, 'white':1}, inplace=True) # object encoder 


In [17]:
#encoding categorical data with "pd.get_dummies"
cat_col_qlabel = dfm.select_dtypes('category')
cat_col_qlabel_encode = pd.get_dummies(cat_col_qlabel, dtype=int)


In [18]:
# category columns are deleted to be replaced with the boolean "1" or "0" option for use in the ML model as "low-to-high classification" is not suitable
dfm.drop(columns=cat_col_qlabel, axis=1, inplace=True)

In [19]:
#inserting the new new 'encoded' columns into a new labelled DF   
dfml = pd.concat([dfm, cat_col_qlabel_encode], axis=1)

In [20]:
#encoded columns are now inserted as shown below
dfml.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type,quality_label_0,quality_label_1,quality_label_2
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,1,0,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0,1,0,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0,1,0,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0,0,1,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,1,0,0


In [21]:
#splitting module
from sklearn.model_selection import train_test_split

In [22]:
#split data in x and y

X = dfml.drop('type', axis=1)
y = dfml['type']

In [23]:
#split data between test and train data (validation set is not defined with train_test_split)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.15, random_state=42)

In [24]:
# reviewing shape partition/split according with the 'test_size' described above → train/test should correspond to the share allocated to each
print('X_test accounts to:', X_test.shape) 
print('X_train accounts to:', X_train.shape)
print('y_test accounts to:', y_test.shape)
print('y_train accounts to:', y_train.shape)

X_test accounts to: (975, 15)
X_train accounts to: (5522, 15)
y_test accounts to: (975,)
y_train accounts to: (5522,)


In [25]:
from sklearn.linear_model import LogisticRegression 

In [26]:
# defining the ML model with a defined random state matching the one defined above in the splitted data
logreg_model = LogisticRegression(random_state=42)

In [27]:
#fitting the model with the data purposed for training 
logreg_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
#predicting with the model

predict = logreg_model.predict(X_test)

In [29]:
# wine type prediction depicted below 1 and 0 accounting to red and white, respectively 
predict

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [30]:
#accuracy function is imported to derive the classification accuracy score 

from sklearn.metrics import accuracy_score

In [31]:
#accuracy of the ML model 
acc = accuracy_score(y_test, predict)
print(acc)

0.9702564102564103


In [32]:
#testing the model with new batch of random generated data for wines without the corresponding red/white classification

# Define column names
columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 
           'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 
           'pH', 'sulphates', 'alcohol', 'quality', 'quality_label_0', 
           'quality_label_1', 'quality_label_2']

# Define ranges for each column
ranges = {
    'fixed acidity': (4, 10),
    'volatile acidity': (0, 1),
    'citric acid': (0, 1),
    'residual sugar': (2, 55),
    'chlorides': (0.001, 0.09),
    'free sulfur dioxide': (15, 50),
    'total sulfur dioxide': (110, 250),
    'density': (0, 1),
    'pH': (3.01, 3.50),
    'sulphates': (0.20, 0.70),
    'alcohol': (7, 18),
    'quality': (1, 10),
    'quality_label_0': (0, 1),
    'quality_label_1': (0, 1),
    'quality_label_2': (0, 1)
}

# Generate random data
data = {col: np.random.uniform(low, high, 100) if col not in ['quality_label_0', 'quality_label_1', 'quality_label_2']
        else np.random.randint(low, high+1, 100) for col, (low, high) in ranges.items()}

# Create DataFrame
X1_test = pd.DataFrame(data)

# Display DataFrame
X1_test.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_label_0,quality_label_1,quality_label_2
0,5.420366,0.838125,0.485466,51.242763,0.07202,31.608784,231.528796,0.470176,3.081182,0.336496,16.250256,5.208662,0,1,1
1,9.064723,0.820357,0.964907,9.581566,0.014119,17.960118,173.749115,0.653882,3.087379,0.268555,11.393742,7.639644,0,1,1
2,4.620708,0.859668,0.220488,48.22177,0.066518,33.585442,189.359842,0.569028,3.131973,0.64395,16.733331,7.643424,0,1,1
3,8.660581,0.52375,0.409252,4.50803,0.04875,40.925976,188.343171,0.730383,3.083218,0.449915,13.930326,8.306166,0,1,1
4,6.005148,0.330797,0.038293,39.141322,0.072146,29.912804,194.201673,0.760439,3.198601,0.665385,8.71647,5.434418,0,1,1


In [33]:
#prediction for the new DF of unclassified wines
predict1 = logreg_model.predict(X1_test)
print(predict1)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


ML random forest model prep

In [34]:
#import of the random forest module to build this type of ML models 
from sklearn.ensemble import RandomForestClassifier

In [35]:
# random forest model build-up

RF_model = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)



In [36]:
#fitting the RF model

RF_model.fit(X_train, y_train)


In [37]:

#prediction of red/white wine classification using the RF model, note → the model returns different output than LogReg model 
RF_prediction = RF_model.predict(X_test)
RF_prediction


array([1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [38]:
# obtining accuracy score of the model, note → it is higher than the obtained above
RF_accuracy_score = accuracy_score(y_test,RF_prediction)
print(RF_accuracy_score)

0.9958974358974358


In [39]:
# running the model on the random batch data, note → as above, the model returns different output than LogReg model 
RF_model.predict(X1_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

Performing Logistic Regression model for wine "quality_label" instead of the wine type

In [40]:
# obtaining clean ml model
df1 = df_mix.copy()
df1['type'].replace({'red':0, 'white':1}, inplace=True) #converting to numeric data

In [41]:
# importing the label encoder function used to encode ordinal variables for target value y (in this case 'quality')

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [42]:
#Using ordinal encoder for the 'quality_label' column

from sklearn.preprocessing import OrdinalEncoder

quality_order = [['low', 'medium', 'high']]

o_enc = OrdinalEncoder(categories=quality_order)

df2 = df_mix.copy()

encoded_data = o_enc.fit_transform(df2[['quality_label']])

df2['quality_label_encoded']= encoded_data.astype(int)


In [43]:
#review of the value for each encoded category 
encoded_q_labels =  o_enc.categories_
for i, category in enumerate(encoded_q_labels[0]):
    print(f"{category} corresponds to encoded value: {i}")

low corresponds to encoded value: 0
medium corresponds to encoded value: 1
high corresponds to encoded value: 2


In [44]:
#encoding the quality label using label encoder
df1['quality_label_encoded'] =le.fit_transform(df1['quality_label'])


In [45]:
df1.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_label,type,quality_label_encoded
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,0,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,0,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,0,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,0,2
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,0,1


In [46]:
#no longer useful as it is not numeric column
df1m= df1.drop('quality_label', axis=1)

In [47]:
df1m.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type,quality_label_encoded
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0,2
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,1


In [48]:
class_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Class to Encoded Value Mapping:")
print(class_mapping)

Class to Encoded Value Mapping:
{'high': 0, 'low': 1, 'medium': 2}


In [49]:
df1.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_label,type,quality_label_encoded
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,0,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,0,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,0,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,0,2
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,0,1


In [50]:
df1m['quality_label_encoded'].value_counts()

quality_label_encoded
2    3915
1    2384
0     198
Name: count, dtype: int64

In [51]:
#splitting data in x and y 

X1 = df1m.drop('quality_label_encoded', axis=1)
y1 = df1m['quality_label_encoded']

In [52]:
#Setup of the training data 
X1_train, X1_test, y1_train, y1_test = train_test_split(X1,y1,test_size=0.2, random_state=42)


In [53]:
# reviewing shape partition/split according with the 'test_size' described above → train/test should correspond to the share allocated to each
print('X_test accounts to:', X1_test.shape) 
print('X_train accounts to:', X1_train.shape)
print('y_test accounts to:', y1_test.shape)
print('y_train accounts to:', y1_train.shape)

X_test accounts to: (1300, 13)
X_train accounts to: (5197, 13)
y_test accounts to: (1300,)
y_train accounts to: (5197,)


In [54]:
#training the model with the train data
logreg_model.fit(X1_train, y1_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
predictquality = logreg_model.predict(X1_test) 

In [56]:
predictquality

array([2, 1, 2, ..., 2, 2, 2])

In [57]:
accquality = accuracy_score(y1_test, predictquality)
print(accquality)

0.9423076923076923


Performing Random Forest model for wine "quality_label" - with duplicates

In [58]:
RF_model.fit(X1_train, y1_train)

In [59]:
RF1_prediction = RF_model.predict(X1_test)
RF1_prediction


array([0, 1, 2, ..., 2, 2, 2])

In [60]:
# obtining accuracy score of the model
RF1_accuracy_score = accuracy_score(y1_test,RF1_prediction)
print(RF1_accuracy_score)

0.9992307692307693


Droping duplicates

In [61]:
df1m_NoDup = df1m.drop_duplicates()

Performing Logistic Regression model for wine "quality_label" - No duplicates

In [62]:
X1d = df1m_NoDup.drop('quality_label_encoded', axis=1)
y1d = df1m_NoDup['quality_label_encoded']

In [63]:
X1d_train, X1d_test, y1d_train, y1d_test = train_test_split(X1d,y1d,test_size=0.2, random_state=42)

In [64]:
print('X_test accounts to:', X1d_test.shape) 
print('X_train accounts to:', X1d_train.shape)
print('y_test accounts to:', y1d_test.shape)
print('y_train accounts to:', y1d_train.shape)

X_test accounts to: (1064, 13)
X_train accounts to: (4256, 13)
y_test accounts to: (1064,)
y_train accounts to: (4256,)


In [65]:
logreg_model.fit(X1d_train, y1d_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [66]:
predict_qnd = logreg_model.predict(X1d_test)
predict_qnd

array([2, 2, 2, ..., 2, 2, 2])

In [67]:
acc_quality_nd = accuracy_score(y1d_test, predict_qnd)
print(acc_quality_nd)

0.9492481203007519


Performing Random Forest model for wine "quality_label" - No duplicates

In [68]:
RF_model.fit(X1d_train, y1d_train)


In [69]:
RF_prediction_nd = RF_model.predict(X1d_test)
RF_prediction_nd

array([2, 2, 2, ..., 2, 2, 2])

In [70]:
RF_acc_nd = accuracy_score(y1d_test, RF_prediction_nd)
print(RF_acc_nd)

0.9990601503759399


Droping outliers

Running LR and RF model without outliers

Running LR and RF model without duplicates and outliers

Performing "Feature Scaling" on models above

In [71]:
# Importing feature scaling functions from the preprocessing module of sklearn 
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
