# In [None]:
## Classifying Wine Quality Data Based on Different Supervised Learning Methods
# Data Adjustment
import pandas as pd
import numpy as np

# Student id = 05, so the assignment offset is 0.05.  It has its own name
# because `X` is reused further down the file for the feature matrix.
ID_OFFSET = 0.05

# Scores up to and including this value are labelled 'Bad'; above it, 'Good'.
QUALITY_THRESHOLD = 5


def build_extra_rows(columns, offset=ID_OFFSET):
    """Return the four assignment-specified rows as a DataFrame.

    Args:
        columns: The 12 column labels of the wine data set, in file order.
        offset: Amount added to the offset-adjusted fields of every row.

    Returns:
        A 4 x 12 float DataFrame whose values are rounded to 2 decimals.

    Raises:
        ValueError: If ``columns`` does not contain exactly 12 labels.
    """
    x = offset
    raw_rows = [
        [7.8 + x, 0.88 + x, 0 + x, 1.9, 0.09 + x, 25 + x, 67 + x, .991 + x, 3.22, 0.68 + x, 9.8 + x, 5],
        [7.2 + x, 0.83 + x, 0.01 + x, 2.2, 0.19 + x, 15 + x, 60 + x, .996 + x, 3.52, 0.55 + x, 9.6 + x, 6],
        [7.9 + x, 0.89 + x, 0.01 + x, 1.7, 0.08 + x, 22 + x, 57 + x, .997 + x, 3.26, 0.64 + x, 9.8 + x, 2],
        [7.7 + x, 0.86 + x, 0.02 + x, 2.3, 0.07 + x, 11 + x, 38 + x, .994 + x, 3.12, 0.08 + x, 9.4 + x, 3],
    ]
    return pd.DataFrame(np.round(raw_rows, 2), columns=columns)


def label_quality(scores, threshold=QUALITY_THRESHOLD):
    """Convert numeric quality scores to 'Bad' / 'Good' labels.

    Args:
        scores: Series of numeric quality scores.
        threshold: Highest score that still counts as 'Bad'.

    Returns:
        A Series with the same index and name as ``scores`` holding 'Bad'
        where ``score <= threshold`` and 'Good' where ``score > threshold``.
        Missing scores stay missing.
    """
    labels = pd.Series(
        np.where(scores > threshold, 'Good', 'Bad'),
        index=scores.index,
        name=scores.name,
        dtype=object,
    )
    # np.where sends NaN to the 'Bad' branch; blank those entries out again so
    # an unknown score is not silently turned into a label.
    return labels.where(scores.notna())


# The guard keeps the file read out of a plain import of this code; it is true
# when run as a script or inside a notebook, so df/df2 are still module-level.
if __name__ == "__main__":
    # Specifying the file path
    file_path = r'E:\Study Materials\Masters Data Science\1-1\Introduction to Python\Assisgnment\wine-quality white.csv'

    # Loading the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    print(df.head())

    # Append the new rows required by the assignment.
    df2 = pd.concat([df, build_extra_rows(df.columns, ID_OFFSET)], ignore_index=True)
    print(df2)

    # Reallocate the wine quality: 0 to 5 -> 'Bad', 6 to 10 -> 'Good'.
    # Only the 'quality' column is written, through ordinary column assignment,
    # instead of indexing the frame with a list that wraps a boolean DataFrame.
    df2['quality'] = label_quality(df2['quality'])
    print (df2)

## Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Feature matrix: every column except the label.  Target: the 'quality' label.
X = df2.drop('quality', axis=1)
y = df2['quality']

# Encode the label numerically: 'Good' -> 1, anything else -> 0.
y = y.apply(lambda label: 1 if label == 'Good' else 0)

# Hold out 20% of the rows for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features before the logistic regression.  On raw columns of
# very different magnitude the default solver can stop at its iteration cap
# before converging.  The scaler is part of the pipeline, so it is fitted on
# the training split only and merely applied to the test split.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Confusion matrix (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# ROC curve, computed from the predicted probability of class 1 ('Good').
y_pred_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Plot the ROC curve against the chance diagonal.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

# Summary metrics on the test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Specificity = TN / (TN + FP), read off the 2x2 confusion matrix.
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)

print("Accuracy:", accuracy)
print("Precision (Positive Predictive Value):", precision)
print("Recall (Sensitivity):", recall)
print("Specificity:", specificity)

## Decision Tree Model
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Predictors are all columns but 'quality'; the target is 1 where the label
# is 'Good' and 0 otherwise.
X = df2.drop(columns='quality')
y = df2['quality'].eq('Good').astype('int64')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the tree and fit it on the training rows (fit returns the estimator).
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Test-set outputs: hard labels and the probability of class 1.
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Confusion matrix of true vs. predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# ROC points and the area under the curve.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Draw the ROC curve with the chance diagonal for reference.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC)')
ax.legend(loc="lower right")
plt.show()

# Headline metrics on the test rows.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Unpack the 2x2 matrix row by row; specificity is TN / (TN + FP).
(tn, fp), (fn, tp) = conf_matrix
specificity = tn / (tn + fp)

for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision (Positive Predictive Value):", precision),
    ("Recall (Sensitivity):", recall),
    ("Specificity:", specificity),
):
    print(caption, value)

## Random Forest Model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Target first: 1 for rows labelled 'Good', 0 for every other row.
is_good = df2['quality'] == 'Good'
y = is_good.astype('int64')

# Features: the frame without its 'quality' column.
X = df2.drop(labels='quality', axis='columns')

# Reserve 20% of the rows for evaluation (seeded split).
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Create the seeded forest, then train it.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Class predictions on the evaluation rows.
y_pred = model.predict(X_test)

# Confusion matrix for those predictions.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Probability of class 1 drives the ROC curve and its AUC.
class_probabilities = model.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
roc_auc = roc_auc_score(y_test, y_pred_prob)

# ROC plot: model curve plus the dashed chance line.
curve_label = 'ROC curve (area = %0.2f)' % roc_auc
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=curve_label)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

# Specificity comes from the flattened matrix: TN / (TN + FP).
tn, fp, fn, tp = conf_matrix.flatten()
specificity = tn / (tn + fp)

# Remaining metrics come straight from scikit-learn.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision (Positive Predictive Value):", precision)
print("Recall (Sensitivity):", recall)
print("Specificity:", specificity)

## Support Vector Machine Model

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Feature matrix: every column except the label.  Target: the 'quality' label.
X = df2.drop('quality', axis=1)
y = df2['quality']

# Encode the label numerically: 'Good' -> 1, anything else -> 0.
y = y.apply(lambda label: 1 if label == 'Good' else 0)

# Hold out 20% of the rows for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVMs are not scale-invariant, so the features are standardised first.  The
# scaler is part of the pipeline and is therefore fitted on the training split
# only.  probability=True enables predict_proba for the ROC curve; the seed
# makes those probability estimates repeatable, like the other seeded models
# in this file.
model = make_pipeline(StandardScaler(), SVC(probability=True, random_state=42))
model.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Confusion matrix (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# ROC curve, computed from the predicted probability of class 1 ('Good').
y_pred_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Plot the ROC curve against the chance diagonal.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

# Summary metrics on the test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Specificity = TN / (TN + FP), read off the 2x2 confusion matrix.
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)

print("Accuracy:", accuracy)
print("Precision (Positive Predictive Value):", precision)
print("Recall (Sensitivity):", recall)
print("Specificity:", specificity)

