In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc

In [None]:
# Read the classification dataset from a CSV file (path is relative to the
# notebook's working directory).
dataset_path = 'CA1-Classification-Dataset.csv'
data = pd.read_csv(dataset_path)

IMPUTING AND CLEANING THE DATA


In [None]:
# Clean the raw frame: report missing values, then mean-impute the feature
# columns. The target column ('Quality') is deliberately NOT imputed: a mean
# of class labels is not a valid class, so rows without a label are dropped.

# Per-column count of missing values. Printed explicitly because a bare
# expression in the middle of a cell is evaluated but never displayed.
print(data.isnull().sum())

# Keep only labelled rows; reset the index so `df` has a clean RangeIndex.
labelled = data.dropna(subset=['Quality']).reset_index(drop=True)
feature_cols = [col for col in labelled.columns if col != 'Quality']

# Replace each missing feature value with that column's mean.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test-set rows contribute to the imputed means. Fit it on the training
# rows only if a strict evaluation is required.
imputer = SimpleImputer(strategy='mean')
imputed_features = imputer.fit_transform(labelled[feature_cols])

# Rebuild a DataFrame with the same columns, in the same order, as `data`.
df = pd.DataFrame(imputed_features, columns=feature_cols)
df['Quality'] = labelled['Quality']
df = df[list(labelled.columns)]

# Show a preview rather than the whole frame, then summary statistics
# (displayed as the cell's output because it is the last expression).
print(df.head())
df.describe()

In [None]:
# Per-column count of missing values left in the cleaned frame.
df.isna().sum()

LOGISTIC REGRESSION MODEL

In [None]:
from sklearn.linear_model import LogisticRegression

# Drop rows without a Trihalomethanes reading (less than 5% of the data).
# This has to happen BEFORE X and y are extracted; dropping afterwards leaves
# X and y untouched.
# NOTE(review): `df` was already mean-imputed in the cleaning cell, so this
# currently removes nothing. Decide whether Trihalomethanes should be dropped
# or imputed and keep only one of the two steps.
df = df.dropna(subset=['Trihalomethanes'])

# Features are every column except the target; the target is 'Quality'.
X = df.drop('Quality', axis=1)
y = df['Quality']

# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# therefore every reported metric, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features. The scaler is fitted on the training rows only so
# that test-set statistics do not leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model (default hyperparameters).
LRmodel = LogisticRegression()
LRmodel.fit(X_train, y_train)



In [None]:
# Predict class labels for the held-out test features with the fitted model.
pred_LR = LRmodel.predict(X=X_test)

In [None]:



# Evaluate the fitted logistic regression on the held-out test set.
# Passing labels=LRmodel.classes_ keeps the confusion-matrix rows and columns
# in the model's class order even if a class is absent from the test split,
# so the matrix always lines up with the tick labels used below.
class_labels_LR = LRmodel.classes_
accuracy_LR = accuracy_score(y_test, pred_LR)
conf_matrix_LR = confusion_matrix(y_test, pred_LR, labels=class_labels_LR)
classification_rep_LR = classification_report(y_test, pred_LR)

print(f'Logistic Regression Accuracy: {accuracy_LR}')
print(f'Logistic Regression Classification Report:\n{classification_rep_LR}')

# Confusion matrix heatmap, with the actual class values on both axes.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_LR, annot=True, fmt="d", cmap="Blues", linewidths=.5,
            xticklabels=class_labels_LR, yticklabels=class_labels_LR)
plt.title("Confusion Matrix - Logistic Regression")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Feature importance from the model coefficients. The features were
# standardised before fitting, so magnitudes are comparable across features.
# coef_ has a single row for a binary target and one row per class otherwise.
# Plotting only row 0 of a multiclass model would show just the first class,
# so in that case the mean absolute coefficient across classes is used.
coef_matrix = LRmodel.coef_
if coef_matrix.shape[0] == 1:
    coefficients = coef_matrix[0]
    coefficient_axis_label = 'Coefficient'
else:
    coefficients = abs(coef_matrix).mean(axis=0)
    coefficient_axis_label = 'Mean |coefficient| across classes'
feature_importance = dict(zip(X.columns, coefficients))

# Plotting feature importance
plt.figure(figsize=(10, 6))
plt.bar(X.columns, coefficients, color='green')
plt.title('Feature Importance for Logistic Regression')
plt.xlabel('Feature')
plt.ylabel(coefficient_axis_label)
plt.xticks(rotation=45, ha='right')
plt.show()



DECISION TREE MODEL

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (depth capped at 4). The seed fixes tie-breaking
# between equally good splits so the fitted tree is reproducible.
tree_classifier = DecisionTreeClassifier(max_depth=4, random_state=42)

# Re-split the data 80/20 with a fixed seed so results are reproducible.
# NOTE: this rebinds X_train / X_test / y_train / y_test. The dummy and KNN
# cells further down train and evaluate on this split, so run the cells in
# order.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise using training-set statistics only. Trees do not need scaling,
# but the scaled arrays are reused by the KNN cell below, which does.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model
tree_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred_tree = tree_classifier.predict(X_test)

In [None]:

# Score the decision tree's test-set predictions: per-class report,
# confusion matrix and overall accuracy.
classification_rep_tree = classification_report(y_true=y_test, y_pred=y_pred_tree)
conf_matrix_tree = confusion_matrix(y_true=y_test, y_pred=y_pred_tree)
accuracy_tree = accuracy_score(y_true=y_test, y_pred=y_pred_tree)

In [None]:
# Report the decision tree metrics, one item per line.
print(
    f'Decision Tree Accuracy: {accuracy_tree}',
    f'Decision Tree Confusion Matrix:\n{conf_matrix_tree}',
    f'Decision Tree Classification Report:\n{classification_rep_tree}',
    sep='\n',
)

In [None]:
# Confusion matrix for the decision tree, labelled with the classes the model
# was actually trained on instead of hard-coded 'Class 0/1/2' names. Passing
# the same labels to confusion_matrix guarantees the matrix shape matches the
# tick labels even if a class is missing from the test split.
class_labels_tree = tree_classifier.classes_
conf_matrix = confusion_matrix(y_test, y_pred_tree, labels=class_labels_tree)

# Display the confusion matrix using seaborn heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels_tree,
            yticklabels=class_labels_tree)
plt.title('Decision Tree Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Feature importance for the decision tree, as exposed by the fitted
# estimator's feature_importances_ attribute (one value per column of X).
feature_importance = tree_classifier.feature_importances_
print("Feature Importance (Decision Tree):")
print(dict(zip(X.columns, feature_importance)))

# Feature Importance Bar Chart
plt.figure(figsize=(10, 6))
plt.barh(X.columns, feature_importance)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Decision Tree Feature Importance')
plt.show()



DUMMY MODEL

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline model: always predicts the most frequent class seen in training.
# fit() returns the estimator itself, so construction and fitting are chained.
dummy_classifier = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)

# Score the baseline on the test set to give the real models a reference point.
pred_dummy = dummy_classifier.predict(X_test)
accuracy_dummy = accuracy_score(y_true=y_test, y_pred=pred_dummy)

print(f'Dummy Classifier Accuracy: {accuracy_dummy}')

KNN MODEL


In [None]:
# GridSearchCV, accuracy_score and matplotlib are already imported in the
# first cell; only the KNN estimator is new here.
from sklearn.neighbors import KNeighborsClassifier

# Base KNN estimator with default settings. The number of neighbours is tuned
# by the grid search in the next cell, which fits on X_train / y_train.
knn_classifier = KNeighborsClassifier()

In [None]:
# Candidate neighbourhood sizes to try: k = 3, 4, ..., 14.
param_grid = {'n_neighbors': range(3, 15)}

# 5-fold cross-validated search over k on the training data, scored by accuracy.
grid_search = GridSearchCV(
    estimator=knn_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)


In [None]:
# k selected by the cross-validated grid search.
best_k_value = grid_search.best_params_['n_neighbors']

# Fresh KNN model using that k, trained on the full training set
# (fit() returns the estimator, so it is chained onto the constructor).
best_knn_classifier = KNeighborsClassifier(n_neighbors=best_k_value).fit(X_train, y_train)

# Test-set predictions and the metrics derived from them.
pred_knn = best_knn_classifier.predict(X_test)
accuracy_knn = accuracy_score(y_true=y_test, y_pred=pred_knn)
cm_knn = confusion_matrix(y_true=y_test, y_pred=pred_knn)
class_report_knn = classification_report(y_true=y_test, y_pred=pred_knn)

In [None]:
# Headline numbers for the tuned KNN model.
print("Best K value:", best_k_value)
print("Accuracy with best K:", accuracy_knn)

# Per-class precision / recall / F1.
print("KNN Classification Report:")
print(class_report_knn)

# KNN confusion matrix as a heatmap, with the model's class values on both axes.
knn_class_labels = best_knn_classifier.classes_
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_knn,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=knn_class_labels,
    yticklabels=knn_class_labels,
    ax=ax_cm,
)
ax_cm.set_title('KNN Confusion Matrix')
ax_cm.set_xlabel('Predicted Label')
ax_cm.set_ylabel('True Label')
plt.show()

# Side-by-side test accuracy of every model, including the dummy baseline.
models = ['Logistic Regression', 'Decision Tree', 'K-Nearest Neighbors', 'Dummy Classifier']
accuracies = [accuracy_LR, accuracy_tree, accuracy_knn, accuracy_dummy]

fig_cmp, ax_cmp = plt.subplots(figsize=(12, 6))
ax_cmp.bar(models, accuracies, color=['blue', 'green', 'orange', 'gray'])
ax_cmp.set_title('Comparison of Classification Model Accuracies')
ax_cmp.set_ylabel('Accuracy')
ax_cmp.set_ylim(0, 1)

# Write each accuracy value just above its bar.
for bar_index, bar_height in enumerate(accuracies):
    ax_cmp.text(bar_index, bar_height + 0.02, f'{bar_height:.5f}', ha='center', va='bottom')

plt.show()

