# Q1-Classification-60%

In [1]:
# Libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

#Import some libraries to make displaying easier
import warnings
warnings.filterwarnings('ignore')

pd.get_option("display.max_columns")

20

## a. (10 pts) Load the data & replace the quality score form 0-6 to low (e.g. 0), and 7-10 to high (e.g. 1). For this question you can use Python, Excel, or Notepad etc.

In [2]:
#Read the dataset
data=pd.read_csv('wine-quality.csv',sep=";")
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
#See some information regarding the dataset
#data.describe()
#data.info()

In [4]:
# Check for missing values
print(data.isnull().sum())

# Drop rows with missing values
data = data.dropna()

# Check for duplicates
print(data.duplicated().sum())

# Drop duplicates
data = data.drop_duplicates()

# Check the data types of each column
print(data.dtypes)

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
240
fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object


In [5]:
# Replace the quality score based on what asked
data["quality"] = pd.cut(data["quality"], bins=[0, 6.5, 10], labels=[0, 1])

## b. (10 pts) Rank the input features based on their importance & print them

In [6]:
# Drop rows with missing values
data = data.dropna()

# Drop duplicates
data = data.drop_duplicates()

# Split the data into input and target
X = data.drop("quality", axis=1)
y = data["quality"]

# Compute mutual information between each feature and the target
mutual_info = mutual_info_classif(X, y)

# Create a DataFrame of mutual information scores
mutual_info_df = pd.DataFrame(
    {"feature": X.columns, "mutual_info": mutual_info}
)

# Sort the features by mutual information in descending order
mutual_info_df = mutual_info_df.sort_values(by="mutual_info", ascending=False)

# Print the features and their mutual information scores
print(mutual_info_df)


                 feature  mutual_info
10               alcohol     0.080368
9              sulphates     0.055566
1       volatile acidity     0.050401
2            citric acid     0.044398
7                density     0.031699
4              chlorides     0.029816
0          fixed acidity     0.029811
6   total sulfur dioxide     0.026974
5    free sulfur dioxide     0.011165
3         residual sugar     0.005923
8                     pH     0.005483


## c. (20 pts) Train a Decision Tree and a Neural Network (with 50% of data). Experiment with the
following parameters for the architecture and training of the Neural Network: the number of nodes in the hidden layers (it’s up to you), and the number of hidden layers (up to 2). For the Decision tree, experiment with the with the max_depth specifying the maximum depth of the tree (try two different values).

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Train a Decision Tree with max_depth=3 and max_depth=5
dtc1 = DecisionTreeClassifier(max_depth=3, random_state=42)
dtc2 = DecisionTreeClassifier(max_depth=5, random_state=42)
dtc1.fit(X_train, y_train)
dtc2.fit(X_train, y_train)

# Train a Neural Network with different architectures and hyperparameters
nn1 = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
nn2 = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
nn3 = MLPClassifier(hidden_layer_sizes=(200, 100, 50), max_iter=500, random_state=42)
nn1.fit(X_train, y_train)
nn2.fit(X_train, y_train)
nn3.fit(X_train, y_train)

# Evaluate the models on the test set
dtc1_score = accuracy_score(y_test, dtc1.predict(X_test))
dtc2_score = accuracy_score(y_test, dtc2.predict(X_test))
nn1_score = accuracy_score(y_test, nn1.predict(X_test))
nn2_score = accuracy_score(y_test, nn2.predict(X_test))
nn3_score = accuracy_score(y_test, nn3.predict(X_test))

# Print the accuracy scores for the models
print("Decision Tree with max_depth=3 accuracy score:", dtc1_score)
print("Decision Tree with max_depth=5 accuracy score:", dtc2_score)
print("Neural Network with one hidden layer accuracy score:", nn1_score)
print("Neural Network with two hidden layers accuracy score:", nn2_score)
print("Neural Network with three hidden layers accuracy score:", nn3_score)

Decision Tree with max_depth=3 accuracy score: 0.8852941176470588
Decision Tree with max_depth=5 accuracy score: 0.861764705882353
Neural Network with one hidden layer accuracy score: 0.8779411764705882
Neural Network with two hidden layers accuracy score: 0.8808823529411764
Neural Network with three hidden layers accuracy score: 0.8647058823529412


## d. (10 pts) Obtain macro-Precision, macro-Recall and F1, for best set of parameters for each of the models that you experiment with, that give the best F1 score.

In [8]:
# Evaluate the models on the test set
dtc1_y_pred = dtc1.predict(X_test)
dtc2_y_pred = dtc2.predict(X_test)
nn1_y_pred = nn1.predict(X_test)
nn2_y_pred = nn2.predict(X_test)
nn3_y_pred = nn3.predict(X_test)

# Compute the macro-precision, macro-recall, and F1 score for the models
dtc1_precision = precision_score(y_test, dtc1_y_pred, average='macro')
dtc1_recall = recall_score(y_test, dtc1_y_pred, average='macro')
dtc1_f1 = f1_score(y_test, dtc1_y_pred, average='macro')

#Measures of performance: Precision, Recall, F1
print ('DT1: Macro Precision, recall, f1-score')
print('[{},{},{}]'.format(dtc1_precision,dtc1_recall,dtc1_f1))
print ('\n')

# Compute the macro-precision, macro-recall, and F1 score for the models
dtc2_precision = precision_score(y_test, dtc2_y_pred, average='macro')
dtc2_recall = recall_score(y_test, dtc2_y_pred, average='macro')
dtc2_f1 = f1_score(y_test, dtc2_y_pred, average='macro')

#Measures of performance: Precision, Recall, F1
print ('DT2: Macro Precision, recall, f1-score')
print('[{},{},{}]'.format(dtc2_precision,dtc2_recall,dtc2_f1))
print ('\n')

# Compute the macro-precision, macro-recall, and F1 score for the models
nn1_precision = precision_score(y_test, nn1_y_pred, average='macro')
nn1_recall = recall_score(y_test, nn1_y_pred, average='macro')
nn1_f1 = f1_score(y_test, nn1_y_pred, average='macro')

#Measures of performance: Precision, Recall, F1
print ('NN1: Macro Precision, recall, f1-score')
print('[{},{},{}]'.format(nn1_precision,nn1_recall,nn1_f1))
print ('\n')

# Compute the macro-precision, macro-recall, and F1 score for the models
nn2_precision = precision_score(y_test, nn2_y_pred, average='macro')
nn2_recall = recall_score(y_test, nn2_y_pred, average='macro')
nn2_f1 = f1_score(y_test,nn2_y_pred,average='macro')

#Measures of performance: Precision, Recall, F1
print ('NN2: Macro Precision, recall, f1-score')
print('[{},{},{}]'.format(nn2_precision,nn2_recall,nn2_f1))

DT1: Macro Precision, recall, f1-score
[0.7611172733689624,0.6476929219186677,0.6813725490196079]


DT2: Macro Precision, recall, f1-score
[0.6972145002705274,0.7009638966520276,0.6990583804143127]


NN1: Macro Precision, recall, f1-score
[0.7453846153846153,0.5909713112416586,0.6178197306319788]


NN2: Macro Precision, recall, f1-score
[0.7382305194805194,0.6785585277286641,0.7017398508699255]


----------