<a href="https://colab.research.google.com/github/ko-source/BitCoin/blob/main/Assignment_03(Q2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# 24 November 2023
# CSC461 – Assignment3 – Machine Learning
# Komal Khizar
# FA20-BSE-096
# This task involves applying Logistic Regression, Support Vector Machines,
# and Multilayer Perceptron algorithms to a gender prediction dataset, first with a 2/3 train and 1/3 test split,
# then with an 80/20 split. The objectives are to assess classification accuracy,
# identify the two most influential attributes, exclude them, and observe the impact on model performance.
# The analysis aims to understand the effect of training/test split ratios and feature importance on classification outcomes.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load the gender-prediction table from the CSV file
data = pd.read_csv('gender-prediction.csv')

# Integer-encode each categorical column with its own LabelEncoder; the
# fitted encoders are kept in a dict keyed by column name
categorical_columns = ['beard', 'hair_length', 'scarf', 'eye_color']
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    data[name] = encoder.fit_transform(data[name])

# The 'gender' column is the target (y); every remaining column is a feature (X)
y = data['gender']
X = data.drop(columns='gender')

# Seed shared by the train/test splits and the MLP so that re-running the
# cell reproduces the same error counts.
RANDOM_STATE = 42


def split_and_scale(features, target, test_size, random_state=RANDOM_STATE):
    """Split the data, then standardize it using training-set statistics only.

    The scaler is fitted after the split and on the training rows alone, so
    the mean/variance of the held-out rows never influence what the models
    are trained on (fitting it on the full dataset would leak test-set
    information into training).

    Args:
        features: Feature table (one row per sample).
        target: Labels aligned with ``features``.
        test_size: Fraction of rows held out for testing.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where both feature arrays have
        been transformed by a ``StandardScaler`` fitted on the training rows.
    """
    raw_train, raw_test, target_train, target_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    train_scaler = StandardScaler().fit(raw_train)
    return (
        train_scaler.transform(raw_train),
        train_scaler.transform(raw_test),
        target_train,
        target_test,
    )


def fit_and_count_errors(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and count its misclassifications on the evaluation set.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_fit, y_fit: Training features and labels.
        X_eval, y_eval: Held-out features and labels.

    Returns:
        ``(predictions, n_incorrect)``: the predicted labels for ``X_eval``
        and how many of them differ from ``y_eval``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    return predictions, (y_eval != predictions).sum()


# Initializing the models with increased max_iter for Logistic Regression and
# MLP; the MLP is also seeded because its weight initialization is otherwise
# random and the error counts would change from run to run.
logistic_regression_model = LogisticRegression(max_iter=1000)
svm_model = SVC()
mlp_model = MLPClassifier(max_iter=1000, random_state=RANDOM_STATE)

# Experiment 1: training (2/3) and testing (1/3) sets
X_train, X_test, y_train, y_test = split_and_scale(X, y, test_size=1/3)

lr_predictions, incorrect_lr = fit_and_count_errors(
    logistic_regression_model, X_train, y_train, X_test, y_test
)
svm_predictions, incorrect_svm = fit_and_count_errors(
    svm_model, X_train, y_train, X_test, y_test
)
mlp_predictions, incorrect_mlp = fit_and_count_errors(
    mlp_model, X_train, y_train, X_test, y_test
)

print("Initial Experiment (2/3 Train, 1/3 Test Split)")
print("Incorrect Logistic Regression:", incorrect_lr)
print("Incorrect SVM:", incorrect_svm)
print("Incorrect MLP:", incorrect_mlp)

# Experiment 2: rerun with an 80/20 train/test split. The same estimator
# objects are fitted again, so after this point they hold the 80/20 fit.
X_train_80, X_test_80, y_train_80, y_test_80 = split_and_scale(X, y, test_size=0.20)

lr_predictions_80, incorrect_lr_80 = fit_and_count_errors(
    logistic_regression_model, X_train_80, y_train_80, X_test_80, y_test_80
)
svm_predictions_80, incorrect_svm_80 = fit_and_count_errors(
    svm_model, X_train_80, y_train_80, X_test_80, y_test_80
)
mlp_predictions_80, incorrect_mlp_80 = fit_and_count_errors(
    mlp_model, X_train_80, y_train_80, X_test_80, y_test_80
)

print("\nRerun Experiment (80% Train, 20% Test Split)")
print("Incorrect Logistic Regression:", incorrect_lr_80)
print("Incorrect SVM:", incorrect_svm_80)
print("Incorrect MLP:", incorrect_mlp_80)


Initial Experiment (2/3 Train, 1/3 Test Split)
Incorrect Logistic Regression: 3
Incorrect SVM: 2
Incorrect MLP: 2

Rerun Experiment (80/20 Train, 20/80 Test Split)
Incorrect Logistic Regression: 1
Incorrect SVM: 1
Incorrect MLP: 0
