In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization

# Step 1: Load the data
data = pd.read_csv('train.csv')

# Drop the ID column
data = data.drop('ID', axis=1)

# Step 2: Split the data into features and target variable
X = data.drop('category', axis=1)
y = data['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Step 3: Detect and remove outliers using LOF
from sklearn.neighbors import LocalOutlierFactor
lof = LocalOutlierFactor(n_neighbors=9)
yhat = lof.fit_predict(X_train)
mask = yhat != -1
X_train = X_train[mask]
y_train = y_train[mask]

# Step 4a: Preprocess the data
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 5: Choose a machine learning algorithm
def fit_logreg(C):
    clf = LogisticRegression(C=C, max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Step 6: Define the parameter bounds for Bayesian optimization
pbounds = {'C': (0.000001, 100000)}

# Step 7: Run Bayesian optimization to find the best hyperparameters
optimizer = BayesianOptimization(
    f=fit_logreg,
    pbounds=pbounds,
    random_state=42
)
optimizer.maximize(init_points=5, n_iter=10)

print("Best hyperparameters:", optimizer.max)

clf = LogisticRegression(C=optimizer.max['params']['C'], max_iter=1000)
clf.fit(X_train, y_train)

# Step 8: Test the model's performance on the test set
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Confusion matrix:")
# print(confusion_matrix(y_test, y_pred))

# Step 9: Save the predictions to a file using the trained models
clf2 = LogisticRegression(C=optimizer.max['params']['C'], max_iter=1000)
clf2.fit(X, y)
test_df = pd.read_csv('test.csv')
# drop the ID column
test_df = test_df.drop('ID', axis=1)
X_test = scaler.transform(test_df)
y_pred = clf.predict(X_test)
test_df['category'] = y_pred
file = open('bayesian.csv', 'w')
# Writing the header
file.write('ID,Category')
for i in range(len(y_pred)):
    file.write('\n')
    file.write(str(i))
    file.write(',')
    file.write(y_pred[i])
file.close()

|   iter    |  target   |     C     |
-------------------------------------
| [0m1        [0m | [0m0.8074   [0m | [0m3.745e+04[0m |
| [0m2        [0m | [0m0.8033   [0m | [0m9.507e+04[0m |
| [0m3        [0m | [0m0.8033   [0m | [0m7.32e+04 [0m |
| [0m4        [0m | [0m0.8033   [0m | [0m5.987e+04[0m |
| [0m5        [0m | [0m0.8033   [0m | [0m1.56e+04 [0m |
| [0m6        [0m | [0m0.8074   [0m | [0m3.745e+04[0m |
| [0m7        [0m | [0m0.8074   [0m | [0m4.461e+04[0m |
| [0m8        [0m | [0m0.8033   [0m | [0m4.812    [0m |
| [0m9        [0m | [0m0.8074   [0m | [0m3.051e+04[0m |
| [0m10       [0m | [0m0.8074   [0m | [0m4.14e+04 [0m |
| [0m11       [0m | [0m0.8074   [0m | [0m3.347e+04[0m |
| [0m12       [0m | [0m0.8074   [0m | [0m4.804e+04[0m |
| [0m13       [0m | [0m0.8074   [0m | [0m2.677e+04[0m |
| [0m14       [0m | [0m0.8074   [0m | [0m2.836e+04[0m |
| [0m15       [0m | [0m0.8074   [0m | [0m4.656e+04