In [3]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load dataset
# Shared settings for this cell: a single seed drives the train/test split, the
# solver and the hyper-parameter sampling, so a fresh Restart & Run All
# reproduces the same numbers.
RANDOM_STATE = 42
CV_FOLDS = 5

data = pd.read_csv('../data/toy_data.csv')

# Split the data
# Every column except 'target' is used as a feature; 20% of rows are held out.
# NOTE(review): class_weight='balanced' below suggests the classes may be
# imbalanced -- if so, consider passing stratify=y here. Not changed, because it
# would alter the split and therefore the reported scores.
X = data.drop('target', axis=1)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Logistic Regression Model
# liblinear handles both the 'l1' and 'l2' penalties searched below. It shuffles
# the data internally, so random_state is pinned to keep the fits repeatable
# (previously only the split and the search were seeded).
model = LogisticRegression(
    class_weight='balanced',
    solver='liblinear',
    random_state=RANDOM_STATE,
)

# Randomized Parameter Search
param_dist = {
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse regularization strength (smaller = stronger)
    'penalty': ['l1', 'l2']         # Regularization type
}

# Perform random search and cross-validation
# n_iter=5 samples 5 of the 10 (C, penalty) combinations; each candidate is
# scored with CV_FOLDS-fold cross-validation on the training split only.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=5,
    cv=CV_FOLDS,
    random_state=RANDOM_STATE,
)
random_search.fit(X_train, y_train)

# Best Model
# With the default refit=True this estimator has already been refit on all of
# X_train using the winning hyper-parameters.
best_model = random_search.best_estimator_

# Cross-validation
# Re-scores the selected hyper-parameters on the training split. The same data
# was used to choose them, so treat this mean as an optimistic estimate; the
# held-out test accuracy below is the unbiased figure.
cv_scores = cross_val_score(best_model, X_train, y_train, cv=CV_FOLDS)

# Predictions and evaluation
predictions = best_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Best Logistic Regression Accuracy: {accuracy}")
print(f"Cross-validation Accuracy scores: {cv_scores.mean()}")


Best Logistic Regression Accuracy: 1.0
Cross-validation Accuracy scores: 0.9875
