- [1. Ridge](#1)
- [2.](#2)

## 1. Ridge <a id='1'></a>

In [1]:
import json
import os

import pandas as pd
import numpy as np
from joblib import dump
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from scipy.stats import uniform, randint

In [5]:
# Location of the tuning CSVs; `{}` is replaced by the part number (1-4).
# Alternative checkout location used by another contributor:
# path_template = '/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/tuning_data/complete_{}.csv'
path_template = '/Users/liqingyang/Documents/GitHub/sports_trading/sports_betting/data/tuning_data/complete_{}.csv'

# Read complete_1.csv ... complete_4.csv, keeping the individual frames
# available under their original names.
df1, df2, df3, df4 = [
    pd.read_csv(path_template.format(part)) for part in (1, 2, 3, 4)
]

# Stack the four parts row-wise, in part order, and renumber the index 0..n-1.
complete_cleaning = pd.concat([df1, df2, df3, df4], ignore_index=True)

In [None]:
# ---------------------------------------------------------------------------
# Tune a forward-selection + RidgeClassifier pipeline with a randomized search
# and persist the results (best params, fitted model, full CV table).
# ---------------------------------------------------------------------------

# Every column except the label is offered to the selector as a candidate feature.
features = [col for col in complete_cleaning.columns if col != 'target_x']
X = complete_cleaning[features]
y = complete_cleaning['target_x']

# Define the model
rr = RidgeClassifier()

# Create a pipeline with Sequential Feature Selector.
# NOTE: the `classifier__*` parameters sampled below only reach the final
# 'classifier' step; the estimator used inside the selector keeps its defaults.
pipeline = Pipeline([
    ('feature_selector', SequentialFeatureSelector(rr, direction='forward')),
    ('classifier', rr)
])

# Define the parameter distribution to sample from
param_distributions = {
    # scipy's randint(low, high) excludes `high`: integers 20..79.
    # NOTE(review): assumes X has more than 79 columns - confirm.
    'feature_selector__n_features_to_select': randint(20, 80),
    # scipy's uniform(loc, scale) samples [loc, loc + scale] = [0.1, 50.1].
    'classifier__alpha': uniform(0.1, 50),
    'classifier__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']  # Different solvers
}

# Initialize TimeSeriesSplit for cross-validation (each fold trains on earlier
# rows and validates on later ones).
# NOTE(review): presumes the concatenated rows are in chronological order - confirm.
tscv = TimeSeriesSplit(n_splits=2)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=2,  # Number of parameter settings sampled
    cv=tscv,  # Cross-validation strategy
    verbose=1,
    n_jobs=-1,
    random_state=42  # fixed seed so the sampled candidates are reproducible
)

# Perform the randomized search on the data
random_search.fit(X, y)

# Get the best combination of hyperparameters and the corresponding best model
best_hyperparams = random_search.best_params_
best_model = random_search.best_estimator_

# Display the best hyperparameters
print("Best hyperparameters:")
print(best_hyperparams)

# Output directory for all tuning artefacts. The original cell assigned this
# path to `saved_file` but then read `saved_dir`, which raised NameError.
saved_dir = '/Users/liqingyang/Documents/GitHub/sports_trading/sports_betting/data/tuning_results/'
saved_file = saved_dir  # legacy name, kept in case other cells still reference it
os.makedirs(saved_dir, exist_ok=True)  # make sure the directory exists before writing

# 1. Save Best Hyperparameters
best_hyperparams_path = os.path.join(saved_dir, 'best_hyperparameters.json')
with open(best_hyperparams_path, 'w') as f:
    json.dump(best_hyperparams, f)

# 2. Save Fitted Model
best_model_path = os.path.join(saved_dir, 'best_model.joblib')
dump(best_model, best_model_path)

# 3. Save CV Results
cv_results_path = os.path.join(saved_dir, 'cv_results.csv')
cv_results = pd.DataFrame(random_search.cv_results_)
cv_results.to_csv(cv_results_path, index=False)

print("Saved best hyperparameters, best model, and CV results.")

Fitting 2 folds for each of 2 candidates, totalling 4 fits


In [None]:
# Requires the fitted `random_search` and the `features` list (the column
# names, in order, that were passed to `fit`).

# Pull the fitted feature-selection step out of the winning pipeline.
best_model = random_search.best_estimator_
feature_selector = best_model.named_steps['feature_selector']

# Boolean mask with one entry per candidate column: True where the selector
# kept the column.
support_mask = feature_selector.get_support()

# Pair each column name with its mask entry and keep only the retained ones.
selected_features = [name for name, keep in zip(features, support_mask) if keep]

# Write the retained names, one per line, next to the other tuning artefacts.
selected_features_path = os.path.join(saved_dir, 'selected_features.txt')
with open(selected_features_path, 'w') as f:
    f.writelines(f"{name}\n" for name in selected_features)

print("Saved selected features.")


In [None]:
# Mean cross-validated score of the best candidate found by the search.
# The search above is built without a `scoring` argument, so this is the
# pipeline's default score (mean accuracy for a classifier).
best_accuracy = random_search.best_score_
print(f"Best Accuracy: {best_accuracy:.4f}")

In [None]:
# Get the feature selector from the pipeline of the best model
feature_selector = random_search.best_estimator_.named_steps['feature_selector']

# Apply the mask to the feature names to get the selected features
selected_features = [feature for feature, is_selected in zip(features, feature_selector.get_support()) if is_selected]

# Save the selected features to a file
with open('selected_features.txt', 'w') as f:
    for feature in selected_features:
        f.write("%s\n" % feature)
