In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline

def load_data(file_path):
    try:
        # Read Excel file into a DataFrame
        data = pd.read_excel(file_path)
        return data
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

def preprocess_data(data):
    # Drop unnecessary columns if they exist
    cols_to_drop = ['Unnamed: 0', 'sample_title', 'organism', 'BioSample', 
                    'BioProject', 'AssemblyAccession', 'BioSampleAccn', 
                    'SubmitterOrganization', 'geo_loc_name']
    data = data.drop(columns=[col for col in cols_to_drop if col in data.columns], errors='ignore')
    
    # Convert date columns to numerical (UNIX timestamp)
    if 'collection_date' in data.columns:
        # Specify date format if known
        data['collection_date'] = pd.to_datetime(data['collection_date'], format='%Y-%m-%d', errors='coerce')
        data['collection_date'] = data['collection_date'].astype('int64') // 10**9  # Convert to seconds
    
    # Drop rows where target 'BRD_Total' is missing
    data = data.dropna(subset=['BRD_Total'])
    
    # Separate features and target variable
    X = data.drop(columns=['BRD_Total'])
    y = data['BRD_Total']
    
    # Identify numerical and categorical columns
    num_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    
    # Fill missing numerical values with mean
    num_imputer = SimpleImputer(strategy='mean')
    imputed_data = num_imputer.fit_transform(X[num_cols])
    
    # Ensure imputed data matches the original number of columns
    imputed_df = pd.DataFrame(imputed_data, columns=num_cols, index=X.index[:len(imputed_data)])
    
    # Replace numerical columns in X with imputed values
    for col in num_cols:
        X.loc[:, col] = imputed_df[col]
    
    # Encode categorical variables
    for col in cat_cols:
        X[col] = LabelEncoder().fit_transform(X[col].astype(str))
    
    # Ensure all data is numerical before splitting
    X = X.apply(pd.to_numeric, errors='coerce')
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Standardize features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    return X_train, X_test, y_train, y_test

def train_lasso_model(X_train, y_train):
    try:
        # Define pipeline with Lasso model
        pipeline = Pipeline([
            ('lasso', Lasso())
        ])
        
        # Define hyperparameter tuning space
        param_grid = {
            'lasso__alpha': np.logspace(-4, 0, 5)
        }
        
        # Perform grid search for hyperparameter tuning
        grid_search = GridSearchCV(pipeline, param_grid, cv=5)
        grid_search.fit(X_train, y_train)
        
        return grid_search.best_estimator_
    except Exception as e:
        print(f"Error training model: {e}")
        return None

def evaluate_model(model, X_test, y_test):
    try:
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        # Calculate "accuracy" as the percentage of variance explained by the model
        r2_percentage = r2 * 100
        
        # Create a DataFrame to display the metrics
        metrics = {
            'Metric': ['Mean Squared Error (MSE)', 'Mean Absolute Error (MAE)', 'R-squared', 'Model Accuracy (R² in %)'],
            'Value': [mse, mae, r2, r2_percentage]
        }
        metrics_df = pd.DataFrame(metrics)
        
        return metrics_df
    except Exception as e:
        print(f"Error evaluating model: {e}")
        return None

def main(file_path):
    # Load the data
    data = load_data(file_path)
    
    if data is not None:
        # Preprocess the data
        X_train, X_test, y_train, y_test = preprocess_data(data)
        
        # Train the Lasso regression model
        lasso = train_lasso_model(X_train, y_train)
        
        if lasso is not None:
            # Evaluate the model
            metrics_df = evaluate_model(lasso, X_test, y_test)
            
            if metrics_df is not None:
                # Print the evaluation results in tabular format
                print(metrics_df)

# Example usage (replace with actual path to your Excel file)
file_path = 'fulldataset.xlsx'  # Replace with the actual file path
main(file_path)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


                      Metric      Value
0   Mean Squared Error (MSE)   0.145557
1  Mean Absolute Error (MAE)   0.258164
2                  R-squared   0.976454
3   Model Accuracy (R² in %)  97.645441
