In [3]:
import pandas as pd
import numpy as np

# Generate synthetic data
np.random.seed(42)

# Numeric covariate with missing values
n_samples = 1000
numeric_covariate = np.random.normal(loc=10, scale=2, size=n_samples)
missing_indices = np.random.choice(range(n_samples), size=int(0.1 * n_samples), replace=False)
numeric_covariate[missing_indices] = np.nan

# Categorical variable
categories = ['A', 'B', 'C', 'D']
categorical_variable = np.random.choice(categories, size=n_samples)

# Binary target (outcome)
target = np.random.randint(0, 2, size=n_samples)

# Create a DataFrame
df = pd.DataFrame({
    'numeric_covariate': numeric_covariate,
    'categorical_variable': categorical_variable,
    'target': target
})

# Display the first few rows
print(df.head(), df.shape)


   numeric_covariate categorical_variable  target
0          10.993428                    C       0
1           9.723471                    C       1
2          11.295377                    B       1
3          13.046060                    C       1
4           9.531693                    C       1 (1000, 3)


In [5]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load your dataset (replace with your actual data)
# df = pd.read_csv('your_dataset.csv')

# Assume 'numeric_covariate' and 'categorical_variable' are column names
numeric_features = ['numeric_covariate']
categorical_features = ['categorical_variable']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['target']), df['target'], test_size=0.2, random_state=42)

# Create transformers for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))  # Impute missing values with mean
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with most frequent value
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical variables
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create pipeline with SVC and RandomForestClassifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC())  # You can replace with RandomForestClassifier()
])

# Define hyperparameters for grid search
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf']
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get best model
best_model = grid_search.best_estimator_

# Evaluate on test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Best model accuracy: {accuracy:.4f}")
print(f"Best hyperparameters: {grid_search.best_params_}")


Best model accuracy: 0.4800
Best hyperparameters: {'classifier__C': 0.1, 'classifier__kernel': 'rbf'}


In [14]:
# https://www.kaggle.com/datasets/brijlaldhankour/flood-prediction-factors
# https://www.kaggle.com/competitions/playground-series-s4e5
import sklearn
import numpy  as np
import pandas as pd
import zipfile
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import Perceptron, LinearRegression, Ridge, Lasso
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, make_scorer, mean_squared_error, r2_score, mean_absolute_percentage_error, max_error
from sklearn.pipeline import Pipeline, make_pipeline

# Assume 'numeric_covariate' and 'categorical_variable' are column names
numeric_features     = ['numeric_covariate1',    'numeric_covariate2'   ]
categorical_features = ['categorical_variable1', 'categorical_variable2']

# Define the models
models = {
    'SVM'             : SVR(cache_size = 2000),
    #'LinearRegression': LinearRegression(),
    'LASSO'           : Lasso(),
    #'Ridge'           : Ridge(),
    'RandomForest'    : RandomForestRegressor(),
    'GradientBoost'   : GradientBoostingRegressor(),
    'Dummy'           : DummyRegressor()
}

# Define the preprocessors
preprocessors = {
    'None': None,
    'pp1': ColumnTransformer(
        [
            ('numeric_cols', numeric_transformer,                    numeric_features),
            ('categor_cols', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder = 'passthrough',
        n_jobs = 10
    )
}

# Define hyperparameter grids for each model
param_grids = {

    'SVM': [
        {
            'model__C': [0.1, 1, 100],
            'model__epsilon': [0.01, 0.1, 0.2]
        }
    ],
    
    'LinearRegression': [
        {
            'model__fit_intercept': [True]
        }
    ],
    
    'LASSO': [
        {
            'model__fit_intercept': [True],
            'model__alpha': [0.1, 1, 10, 100]
        }
    ],
    
    'Ridge': [
        {
            'model__fit_intercept': [True],
            'model__alpha': [0.1, 1, 10, 100]
        }
    ],
    
    'RandomForest': [
        {
            'model__n_estimators': [10, 100, 500],
            'model__max_depth': [None, 2]
        }
    ],
    
    'GradientBoost': [
        {
            'model__n_estimators': [10, 100, 500],
            'model__learning_rate': [0.1, 0.2]
        }
    ],

    'Dummy': [
        {
            'model__strategy': ['mean', 'median']
        },

        {
            'model__strategy': ['constant'], 
            'model__constant': [0, 0.5, 1]
        },

        {
            'model__strategy': ['quantile'],
            'model__quantile': [0, 0.5, 1]
        }
    ]
}

scorer = r2_score

best_score = 0
best_model = None
for model_name, model in models.items():
    for preprocessor_name, preprocessor in preprocessors.items():

        pipeline = Pipeline(
            [
                ('preprocessor', preprocessor),
                ('model',  model)
            ]
        )
        print(
            f'preprocessor: {preprocessor_name}'
            f'\n'
            f'Model: {model_name}'
        )
        grid_search = GridSearchCV(
            estimator  = pipeline,
            param_grid = param_grids[model_name],
            cv = 10, 
            scoring = make_scorer(scorer),
            refit = True,
            n_jobs = -1,
            verbose = 4
        )

        grid_search.fit(X_train, y_train)
        
        # Evaluate the model on the train set
        y_pred = grid_search.predict(X_test)
        test_score = scorer(y_test, y_pred)

        # Print results of grid parameter search
        print(f"Best parameters for {model_name} with {preprocessor_name}: {grid_search.best_params_}")
        print(f"Train set score for {model_name} with {preprocessor_name}: {grid_search.best_score_:.4f}")
        print( f"Test set score for {model_name} with {preprocessor_name}: {test_score:.4f}\n")
        
        # Update best model if necessary
        if test_score > best_score:
            best_score = test_score
            best_model = model_name
            best_estimator = grid_search.best_estimator_

print(f"Best performing model: {best_model}")
print(f"Best Model's Score on the test set: {best_score:.4f}")
print(f"Scorer was: {scorer.__name__}")

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 123)