In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error, r2_score

In [39]:
df = pd.read_csv("final_grep_data.csv")

In [40]:
df2 = pd.DataFrame()
df2["matcher"] = df["matcher"]
df2["regex_length"] = df["regex_length"]
df2["regex_complexity"] = df["regex_complexity"]
df2["input_file_size (bytes)"] = df["input_file_size (bytes)"]
df2["execution_time (ms)"] = df["execution_time (ms)"]
df = df2.copy()

df

Unnamed: 0,matcher,regex_length,regex_complexity,input_file_size (bytes),execution_time (ms)
0,-E,12,7.2,873045,23.861249
1,-E,12,7.2,271635,16.129971
2,-E,12,7.2,566610,15.757958
3,-E,12,7.2,1786200,39.776882
4,-E,12,7.2,62860,8.767207
...,...,...,...,...,...
17575,-x,63,46.3,2836320,6.530364
17576,-x,63,46.3,1746090,5.733649
17577,-x,63,46.3,403054,3.797213
17578,-x,63,46.3,166280,3.627380


In [44]:


# Load and preprocess data
def preprocess_data(df):
    # Create copy to avoid modifications to original data
    df = df.copy()
    
    # Define feature columns with exact names
    categorical_features = ['matcher']
    numeric_features = ['regex_length', 'regex_complexity', 'input_file_size (bytes)']
    target = 'execution_time (ms)'
    
    # Create preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numeric_features),
            ('cat', OneHotEncoder(drop='first', sparse=False), categorical_features)
        ])
    
    # Create feature matrix X and target variable y
    X = df[categorical_features + numeric_features]
    y = df[target]
    
    # Fit and transform the data
    X_transformed = preprocessor.fit_transform(X)
    
    # Get feature names after transformation
    onehot_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
    feature_names = np.concatenate([numeric_features, onehot_features])
    
    # Convert to DataFrame to maintain feature names
    X_transformed_df = pd.DataFrame(X_transformed, columns=feature_names)
    
    return X_transformed_df, y, preprocessor

# Function to evaluate models
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    
    return {
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2,
        'MAPE': mape
    }

# Initialize models with parameter grids
models = {
    'DecisionTree': {
        'model': DecisionTreeRegressor(random_state=42),
        'params': {
            'max_depth': [3, 5, 7, 10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    'RandomForest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [5, 10],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        }
    },
    'XGBoost': {
        'model': XGBRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1],
            'min_child_weight': [1, 3]
        }
    }
}

# Preprocess data
X_transformed_df, y, preprocessor = preprocess_data(df)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed_df, y, test_size=0.2, random_state=42
)

print("\nDataset Split Information:")
print(f"Total samples: {len(X_transformed_df)}")
print(f"Training samples: {len(X_train)} ({len(X_train)/len(X_transformed_df)*100:.1f}%)")
print(f"Testing samples: {len(X_test)} ({len(X_test)/len(X_transformed_df)*100:.1f}%)")

# Train and evaluate models
results = {}
for name, model_info in models.items():
    # Perform grid search
    grid_search = GridSearchCV(
        model_info['model'],
        model_info['params'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    
    # Get best model
    best_model = grid_search.best_estimator_
    
    # Make predictions
    y_pred = best_model.predict(X_test)
    
    # Calculate metrics
    metrics = evaluate_model(y_test, y_pred)
    results[name] = {
        'metrics': metrics,
        'best_params': grid_search.best_params_
    }

# Create results DataFrame in the requested format
acc1_values = []
acc2_values = []
mape_values = []
model_names = []


for model_name, model_results in results.items():
    y_pred = models[model_name]['model'].fit(X_train, y_train).predict(X_test)
    
    # Calculate ratio of predicted to true values
    ratio = y_pred / y_test
    
    # Calculate accuracy metrics
    acc1 = np.mean((ratio >= 0.8) & (ratio <= 1.2))
    acc2 = np.mean((ratio >= 0.9) & (ratio <= 1.1))
    mape = model_results['metrics']['MAPE']
    
    # Append values
    model_names.append(model_name)
    acc1_values.append(round(acc1, 4))
    acc2_values.append(round(acc2, 4))
    mape_values.append(round(mape, 3))

results_df = pd.DataFrame({
    'Model': model_names,
    'Acc1\n(0.8 < y_pred/y_true < 1.2)': acc1_values,
    'Acc2\n(0.9 < y_pred/y_true < 1.1)': acc2_values,
    'MAPE': mape_values
})

# Sort results by MAPE (ascending) to show best performing models first
results_df = results_df.sort_values('MAPE', ascending=True).reset_index(drop=True)

# Display results
print("\nModel Performance Summary:")
print(results_df.to_string(index=False))

# Display best parameters for each model
print("\nBest Parameters:")
for name, result in results.items():
    print(f"\n{name}:")
    for param, value in result['best_params'].items():
        print(f"{param}: {value}")

# Print feature importance for Random Forest (as an example)
rf_model = models['RandomForest']['model'].fit(X_train, y_train)
feature_importance = pd.DataFrame({
    'Feature': X_transformed_df.columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nFeature Importance (Random Forest):")
print(feature_importance)


Dataset Split Information:
Total samples: 17580
Training samples: 14064 (80.0%)
Testing samples: 3516 (20.0%)





Model Performance Summary:
       Model  Acc1\n(0.8 < y_pred/y_true < 1.2)  Acc2\n(0.9 < y_pred/y_true < 1.1)  MAPE
DecisionTree                             0.4568                             0.2750 0.459
RandomForest                             0.4983                             0.2787 0.508
     XGBoost                             0.3859                             0.2071 0.524

Best Parameters:

DecisionTree:
max_depth: 10
min_samples_leaf: 1
min_samples_split: 2

RandomForest:
max_depth: 10
min_samples_leaf: 1
min_samples_split: 5
n_estimators: 100

XGBoost:
learning_rate: 0.1
max_depth: 7
min_child_weight: 3
n_estimators: 100

Feature Importance (Random Forest):
                   Feature  Importance
2  input_file_size (bytes)    0.225527
1         regex_complexity    0.213474
5               matcher_-P    0.154620
4               matcher_-G    0.106464
0             regex_length    0.105110
3               matcher_-F    0.102154
6               matcher_-x    0.092651
