In [None]:
# Mount Google Drive at /content/drive so later cells can read the dataset
# folder from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that is searched for the regression CSV datasets.
csv_folder_path = "/content/drive/My Drive/datasets/Regression/"


In [None]:
import os
import glob

# Get list of all CSV files in the folder.
# glob returns matches in arbitrary, filesystem-dependent order, so sort them
# to make the processing order reproducible between runs.
csv_files = sorted(glob.glob(os.path.join(csv_folder_path, "*.csv")))
print(csv_files)

['/content/drive/My Drive/datasets/Regression/AirfoilSelfNoise.csv', '/content/drive/My Drive/datasets/Regression/CrabAgePrediction.csv', '/content/drive/My Drive/datasets/Regression/forestfires.csv', '/content/drive/My Drive/datasets/Regression/US_Health_Insurance.csv', '/content/drive/My Drive/datasets/Regression/cpu_small.csv', '/content/drive/My Drive/datasets/Regression/bike_hour.csv', '/content/drive/My Drive/datasets/Regression/Diamonds Prices2022.csv', '/content/drive/My Drive/datasets/Regression/winequalitywhite1.csv', '/content/drive/My Drive/datasets/Regression/plasma_retinol.csv']


In [None]:
import pandas as pd
import numpy as np
import google.generativeai as genai
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import os
import ast
import time
from google.colab import userdata

# Get a free API key from https://aistudio.google.com/app/apikey and store it
# via Colab's `userdata` store; it is looked up here by name, not hardcoded.
GOOGLE_API_KEY = userdata.get("Google_API_2")  # "Google_API_2" is the stored secret's name; change it to match yours
genai.configure(api_key=GOOGLE_API_KEY)

# Initialize the Gemini model used by LLMFE.generate_transformations
model = genai.GenerativeModel('gemini-1.5-pro-latest')

def nrmse_by_mean(y_true, y_pred):
    """
    Compute Normalized Root Mean Squared Error (nRMSE),
    normalized by the absolute value of the mean of y_true.

    Args:
        y_true: Array-like of ground-truth target values.
        y_pred: Array-like of predictions with the same number of elements.

    Returns:
        RMSE divided by |mean(y_true)|, or float('inf') when that mean is 0.
        The result is never negative, so "lower is better" comparisons stay
        valid even for targets whose mean is negative.

    Raises:
        ValueError: If the inputs are empty or differ in number of elements.
    """
    # np.asarray drops any pandas index, so the subtraction below is purely
    # positional and never re-aligned by label.
    true_values = np.asarray(y_true, dtype=float).ravel()
    predictions = np.asarray(y_pred, dtype=float).ravel()
    if true_values.size == 0:
        raise ValueError("y_true must contain at least one value")
    if true_values.size != predictions.size:
        raise ValueError(
            f"y_true and y_pred differ in size: {true_values.size} != {predictions.size}"
        )

    rmse = np.sqrt(np.mean((true_values - predictions) ** 2))
    # Use the magnitude of the mean: a signed negative mean would flip the
    # score's sign and make a bad model look better than a perfect one.
    mean_magnitude = abs(np.mean(true_values))
    return rmse / mean_magnitude if mean_magnitude != 0 else float('inf')


class LLMFE:
    """Iterative, LLM-driven feature engineering for a tabular regression task.

    Each iteration asks the module-level Gemini `model` for candidate
    `lambda df: df.assign(...)` transformations, scores each one with a
    RandomForestRegressor on a held-out validation split, and keeps the best
    candidate of the batch if it improves on the current best score.

    Attributes set by __init__:
        data, target, metric, max_iter: the constructor arguments.
        X_train, y_train, X_val, y_val: 80/20 split of `data` (random_state=42).
        memory: list of accepted result dicts, in the order they were applied.
        best_score: lowest validation score seen so far (lower is better).
        best_transformation: source string of the most recently accepted
            transformation, or None if nothing has been accepted.
    """

    # Prompt hints and example transformations, keyed by dataset file name.
    DATASET_GUIDANCE = {
        "Diamonds Prices2022.csv": {
            "hint": "Consider transformations involving carat, cut quality, and physical dimensions (x,y,z).",
            "examples": [
                "lambda df: df.assign(volume=df['x'] * df['y'] * df['z'])",
                "lambda df: df.assign(carat_per_depth=df['carat'] / df['depth'])",
                "lambda df: df.assign(cut_encoded=df['cut'].map({'Fair':1, 'Good':2, 'Very Good':3, 'Premium':4, 'Ideal':5}))"
            ]
        },
        "winequalitywhite1.csv": {
            "hint": "Consider transformations involving acidity, sugar, sulfur compounds, and alcohol levels.",
            "examples": [
                "lambda df: df.assign(sugar_to_acidity_ratio=df['residual sugar'] / df['fixed acidity'])",
                "lambda df: df.assign(total_acidity=df['fixed acidity'] + df['volatile acidity'] + df['citric acid'])",
                "lambda df: df.assign(sulfur_ratio=df['free sulfur dioxide'] / df['total sulfur dioxide'])",
                "lambda df: df.assign(alcohol_density_ratio=df['alcohol'] / df['density'])"
            ]
        },
        # Column names follow the upper-case header of plasma_retinol.csv;
        # QUETELET is that file's body-mass-index column.
        "plasma_retinol.csv": {
            "hint": "Consider transformations involving age, BMI, dietary intake, and cholesterol levels.",
            "examples": [
                "lambda df: df.assign(bmi_age_ratio=df['QUETELET'] / df['AGE'])",
                "lambda df: df.assign(fat_to_calories_ratio=df['FAT'] / df['CALORIES'])",
                "lambda df: df.assign(cholesterol_per_bmi=df['CHOLESTEROL'] / df['QUETELET'])",
                "lambda df: df.assign(age_squared=df['AGE'] ** 2)"
            ]
        },
        "AirfoilSelfNoise.csv": {
            "hint": "Consider aerodynamic interactions between frequency, angle of attack, and flow velocity.",
            "examples": [
                "lambda df: df.assign(reynolds=df['U_infinity'] * df['c'] / 1.5e-5)",  # Approx kinematic viscosity of air
                "lambda df: df.assign(strouhal=df['f'] * df['c'] / (df['U_infinity'] + 1e-9))",  # Small value to avoid div/0
                "lambda df: df.assign(angle_velocity_ratio=df['alpha'] / (df['U_infinity'] + 1e-9))"
            ]
        },
        "CrabAgePrediction.csv": {
            "hint": "Consider ratios between different weight measurements and size dimensions.",
            "examples": [
                "lambda df: df.assign(shell_ratio=df['Shell Weight'] / df['Weight'])",
                "lambda df: df.assign(size_to_weight=df['Length'] * df['Diameter'] / df['Weight'])",
                "lambda df: df.assign(meat_yield=(df['Shucked Weight'] + df['Viscera Weight']) / df['Weight'])"
            ]
        },
        "forestfires.csv": {
            "hint": "Consider interactions between weather conditions and temporal factors.",
            "examples": [
                "lambda df: df.assign(fire_risk_index=df['temp'] * df['wind'] / (df['RH'] + 1))",
                "lambda df: df.assign(drought_index=df['DMC'] * df['DC'])",
                "lambda df: df.assign(month_encoded=pd.to_datetime(df['month'], format='%b').dt.month)"
            ]
        },
        "US_Health_Insurance.csv": {
            "hint": "Consider interactions between BMI, smoking status, and age.",
            "examples": [
                "lambda df: df.assign(bmi_age=df['bmi'] * df['age'])",
                "lambda df: df.assign(smoker_encoded=df['smoker'].map({'yes':1, 'no':0}))",
                "lambda df: df.assign(risk_factor=np.where(df['smoker']=='yes', df['bmi']*df['age'], df['bmi']))"
            ]
        },
        "cpu_small.csv": {
            "hint": "Consider ratios between different system operations and resource usage.",
            "examples": [
                "lambda df: df.assign(io_ratio=df['lread'] / (df['lwrite'] + 1))",
                "lambda df: df.assign(mem_pressure=df['freemem'] / (df['freeswap'] + 1))",
                "lambda df: df.assign(syscall_efficiency=df['exec'] / (df['scall'] + 1))"
            ]
        },
        "bike_hour.csv": {
            "hint": "Consider temporal patterns and weather interactions.",
            "examples": [
                "lambda df: df.assign(temp_feel=df['temp'] * df['hum'])",
                "lambda df: df.assign(hour_sin=np.sin(2 * np.pi * df['hr'] / 24))",
                "lambda df: df.assign(workday_weather=df['workingday'] * df['weathersit'])"
            ]
        }
    }

    # Fallback guidance used when the dataset file name has no entry above.
    DEFAULT_GUIDANCE = {
        "hint": "Consider interactions between numerical features and encodings for categoricals.",
        "examples": [
            "lambda df: df.assign(feature1_squared=df['feature1'] ** 2)",
            "lambda df: df.assign(feature_ratio=df['feature1'] / (df['feature2'] + 1e-9))",
            "lambda df: df.assign(cat_encoded=df['category'].map({'A':1, 'B':2, 'C':3}))"
        ]
    }

    def __init__(self, data, target_column, metric=nrmse_by_mean, max_iter=50):
        """Store the configuration and create the train/validation split.

        Args:
            data: DataFrame holding the features and the target column. If
                `data.attrs['filename']` is set, its base name selects the
                dataset-specific prompt guidance.
            target_column: Name of the column to predict.
            metric: Callable `(y_true, y_pred) -> score`; lower is better.
            max_iter: Number of generate/evaluate rounds performed by run().
        """
        self.data = data
        self.target = target_column
        self.metric = metric
        self.max_iter = max_iter

        # Split data
        self.train, self.val = train_test_split(data, test_size=0.2, random_state=42)
        self.X_train = self.train.drop(target_column, axis=1)
        self.y_train = self.train[target_column]
        self.X_val = self.val.drop(target_column, axis=1)
        self.y_val = self.val[target_column]

        # Memory buffers
        self.memory = []
        self.best_score = float('inf')
        self.best_transformation = None

    def _dataset_name(self):
        """Return the base name of data.attrs['filename'], or 'unknown' if unset."""
        return os.path.basename(self.data.attrs.get('filename', 'unknown'))

    def _create_feature_selection_prompt(self):
        """Build the phase-1 feature-selection prompt.

        No other method of this class calls it yet; it is kept as a separate
        helper so _create_prompt does not rebuild an unused string on every
        call.
        """
        return f"""
      **PHASE 1: Feature Selection for '{self.target}'**

      Dataset: {self._dataset_name()}
      Features: {list(self.X_train.columns)}

      Instructions:
      1. Analyze features using:
       - Correlation (keep if |r| > 0.15)
       - Domain knowledge (e.g., "carat" matters for diamond prices)
       - Variance (drop if >95% same value)
      2. Flag redundant features (drop if r > 0.8 with more important feature)

      Required Output:
      ```python
      # Features to KEEP (high relevance):
      keep = ['feature1', 'feature2']

      # Features to DROP:
      drop = ['id', 'constant_feature']

      # Potential interaction terms:
      interactions = [('feature1', 'feature2')]
      ```
      """

    def _create_prompt(self):
        """Build the prompt asking the LLM for three new feature transformations.

        The prompt lists the current feature columns, the current best score
        and dataset-specific (or default) hints and example lambdas.
        """
        guidance = self.DATASET_GUIDANCE.get(self._dataset_name(), self.DEFAULT_GUIDANCE)

        return f"""
      You are a feature engineering expert working with a dataset to predict '{self.target}'.
      The available features are: {list(self.X_train.columns)}

      Dataset-specific guidance: {guidance['hint']}

      Current best score (NRMSE): {self.best_score:.4f}

      Here are some relevant transformation examples for this dataset:
      {chr(10).join(guidance['examples'])}

      Please suggest 3 novel, computationally efficient feature transformations that would help predict '{self.target}'.
      Focus on creating meaningful interactions between features or transformations that might reveal non-linear relationships.

      Format each transformation as a Python lambda function:
      lambda df: df.assign(<new_feature_name>=<transformation_expression>)

      Requirements:
      1. Each transformation should be a single line
      2. Avoid extremely complex expressions that might cause numerical instability
      3. Include comments explaining the transformation when appropriate
      """

    def generate_transformations(self):
        """Query the module-level Gemini `model` and return the parsed lambdas.

        Returns:
            List of at most three transformation source strings.
        """
        prompt = self._create_prompt()
        response = model.generate_content(prompt)
        return self._parse_response(response.text)

    def _parse_response(self, text):
        """Extract up to three syntactically valid lambda expressions from text.

        A line is accepted when, after removing surrounding whitespace and any
        leading list marker ("-", "*", "+", "1.", "2)") or backticks, it starts
        with `lambda` and parses as a single Python lambda expression.

        Args:
            text: Raw LLM response.

        Returns:
            List of at most three lambda source strings, in order of appearance.
        """
        transformations = []
        for raw_line in text.splitlines():
            # LLM answers are often bulleted, numbered, indented or wrapped in
            # inline backticks; peel that decoration off before matching.
            candidate = raw_line.strip().lstrip("-*+0123456789.) \t`")
            if not candidate.startswith('lambda'):
                continue
            # Drop a closing inline-code backtick and anything after it.
            candidate = candidate.split('`', 1)[0].strip()
            try:
                # mode='eval' mirrors how the string is executed later (eval).
                tree = ast.parse(candidate, mode='eval')
            except (SyntaxError, ValueError):
                continue
            if isinstance(tree.body, ast.Lambda):
                transformations.append(candidate)
        return transformations[:3]

    def evaluate_transformation(self, transformation):
        """Score one transformation on the validation split.

        The lambda is applied to copies of X_train and X_val, object columns
        are integer-encoded (validation values unseen in training become -1),
        a RandomForestRegressor is fitted and `self.metric` is computed on the
        validation predictions.

        Args:
            transformation: Source string of a `lambda df: ...` expression.

        Returns:
            Dict with 'score', 'transformation' and 'features', or None if
            anything raised while applying or scoring the transformation (the
            error is printed).
        """
        try:
            # SECURITY: eval executes arbitrary code produced by the LLM. Only
            # run this in a disposable environment such as a Colab runtime.
            func = eval(transformation)

            # Apply transformation to copies so the stored splits stay untouched
            X_train_trans = func(self.X_train.copy())
            X_val_trans = func(self.X_val.copy())

            # Encode categorical features using the categories seen in training
            for col in X_train_trans.select_dtypes(include=['object']).columns:
                codes, uniques = pd.factorize(X_train_trans[col])
                X_train_trans[col] = codes
                code_by_value = {val: idx for idx, val in enumerate(uniques)}
                X_val_trans[col] = X_val_trans[col].map(code_by_value).fillna(-1)

            # Train model (named `regressor` to avoid shadowing the module-level
            # Gemini `model`)
            regressor = RandomForestRegressor(n_estimators=50, random_state=42)
            regressor.fit(X_train_trans, self.y_train)
            preds = regressor.predict(X_val_trans)

            # Calculate validation score
            score = self.metric(self.y_val, preds)

            return {
                'score': score,
                'transformation': transformation,
                'features': list(X_train_trans.columns)
            }

        except Exception as e:
            print(f"Error in transformation: {e}")
            return None

    def run(self):
        """Run the generate/evaluate loop for `max_iter` iterations.

        Before the first iteration the untransformed features are scored so a
        candidate is only accepted when it beats the baseline. In each
        iteration all candidates are scored against the same feature set and
        only the single best improving one is applied, which keeps best_score
        equal to the score measured for the feature set actually in use.

        Returns:
            Tuple `(best_transformation, best_score)`. `best_transformation`
            is the most recently accepted step (None if nothing beat the
            baseline); the full ordered chain is in `self.memory`.
        """
        if self.best_score == float('inf'):
            # The identity lambda scores the current features unchanged.
            baseline = self.evaluate_transformation("lambda df: df")
            if baseline:
                self.best_score = baseline['score']
                print(f"Baseline NRMSE: {self.best_score:.4f}")

        for iteration in range(self.max_iter):
            print(f"\n--- Iteration {iteration+1}/{self.max_iter} ---")

            # Generate transformations
            transformations = self.generate_transformations()
            print(f"Generated {len(transformations)} transformations")

            # Evaluate transformations (all against the same current features)
            results = [self.evaluate_transformation(t) for t in transformations]

            # Keep only candidates that improve; NaN scores compare False and
            # are therefore filtered out here as well.
            improving = [r for r in results if r and r['score'] < self.best_score]
            if improving:
                best = min(improving, key=lambda r: r['score'])
                self.best_score = best['score']
                self.best_transformation = best['transformation']
                self.memory.append(best)
                # Make the accepted feature part of the working splits so
                # later iterations build on top of it.
                func = eval(self.best_transformation)
                self.X_train = func(self.X_train.copy())
                self.X_val = func(self.X_val.copy())

            print(f"Current Best NRMSE: {self.best_score:.4f}")

        return self.best_transformation, self.best_score

In [None]:
import pandas as pd
# Run the LLMFE pipeline once per dataset; a failure in one file is reported
# and the loop moves on to the next file.
for csv_file in csv_files:
    try:
        print(f"\n📂 Processing file: {csv_file}")
        data = pd.read_csv(csv_file)
        # LLMFE picks its dataset-specific prompt guidance from
        # data.attrs['filename']; without it every dataset silently falls back
        # to the generic guidance.
        data.attrs['filename'] = csv_file

        print("Columns in the dataset:", list(data.columns))
        target_col = input("👉 Please enter the name of the target column: ").strip()

        if target_col not in data.columns:
            print(f"❌ Column '{target_col}' not found in dataset. Skipping.")
            continue

        print(f"🎯 Using target column: {target_col}")

        # Run the LLMFE pipeline
        fe = LLMFE(data=data, target_column=target_col, max_iter=3)
        best_trans, best_score = fe.run()

        print(f"\n✅ Done: {os.path.basename(csv_file)}")
        print(f"🔍 Best Transformation: {best_trans}")
        print(f"📉 Validation NRMSE: {best_score:.4f}")
    except Exception as e:
        print(f"❌ Error processing {csv_file}: {e}")


📂 Processing file: /content/drive/My Drive/datasets/Regression/AirfoilSelfNoise.csv
Columns in the dataset: ['f', 'alpha', 'c', 'U_infinity', 'delta', 'SSPL']
👉 Please enter the name of the target column: SSPL
🎯 Using target column: SSPL

--- Iteration 1/3 ---
Generated 3 transformations
Current Best NRMSE: 0.0133

--- Iteration 2/3 ---
Generated 3 transformations
Current Best NRMSE: 0.0133

--- Iteration 3/3 ---
Generated 3 transformations
Current Best NRMSE: 0.0133

✅ Done: AirfoilSelfNoise.csv
🔍 Best Transformation: lambda df: df.assign(f_alpha_scaled=df['f'] * df['alpha'] / (df['U_infinity'] + 1e-9))
📉 Validation NRMSE: 0.0133

📂 Processing file: /content/drive/My Drive/datasets/Regression/CrabAgePrediction.csv
Columns in the dataset: ['Sex', 'Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight', 'Age']
👉 Please enter the name of the target column: Age
🎯 Using target column: Age

--- Iteration 1/3 ---
Generated 3 transformations
Current Best 



❌ Error processing /content/drive/My Drive/datasets/Regression/forestfires.csv: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.

📂 Processing file: /content/drive/My Drive/datasets/Regression/US_Health_Insurance.csv
Columns in the dataset: ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
👉 Please enter the name of the target column: charges
🎯 Using target column: charges

--- Iteration 1/3 ---
Generated 0 transformations
Current Best NRMSE: inf

--- Iteration 2/3 ---
Generated 0 transformations
Current Best NRMSE: inf

--- Iteration 3/3 ---
Generated 0 transformations
Current Best NRMSE: inf

✅ Done: US_Health_Insurance.csv
🔍 Best Transformation: None
📉 Validation NRMSE: inf

📂 Processing file: /content/drive/My



❌ Error processing /content/drive/My Drive/datasets/Regression/cpu_small.csv: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.

📂 Processing file: /content/drive/My Drive/datasets/Regression/bike_hour.csv
Columns in the dataset: ['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'cnt']
👉 Please enter the name of the target column: hr
🎯 Using target column: hr

--- Iteration 1/3 ---




❌ Error processing /content/drive/My Drive/datasets/Regression/bike_hour.csv: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.

📂 Processing file: /content/drive/My Drive/datasets/Regression/Diamonds Prices2022.csv
Columns in the dataset: ['Id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z']
👉 Please enter the name of the target column: price
🎯 Using target column: price

--- Iteration 1/3 ---




❌ Error processing /content/drive/My Drive/datasets/Regression/Diamonds Prices2022.csv: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.

📂 Processing file: /content/drive/My Drive/datasets/Regression/winequalitywhite1.csv
Columns in the dataset: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']
👉 Please enter the name of the target column: quality
🎯 Using target column: quality

--- Iteration 1/3 ---




❌ Error processing /content/drive/My Drive/datasets/Regression/winequalitywhite1.csv: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.

📂 Processing file: /content/drive/My Drive/datasets/Regression/plasma_retinol.csv
Columns in the dataset: ['AGE', 'SEX', 'SMOKSTAT', 'QUETELET', 'VITUSE', 'CALORIES', 'FAT', 'FIBER', 'ALCOHOL', 'CHOLESTEROL', 'BETADIET', 'RETDIET', 'BETAPLASMA', 'RETPLASMA']
👉 Please enter the name of the target column: RETPLASMA
🎯 Using target column: RETPLASMA

--- Iteration 1/3 ---
❌ Error processing /content/drive/My Drive/datasets/Regression/plasma_retinol.csv: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceed

