In [1]:
# Mount Google Drive into the Colab filesystem; the dataset paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Folder on the mounted Drive that is searched for *.csv classification datasets.
csv_folder_path = "/content/drive/My Drive/datasets/Classification/"


In [3]:
import os
import glob

# Collect every CSV that sits directly inside the dataset folder (non-recursive).
csv_pattern = os.path.join(csv_folder_path, "*.csv")
csv_files = glob.glob(csv_pattern)
print(csv_files)

['/content/drive/My Drive/datasets/Classification/balance-scale.csv', '/content/drive/My Drive/datasets/Classification/Contraceptive_Method_Classification.csv', '/content/drive/My Drive/datasets/Classification/eucalyptus_dataset.csv', '/content/drive/My Drive/datasets/Classification/heart .csv', '/content/drive/My Drive/datasets/Classification/breast-wisconsin.csv', '/content/drive/My Drive/datasets/Classification/blood-transfusion-sc.csv', '/content/drive/My Drive/datasets/Classification/pc1.csv', '/content/drive/My Drive/datasets/Classification/tic-tac-toe.csv', '/content/drive/My Drive/datasets/Classification/vehicle_c.csv', '/content/drive/My Drive/datasets/Classification/carPr.csv', '/content/drive/My Drive/datasets/Classification/credit_g.csv']


In [17]:
import pandas as pd
import numpy as np
import openai
from openai import OpenAI
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import os
import ast
import time
from google.colab import userdata
# Initialize the module-level OpenAI client used by LLMFE.generate_transformations.
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")  # Read from Colab's userdata store; the key is not hard-coded here
client = OpenAI(api_key=OPENAI_API_KEY)


class LLMFE:
    """LLM-driven feature-engineering loop for a tabular classification dataset.

    The data is split 80/20 into train/validation. On every iteration the
    OpenAI chat model is asked for up to three ``lambda df: df.assign(...)``
    transformations; each one is scored by fitting a RandomForestClassifier on
    the transformed training features and computing ``metric`` on the
    validation split. A transformation that beats the best score so far is
    recorded and applied permanently to the working feature frames.

    Attributes:
        data: The full input DataFrame (``data.attrs['filename']`` is read,
            if present, to select dataset-specific prompt guidance).
        target: Name of the target column.
        metric: Callable ``metric(y_true, y_pred)``; higher is treated as better.
        max_iter: Number of generate/evaluate rounds performed by ``run``.
        X_train, X_val: Working feature frames (grow as transformations are accepted).
        y_train, y_val: Integer-encoded labels.
        le: The fitted LabelEncoder.
        memory: List of result dicts for every accepted transformation.
        best_score: Best validation score seen so far (``-inf`` before any).
        best_transformation: Source string of the most recently accepted lambda.
    """

    # Per-dataset hints and example lambdas injected into the prompt. Looked up
    # by the basename of data.attrs['filename']; "default" is the fallback.
    _DATASET_GUIDANCE = {
        "Contraceptive_Method_Classification.csv": {
            "hint": "Consider interactions between wife and husband characteristics and family size.",
            "examples": [
                "lambda df: df.assign(education_gap=df['Wife Education'] - df['Husband Education'])",
                "lambda df: df.assign(children_per_year=df['Children'] / (df['Wife Age'] - 18))",
                "lambda df: df.assign(working_mom=df['Wife working'] * df['Children'])"
            ]
        },
        "balance-scale.csv": {
            "hint": "Focus on weight and distance ratios between left and right sides.",
            "examples": [
                "lambda df: df.assign(weight_ratio=df['L-Weight'] / (df['R-Weight'] + 1e-9))",
                "lambda df: df.assign(distance_ratio=df['L-Distance'] / (df['R-Distance'] + 1e-9))",
                "lambda df: df.assign(weight_diff=df['L-Weight'] - df['R-Weight'])"
            ]
        },
        "default": {
            "hint": "Consider interactions between numerical features and encodings for categoricals.",
            "examples": [
                "lambda df: df.assign(feature1_squared=df['feature1'] ** 2)",
                "lambda df: df.assign(feature_ratio=df['feature1'] / (df['feature2'] + 1e-9))",
                "lambda df: df.assign(cat_encoded=df['category'].map({'A':1, 'B':2, 'C':3}))"
            ]
        }
    }

    # Characters tolerated in front of a lambda on a response line
    # (whitespace, list numbering/bullets, markdown back-ticks).
    _LIST_MARKER_CHARS = " \t-*`.)0123456789"

    def __init__(self, data, target_column, metric=accuracy_score, max_iter=50):
        """Split the data, encode the labels and reset the search state.

        Args:
            data: DataFrame containing the features and the target column.
            target_column: Name of the column to predict.
            metric: Scoring callable ``metric(y_true, y_pred)``.
            max_iter: Number of iterations ``run`` will perform.
        """
        self.data = data
        self.target = target_column
        self.metric = metric
        self.max_iter = max_iter

        # Hold out 20% for validation (fixed seed for repeatability)
        self.train, self.val = train_test_split(data, test_size=0.2, random_state=42)
        self.X_train = self.train.drop(target_column, axis=1)
        self.X_val = self.val.drop(target_column, axis=1)

        # Fit the encoder on ALL labels: fitting on the training split only
        # makes transform() raise when a class appears only in validation.
        self.le = LabelEncoder()
        self.le.fit(data[target_column])
        self.y_train = self.le.transform(self.train[target_column])
        self.y_val = self.le.transform(self.val[target_column])

        # Memory buffers
        self.memory = []
        self.best_score = float('-inf')
        self.best_transformation = None

    def _dataset_filename(self):
        """Return the basename of ``data.attrs['filename']`` or ``'unknown'`` if unset."""
        return os.path.basename(self.data.attrs.get('filename', 'unknown'))

    def _create_feature_selection_prompt(self):
        """Build the "phase 1" feature-selection prompt.

        This text was previously built inside ``_create_prompt`` and discarded;
        it is kept here unchanged but is not sent to the model by ``run``.
        """
        return f"""
        **PHASE 1: Feature Selection for '{self.target}'**

        Dataset: {self._dataset_filename()}
        Features: {list(self.X_train.columns)}

        Instructions:
        1. Analyze features using:
        - Correlation (keep if |r| > 0.15)
        - Domain knowledge (e.g., "age" matters for disease prediction)
        - Variance (drop if >95% same value)
        2. Flag redundant features (drop if correlation > 0.8 with more important feature)
        3. For categorical features, check cardinality (drop if >20 unique values)

        Required Output:
        ```python
        # Features to KEEP (high relevance):
        keep = ['feature1', 'feature2']

        # Features to DROP:
        drop = ['id', 'constant_feature']

        # Potential interaction terms:
        interactions = [('feature1', 'feature2')]
        ```
        """

    def _create_prompt(self):
        """Build the transformation-request prompt for the current feature set.

        Dataset-specific hints and examples are chosen by file name, falling
        back to the generic "default" entry.
        """
        guidance = self._DATASET_GUIDANCE.get(
            self._dataset_filename(), self._DATASET_GUIDANCE["default"]
        )

        return f"""
        You are a feature engineering expert working with a classification dataset to predict '{self.target}'.
        The available features are: {list(self.X_train.columns)}

        Dataset-specific guidance: {guidance['hint']}

        Current best accuracy: {self.best_score:.4f}

        Here are some relevant transformation examples for this dataset:
        {chr(10).join(guidance['examples'])}

        Please suggest 3 novel, computationally efficient feature transformations that would help classify '{self.target}'.
        Focus on:
        1. Creating meaningful interactions between features
        2. Transformations that might reveal non-linear decision boundaries
        3. Appropriate encodings for categorical variables

        Format each transformation as a Python lambda function:
        lambda df: df.assign(<new_feature_name>=<transformation_expression>)

        Requirements:
        1. Each transformation should be a single line
        2. Avoid extremely complex expressions that might cause numerical instability
        3. Include comments explaining the transformation when appropriate
        """

    def generate_transformations(self):
        """Ask the chat model for candidate transformations.

        Returns:
            A list of at most three lambda source strings; an empty list if
            the API call fails (the error is printed, not raised).
        """
        prompt = self._create_prompt()

        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful AI assistant that generates Python code for feature engineering in classification tasks."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
                max_tokens=500
            )
            return self._parse_response(response.choices[0].message.content)
        except Exception as e:
            # Best effort: one failed API call should not abort the whole run
            print(f"Error calling OpenAI API: {e}")
            return []

    def _parse_response(self, text):
        """Extract up to three single-line lambda expressions from model output.

        A line is accepted when, after stripping whitespace and any leading
        list/markdown decoration (``1.``, ``-``, ``*``, back-ticks), it parses
        as a Python *lambda expression*. Trailing ``#`` comments are kept.

        Args:
            text: Raw message content returned by the model (may be None/empty).

        Returns:
            List of lambda source strings, at most three.
        """
        if not text:
            return []

        transformations = []
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            start = line.find('lambda')
            if start == -1:
                continue
            # Anything other than list decoration before "lambda" means this
            # is prose or an assignment, not a bare transformation.
            if line[:start].strip(self._LIST_MARKER_CHARS):
                continue
            candidate = line[start:].rstrip('`').strip()
            try:
                tree = ast.parse(candidate, mode='eval')
            except (SyntaxError, ValueError):
                continue
            # Only accept real lambdas so eval() later yields a callable
            if isinstance(tree.body, ast.Lambda):
                transformations.append(candidate)
        return transformations[:3]

    def evaluate_transformation(self, transformation):
        """Score one candidate transformation on the validation split.

        Args:
            transformation: Source string of a ``lambda df: ...`` expression.

        Returns:
            Dict with keys ``'score'``, ``'transformation'`` and ``'features'``
            (column names after the transformation), or None if anything fails
            (the error is printed).
        """
        try:
            # SECURITY: this executes model-generated code with full access to
            # the notebook's namespace. Only run against a trusted model/output.
            func = eval(transformation)

            # Apply transformation to copies so a bad candidate cannot corrupt state
            X_train_trans = func(self.X_train.copy())
            X_val_trans = func(self.X_val.copy())

            # Encode categorical features: integer codes learned on train,
            # categories unseen in train become -1 in validation
            for col in X_train_trans.select_dtypes(include=['object']).columns:
                X_train_trans[col], uniques = pd.factorize(X_train_trans[col])
                X_val_trans[col] = X_val_trans[col].map({val: idx for idx, val in enumerate(uniques)}).fillna(-1)

            # Train the evaluation model
            model = RandomForestClassifier(n_estimators=50, random_state=42)
            model.fit(X_train_trans, self.y_train)
            preds = model.predict(X_val_trans)

            # Score predictions with the configured metric
            score = self.metric(self.y_val, preds)

            return {
                'score': score,
                'transformation': transformation,
                'features': list(X_train_trans.columns)
            }

        except Exception as e:
            print(f"Error in transformation: {e}")
            return None

    def run(self):
        """Run the generate -> evaluate -> accept loop for ``max_iter`` rounds.

        Returns:
            Tuple ``(best_transformation, best_score)``; the transformation is
            None and the score ``-inf`` if no candidate was ever accepted.
        """
        for iteration in range(self.max_iter):
            print(f"\n--- Iteration {iteration+1}/{self.max_iter} ---")

            # Generate transformations
            transformations = self.generate_transformations()
            print(f"Generated {len(transformations)} transformations")

            # Evaluate transformations (all against the feature set as it
            # stands at the start of this iteration)
            results = [self.evaluate_transformation(t) for t in transformations]

            # Update memory
            # NOTE: every candidate that beats the running best is applied, so
            # several may be added in one iteration even though each was scored
            # on its own.
            for result in results:
                if result and result['score'] > self.best_score:
                    self.best_score = result['score']
                    self.best_transformation = result['transformation']
                    self.memory.append(result)
                    # SECURITY: executes model-generated code (see evaluate_transformation)
                    func = eval(self.best_transformation)
                    self.X_train = func(self.X_train.copy())
                    self.X_val = func(self.X_val.copy())

            print(f"Current Best Accuracy: {self.best_score:.6f}")

        return self.best_transformation, self.best_score

In [19]:
import pandas as pd
import os

# Run the LLMFE pipeline on every dataset in csv_files (built by the glob cell earlier in the notebook).
# The target column is asked for interactively; a failure on one file is reported and the loop moves on.
for csv_file in csv_files:
    try:
        print(f"\n📂 Processing file: {csv_file}")
        data = pd.read_csv(csv_file)
        # LLMFE picks its dataset-specific prompt guidance from data.attrs['filename'];
        # without this it always falls back to the generic "default" hints.
        data.attrs['filename'] = csv_file

        print("Columns in the dataset:", list(data.columns))
        target_col = input("👉 Please enter the name of the target column: ").strip()

        if target_col not in data.columns:
            print(f"❌ Column '{target_col}' not found in dataset. Skipping.")
            continue

        print(f"🎯 Using target column: {target_col}")

        # Run the LLMFE pipeline for classification
        fe = LLMFE(data=data, target_column=target_col, max_iter=3)
        best_trans, best_score = fe.run()

        print(f"\n✅ Done: {os.path.basename(csv_file)}")
        print(f"🔍 Best Transformation: {best_trans}")
        print(f"📈 Validation Accuracy: {best_score:.4f}")
    except Exception as e:
        # Keep going with the remaining datasets; the error is surfaced in the output
        print(f"❌ Error processing {csv_file}: {e}")


📂 Processing file: /content/drive/My Drive/datasets/Classification/balance-scale.csv
Columns in the dataset: ['Class', 'L-Weight', 'L-Distance', 'R-Weight', 'R-Distance']
👉 Please enter the name of the target column: Class
🎯 Using target column: Class

--- Iteration 1/3 ---
Generated 3 transformations
Error in transformation: 'Class'
Current Best Accuracy: 0.872000

--- Iteration 2/3 ---
Generated 3 transformations
Error in transformation: 'Class'
Current Best Accuracy: 0.872000

--- Iteration 3/3 ---
Generated 3 transformations
Current Best Accuracy: 0.880000

✅ Done: balance-scale.csv
🔍 Best Transformation: lambda df: df.assign(weight_distance_diff=df['L-Weight'] - df['R-Weight'])  # Interaction: Difference between left and right weights
📈 Validation Accuracy: 0.8800

📂 Processing file: /content/drive/My Drive/datasets/Classification/Contraceptive_Method_Classification.csv
Columns in the dataset: ['Wife Age', 'Wife Education', 'Husband Education', 'Children', 'Wife religion', 'Wife 