In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
csv_folder_path = "/content/drive/My Drive/datasets/Classification/"


In [None]:
import os
import glob

# Get list of all CSV files in the folder
csv_files = glob.glob(os.path.join(csv_folder_path, "*.csv"))
print(csv_files)

['/content/drive/My Drive/datasets/Classification/balance-scale.csv', '/content/drive/My Drive/datasets/Classification/Contraceptive_Method_Classification.csv', '/content/drive/My Drive/datasets/Classification/eucalyptus_dataset.csv', '/content/drive/My Drive/datasets/Classification/heart .csv', '/content/drive/My Drive/datasets/Classification/breast-wisconsin.csv', '/content/drive/My Drive/datasets/Classification/blood-transfusion-sc.csv', '/content/drive/My Drive/datasets/Classification/pc1.csv', '/content/drive/My Drive/datasets/Classification/tic-tac-toe.csv', '/content/drive/My Drive/datasets/Classification/vehicle_c.csv', '/content/drive/My Drive/datasets/Classification/carPr.csv', '/content/drive/My Drive/datasets/Classification/credit_g.csv']


In [None]:
import pandas as pd
import numpy as np
import google.generativeai as genai
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # Changed to Classifier
from sklearn.metrics import accuracy_score  # Using accuracy instead of RMSE
import os
import ast
import time
from google.colab import userdata

# Get free API key from https://aistudio.google.com/app/apikey
GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")  # Replace with your key
genai.configure(api_key=GOOGLE_API_KEY)

# Initialize Gemini model
model = genai.GenerativeModel('gemini-1.5-pro-latest')

class LLMFE:
    def __init__(self, data, target_column, metric=accuracy_score, max_iter=50):
        self.data = data
        self.target = target_column
        self.metric = metric
        self.max_iter = max_iter

        # Split data
        self.train, self.val = train_test_split(data, test_size=0.2, random_state=42)
        self.X_train = self.train.drop(target_column, axis=1)
        self.y_train = self.train[target_column]
        self.X_val = self.val.drop(target_column, axis=1)
        self.y_val = self.val[target_column]

        # Memory buffers
        self.memory = []
        self.best_score = float('-inf')  # Changed to -inf since we want to maximize accuracy
        self.best_transformation = None

    def _create_prompt(self):
      # ===== PHASE 1: FEATURE SELECTION =====
      feature_selection_prompt = f"""
      **PHASE 1: Feature Selection for '{self.target}'**

      Dataset: {os.path.basename(self.data.attrs.get('filename', 'unknown'))}
      Features: {list(self.X_train.columns)}

      Instructions:
      1. Analyze features using:
      - Correlation (keep if |r| > 0.15)
      - Domain knowledge (e.g., "age" matters for disease prediction)
      - Variance (drop if >95% same value)
      2. Flag redundant features (drop if correlation > 0.8 with more important feature)
      3. For categorical features, check cardinality (drop if >20 unique values)

      Required Output:
      ```python
      # Features to KEEP (high relevance):
      keep = ['feature1', 'feature2']

      # Features to DROP:
      drop = ['id', 'constant_feature']

      # Potential interaction terms:
      interactions = [('feature1', 'feature2')]
      ```
      """

      # Dataset-specific guidance for classification tasks
      dataset_guidance = {
          "balance-scale.csv": {
              "hint": "Consider interactions between left and right weight/distance measurements.",
              "examples": [
                  "lambda df: df.assign(weight_diff=df['L-Weight'] - df['R-Weight'])",
                  "lambda df: df.assign(distance_ratio=df['L-Distance'] / (df['R-Distance'] + 1e-9))",
                  "lambda df: df.assign(total_torque=(df['L-Weight']*df['L-Distance']) - (df['R-Weight']*df['R-Distance']))"
              ]
          },
          "Contraceptive_Method_Classification.csv": {
              "hint": "Consider interactions between wife and husband characteristics and family size.",
              "examples": [
                  "lambda df: df.assign(education_gap=df['Wife Education'] - df['Husband Education'])",
                  "lambda df: df.assign(children_per_year=df['Children'] / (df['Wife Age'] - 18))",  # Approx years of fertility
                  "lambda df: df.assign(working_mom=df['Wife working'] * df['Children'])"
              ]
          },
          "heart.csv": {
              "hint": "Consider interactions between age, cholesterol levels, and other health indicators.",
              "examples": [
                  "lambda df: df.assign(age_chol_ratio=df['age'] / df['chol'])",
                  "lambda df: df.assign(blood_pressure_diff=df['trestbps'] - 120)",  # Difference from normal
                  "lambda df: df.assign(risk_factor=df['age'] * df['chol'] / (df['thalach'] + 1e-9))"
              ]
          },
          "breast-wisconsin.csv": {
              "hint": "Consider statistical properties of cell characteristics.",
              "examples": [
                  "lambda df: df.assign(mean_diff=df['mean radius'] - df['mean texture'])",
                  "lambda df: df.assign(compactness=df['mean perimeter']**2 / (4*np.pi*df['mean area']))",
                  "lambda df: df.assign(size_variation=df['mean area'] * df['mean smoothness'])"
              ]
          },
          "tic-tac-toe.csv": {
              "hint": "Consider patterns in board positions and winning configurations.",
              "examples": [
                  "lambda df: df.assign(top_row=df['top-left'] + df['top-middle'] + df['top-right'])",
                  "lambda df: df.assign(diagonal_x=(df['top-left'] == df['middle-middle']) & (df['middle-middle'] == df['bottom-right']))",
                  "lambda df: df.assign(center_control=df['middle-middle'].map({'x':1, 'o':-1, 'b':0}))"
              ]
          },
          "vehicle_c.csv": {
              "hint": "Consider geometric relationships between vehicle dimensions.",
              "examples": [
                  "lambda df: df.assign(compactness=df['COMPACTNESS'] * df['CIRCULARITY'])",
                  "lambda df: df.assign(size_ratio=df['MAX.LENGTH_RECT'] / (df['MIN.LENGTH_RECT'] + 1e-9))",
                  "lambda df: df.assign(scatter_radius=df['DISTANCE_CIRCULARITY'] * df['RADIUS_RATIO'])"
              ]
          }
      }

      # Get filename for guidance lookup
      filename = os.path.basename(self.data.attrs.get('filename', 'unknown'))

      # Get dataset-specific guidance or use defaults
      guidance = dataset_guidance.get(filename, {
          "hint": "Consider interactions between numerical features and encodings for categoricals.",
          "examples": [
              "lambda df: df.assign(feature1_squared=df['feature1'] ** 2)",
              "lambda df: df.assign(feature_ratio=df['feature1'] / (df['feature2'] + 1e-9))",
              "lambda df: df.assign(cat_encoded=df['category'].map({'A':1, 'B':2, 'C':3}))"
          ]
      })

      return f"""
      You are a feature engineering expert working with a classification dataset to predict '{self.target}'.
      The available features are: {list(self.X_train.columns)}

      Dataset-specific guidance: {guidance['hint']}

      Current best accuracy: {self.best_score:.4f}

      Here are some relevant transformation examples for this dataset:
      {chr(10).join(guidance['examples'])}

      Please suggest 3 novel, computationally efficient feature transformations that would help classify '{self.target}'.
      Focus on:
      1. Creating meaningful interactions between features
      2. Transformations that might reveal non-linear decision boundaries
      3. Appropriate encodings for categorical variables

      Format each transformation as a Python lambda function:
      lambda df: df.assign(<new_feature_name>=<transformation_expression>)

      Requirements:
      1. Each transformation should be a single line
      2. Avoid extremely complex expressions that might cause numerical instability
      3. Include comments explaining the transformation when appropriate
      """

    def generate_transformations(self):
        prompt = self._create_prompt()
        response = model.generate_content(prompt)
        return self._parse_response(response.text)

    def _parse_response(self, text):
        transformations = []
        for line in text.split('\n'):
            if line.startswith('lambda'):
                try:
                    ast.parse(line)
                    transformations.append(line.strip())
                except SyntaxError:
                    continue
        return transformations[:3]

    def evaluate_transformation(self, transformation):
        try:
            func = eval(transformation)

            # Apply transformation
            X_train_trans = func(self.X_train.copy())
            X_val_trans = func(self.X_val.copy())

            # Encode categorical features
            for col in X_train_trans.select_dtypes(include=['object']).columns:
                X_train_trans[col], uniques = pd.factorize(X_train_trans[col])
                X_val_trans[col] = X_val_trans[col].map({val: idx for idx, val in enumerate(uniques)}).fillna(-1)

            # Train model - using Classifier instead of Regressor
            model = RandomForestClassifier(n_estimators=50, random_state=42)
            model.fit(X_train_trans, self.y_train)
            preds = model.predict(X_val_trans)

            # Calculate accuracy
            score = self.metric(self.y_val, preds)

            return {
                'score': score,
                'transformation': transformation,
                'features': list(X_train_trans.columns)
            }

        except Exception as e:
            print(f"Error in transformation: {e}")
            return None

    def run(self):
        for iteration in range(self.max_iter):
            print(f"\n--- Iteration {iteration+1}/{self.max_iter} ---")

            # Generate transformations
            transformations = self.generate_transformations()
            print(f"Generated {len(transformations)} transformations")

            # Evaluate transformations
            results = [self.evaluate_transformation(t) for t in transformations]

            # Update memory - now looking for higher scores
            for result in results:
                if result and result['score'] > self.best_score:
                    self.best_score = result['score']
                    self.best_transformation = result['transformation']
                    self.memory.append(result)
                    func = eval(self.best_transformation)
                    self.X_train = func(self.X_train.copy())
                    self.X_val = func(self.X_val.copy())

            print(f"Current Best Accuracy: {self.best_score:.6f}")

        return self.best_transformation, self.best_score

In [None]:
import pandas as pd
import os

# Assuming csv_files is defined somewhere in your code
for csv_file in csv_files:
    try:
        print(f"\n📂 Processing file: {csv_file}")
        data = pd.read_csv(csv_file)

        print("Columns in the dataset:", list(data.columns))
        target_col = input("👉 Please enter the name of the target column: ").strip()

        if target_col not in data.columns:
            print(f"❌ Column '{target_col}' not found in dataset. Skipping.")
            continue

        print(f"🎯 Using target column: {target_col}")

        # Run the LLMFE pipeline for classification
        fe = LLMFE(data=data, target_column=target_col, max_iter=3)
        best_trans, best_score = fe.run()

        print(f"\n✅ Done: {os.path.basename(csv_file)}")
        print(f"🔍 Best Transformation: {best_trans}")
        print(f"📈 Validation Accuracy: {best_score:.4f}")  # Changed from MSE to Accuracy
    except Exception as e:
        print(f"❌ Error processing {csv_file}: {e}")


📂 Processing file: /content/drive/My Drive/datasets/Classification/balance-scale.csv
Columns in the dataset: ['Class', 'L-Weight', 'L-Distance', 'R-Weight', 'R-Distance']
👉 Please enter the name of the target column: Class
🎯 Using target column: Class

--- Iteration 1/3 ---
Generated 3 transformations
Current Best Accuracy: 1.000000

--- Iteration 2/3 ---
Generated 3 transformations
Current Best Accuracy: 1.000000

--- Iteration 3/3 ---
Generated 3 transformations
Current Best Accuracy: 1.000000

✅ Done: balance-scale.csv
🔍 Best Transformation: lambda df: df.assign(Leverage_Diff=df['L-Weight'] * df['L-Distance'] - df['R-Weight'] * df['R-Distance']) # Captures the difference in leverage/torque between left and right sides.
📈 Validation Accuracy: 1.0000

📂 Processing file: /content/drive/My Drive/datasets/Classification/Contraceptive_Method_Classification.csv
Columns in the dataset: ['Wife Age', 'Wife Education', 'Husband Education', 'Children', 'Wife religion', 'Wife working', 'Husband



❌ Error processing /content/drive/My Drive/datasets/Classification/Contraceptive_Method_Classification.csv: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.

📂 Processing file: /content/drive/My Drive/datasets/Classification/eucalyptus_dataset.csv
Columns in the dataset: ['Abbrev', 'Rep', 'Locality', 'Map_Ref', 'Latitude', 'Altitude', 'Rainfall', 'Frosts', 'Year', 'Sp', 'PMCno', 'DBH', 'Ht', 'Surv', 'Vig', 'Ins_res', 'Stem_Fm', 'Crown_Fm', 'Brnch_Fm', 'Utility']
👉 Please enter the name of the target column: Utility
🎯 Using target column: Utility

--- Iteration 1/3 ---




❌ Error processing /content/drive/My Drive/datasets/Classification/eucalyptus_dataset.csv: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.

📂 Processing file: /content/drive/My Drive/datasets/Classification/heart .csv
Columns in the dataset: ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope', 'HeartDisease']
👉 Please enter the name of the target column: HeartDisease
🎯 Using target column: HeartDisease

--- Iteration 1/3 ---




❌ Error processing /content/drive/My Drive/datasets/Classification/heart .csv: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.

📂 Processing file: /content/drive/My Drive/datasets/Classification/breast-wisconsin.csv
Columns in the dataset: ['Clump_Thickness', 'Cell_Size_Uniformity', 'Cell_Shape_Uniformity', 'Marginal_Adhesion', 'Single_Epi_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses', 'Class']
👉 Please enter the name of the target column: Class
🎯 Using target column: Class

--- Iteration 1/3 ---
Generated 3 transformations
Current Best Accuracy: 0.971429

--- Iteration 2/3 ---
Generated 3 transformations
Current Best Accuracy: 0.978571

--- Iteration 3/3 ---
Generated 3 transformations
Current Best



❌ Error processing /content/drive/My Drive/datasets/Classification/blood-transfusion-sc.csv: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.

📂 Processing file: /content/drive/My Drive/datasets/Classification/pc1.csv
Columns in the dataset: ['loc', 'v(g)', 'ev(g)', 'iv(G)', 'N', 'V', 'L', 'D', 'I', 'E', 'B', 'T', 'lOCode', 'lOComment', 'locCodeAndComment', 'lOBlank', 'uniq_Op', 'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount', 'defects']
👉 Please enter the name of the target column: defects
🎯 Using target column: defects

--- Iteration 1/3 ---
Generated 3 transformations
Current Best Accuracy: 0.923423

--- Iteration 2/3 ---
Generated 3 transformations
Current Best Accuracy: 0.923423

--- Iteration 3/3 ---
Generated 3 trans

  arr = np.asarray(values, dtype=dtype)


Error in transformation: Input X contains infinity or a value too large for dtype('float32').
Current Best Accuracy: 0.923423

✅ Done: pc1.csv
🔍 Best Transformation: lambda df: df.assign(complexity_volume = df['v(g)'] * df['loc'])
📈 Validation Accuracy: 0.9234

📂 Processing file: /content/drive/My Drive/datasets/Classification/tic-tac-toe.csv
Columns in the dataset: ['top-left-square', 'top-middle-square', 'top-right-square', 'middle-left-square', 'middle-middle-square', 'middle-right-square', 'bottom-left-square', 'bottom-middle-square', 'bottom-right-square', 'Class']
👉 Please enter the name of the target column: Class
🎯 Using target column: Class

--- Iteration 1/3 ---




❌ Error processing /content/drive/My Drive/datasets/Classification/tic-tac-toe.csv: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.

📂 Processing file: /content/drive/My Drive/datasets/Classification/vehicle_c.csv
Columns in the dataset: ['COMPACTNESS', 'CIRCULARITY', 'DISTANCE_CIRCULARITY', 'RADIUS_RATIO', 'PR.AXIS_ASPECT_RATIO', 'MAX.LENGTH_ASPECT_RATIO', 'SCATTER_RATIO', 'ELONGATEDNESS', 'PR.AXIS_RECTANGULARITY', 'MAX.LENGTH_RECTANGULARITY', 'SCALED_VARIANCE_MAJOR', 'SCALED_VARIANCE_MINOR', 'SCALED_RADIUS_OF_GYRATION', 'SKEWNESS_ABOUT_MAJOR', 'SKEWNESS_ABOUT_MINOR', 'KURTOSIS_ABOUT_MAJOR', 'KURTOSIS_ABOUT_MINOR', 'HOLLOWS_RATIO', 'Class']
👉 Please enter the name of the target column: Class
🎯 Using target column: Class

--- I



❌ Error processing /content/drive/My Drive/datasets/Classification/vehicle_c.csv: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.

📂 Processing file: /content/drive/My Drive/datasets/Classification/carPr.csv
Columns in the dataset: ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
👉 Please enter the name of the target column: class
🎯 Using target column: class

--- Iteration 1/3 ---




❌ Error processing /content/drive/My Drive/datasets/Classification/carPr.csv: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.

📂 Processing file: /content/drive/My Drive/datasets/Classification/credit_g.csv
Columns in the dataset: ['checking_status', 'duration', 'credit_history', 'purpose', 'credit_amount', 'savings_status', 'employment', 'installment_commitment', 'personal_status', 'other_parties', 'residence_since', 'property_magnitude', 'age', 'other_payment_plans', 'housing', 'existing_credits', 'job', 'num_dependents', 'own_telephone', 'foreign_worker', 'class']
👉 Please enter the name of the target column: class
🎯 Using target column: class

--- Iteration 1/3 ---




❌ Error processing /content/drive/My Drive/datasets/Classification/credit_g.csv: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
