In [17]:
# Check the actual column names in the dataset
import pandas as pd
# Load the raw CSV, then show its header names and a preview of the first rows.
data = pd.read_csv('./autograder/source/drybean.csv')
print("Column names:", list(data.columns))
print("\nFirst few rows:", data.head(), sep="\n")

Column names: ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4', 'Class']

First few rows:
    Area  Perimeter  MajorAxisLength  MinorAxisLength  AspectRation  \
0  28395    610.291       208.178117       173.888747      1.197191   
1  28734    638.018       200.524796       182.734419      1.097356   
2  29380    624.110       212.826130       175.931143      1.209713   
3  30008    645.884       210.557999       182.516516      1.153638   
4  30140    620.134       201.847882       190.279279      1.060798   

   Eccentricity  ConvexArea  EquivDiameter    Extent  Solidity  roundness  \
0      0.549812       28715     190.141097  0.763923  0.988856   0.958027   
1      0.411785       29172     191.272751  0.783968  0.984986   0.887034   
2      0.562727       29690     193.410904  0.778113  0.989559   

In [18]:
import openai
from dotenv import load_dotenv
import os

# Pull the variables defined in the .env file into the process environment.
load_dotenv()

# Hand the key to the OpenAI module; this is None when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [19]:
def call_llm(prompt, model="gpt-4o-mini", system_prompt="You are a python coding assistant."):
    """Send ``prompt`` to an OpenAI chat model and return the text of its reply.

    Args:
        prompt: The user request to send to the model.
        model: Name of the chat model to query. Defaults to "gpt-4o-mini".
        system_prompt: Instruction that frames the assistant's behaviour.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    response = openai.chat.completions.create(
        model=model,
        messages=[
            # The framing instruction belongs in the "system" role; sending it as
            # a second "user" message makes it look like part of the conversation.
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
    )
    return response.choices[0].message.content

In [39]:
# Read the LLM prompt from the text file.
# The encoding is pinned to UTF-8 so the prompt decodes the same way on every
# platform instead of depending on the locale's default encoding.
with open('hw2_q1_prompt.txt', 'r', encoding='utf-8') as file:
    prompt = file.read()

In [40]:
# Send the prompt to the model, keep the reply, and echo it for inspection.
generated_code = call_llm(prompt=prompt)
print(generated_code)

```python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


# Load dataset
data = pd.read_csv('/autograder/source/drybean.csv')

# Fix column name typos
data.rename(columns={'AspectRation': 'AspectRatio', 'roundness': 'Roundness'}, inplace=True)

# Define features and target
X = data.drop(columns='Class')
y = data['Class']

# Train/validation/test split
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Function to log-transform Area
def log_area(X):
    X = X.c

In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import warnings

# Blanket filter: every warning raised from here on is silenced.
warnings.simplefilter('ignore')

# 'Class' is the label column; every other column is kept as a feature.
data = pd.read_csv('./autograder/source/drybean.csv')
y = data['Class']
X = data.drop('Class', axis=1)

# Two-stage stratified split: hold out 20% for testing, then carve 25% of the
# remainder off as validation (60/20/20 overall).
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y,
    test_size=0.2, random_state=42, stratify=y,
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_temp, y_train_temp,
    test_size=0.25, random_state=42, stratify=y_train_temp,
)

def log_area(X):
    """Return a copy of ``X`` with its 'Area' column replaced by log(Area + 1e-6).

    The input frame is not modified; all other columns are carried over as-is.
    """
    transformed = X.copy()
    shifted_area = transformed['Area'] + 1e-6
    transformed['Area'] = np.log(shifted_area)
    return transformed

# Feature preprocessing. Only the seven columns named below are handed to a
# transformer; the column names use the dataset's own spellings
# ('AspectRation', 'roundness').
# NOTE(review): 'Area' is log-transformed but, unlike the other six features,
# never standardized -- confirm this is intended, since KNN and logistic
# regression are sensitive to feature scale.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['AspectRation', 'Eccentricity', 'roundness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3']),
        ('log', FunctionTransformer(log_area, validate=False), ['Area'])
    ]
)

# Candidate classifiers, each paired with the hyperparameter grid to search.
# The 'classifier__' prefix targets the pipeline step named 'classifier'.
models = {
    'DecisionTree': {
        'classifier': DecisionTreeClassifier(random_state=42),
        'params': {
            'classifier__max_depth': [None, 5, 10, 15]
        }
    },
    'KNN': {
        'classifier': KNeighborsClassifier(),
        'params': {
            'classifier__n_neighbors': [3, 5, 7, 9]
        }
    },
    'LogisticRegression': {
        # `multi_class` is intentionally not passed: it is deprecated in recent
        # scikit-learn releases, and with the lbfgs solver a multiclass target is
        # already fitted with the multinomial loss by default.
        'classifier': LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42),
        'params': {
            'classifier__C': [0.1, 1, 10]
        }
    }
}

# Track the grid search with the highest mean cross-validated score so far.
best_accuracy, best_model = 0, None

for name, config in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', config['classifier']),
    ])

    # 5-fold grid search over this candidate's hyperparameters, fitted on the training split.
    grid_search = GridSearchCV(pipeline, param_grid=config['params'], cv=5).fit(X_train, y_train)

    if grid_search.best_score_ > best_accuracy:
        best_accuracy, best_model = grid_search.best_score_, grid_search

# Score the winning search object on each of the three splits.
train_accuracy, valid_accuracy, test_accuracy = (
    best_model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test))
)

print(dict(train=train_accuracy, valid=valid_accuracy, test=test_accuracy))

{'train': 0.9156257653686015, 'valid': 0.9243203526818515, 'test': 0.9151670951156813}


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Read the dry-bean CSV.
data = pd.read_csv('./autograder/source/drybean.csv')

# The 'Class' column is the target; the remaining columns are the features.
y = data['Class']
X = data.drop(columns=['Class'])

# Stratified split in two steps: 60% train, then the held-out 40% is halved
# into validation and test (60/20/20 overall).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.4, random_state=42, stratify=y,
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5, random_state=42, stratify=y_temp,
)

# Preprocessing function for log transformation
def log_area(X):
    """Log-transform the 'Area' column of ``X`` and return the result as a new frame.

    A small offset of 1e-6 is added to 'Area' before the natural logarithm is
    taken. The frame passed in is left unchanged.
    """
    result = X.copy()
    result['Area'] = np.log(result['Area'].add(1e-6))
    return result

# Define the preprocessing applied ahead of every classifier.
# Note: Column names have typos in the dataset - 'AspectRation' not 'AspectRatio', 'roundness' not 'Roundness'
# NOTE(review): 'Area' is log-transformed but, unlike the other six features,
# never standardized -- confirm this is intended, since KNN and logistic
# regression are sensitive to feature scale.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['AspectRation', 'Eccentricity', 'roundness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3']),
        ('log_area', FunctionTransformer(log_area, validate=False), ['Area'])
    ]
)

# Define models and hyperparameter grids.
# The 'classifier__' prefix targets the pipeline step named 'classifier'.
models = {
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {'classifier__max_depth': [None, 5, 10, 15]}
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {'classifier__n_neighbors': [3, 5, 7, 9]}
    },
    'LogisticRegression': {
        # `multi_class` is intentionally not passed: it is deprecated in recent
        # scikit-learn releases, and with the lbfgs solver a multiclass target is
        # already fitted with the multinomial loss by default.
        'model': LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42),
        'params': {'classifier__C': [0.1, 1, 10]}
    }
}

# Model selection: keep the refit pipeline from whichever grid search reaches
# the highest mean cross-validated score on the training split.
best_model, best_accuracy = None, 0

for model_name, config in models.items():
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', config['model']),
    ])
    grid_search = GridSearchCV(pipe, config['params'], cv=5).fit(X_train, y_train)

    if grid_search.best_score_ > best_accuracy:
        best_accuracy, best_model = grid_search.best_score_, grid_search.best_estimator_

# Accuracy of the selected pipeline on the training, validation and test splits.
train_accuracy, valid_accuracy, test_accuracy = (
    accuracy_score(labels, best_model.predict(features))
    for features, labels in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test))
)

# Report the three accuracies as one dict.
print(dict(train=train_accuracy, valid=valid_accuracy, test=test_accuracy))

{'train': 0.9183198628459466, 'valid': 0.9202792064658339, 'test': 0.9078222548659567}


In [32]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import warnings

# Silence all warnings from this point on.
warnings.simplefilter('ignore')

# Features are every column except the 'Class' label.
data = pd.read_csv('./autograder/source/drybean.csv')
y = data['Class']
X = data.drop(columns=['Class'])

# Stratified split in two steps: 20% held out for testing, then the remaining
# 80% is halved into train and validation.
# NOTE(review): this yields a 40/40/20 train/valid/test split -- confirm that
# equal train and validation sizes are intended.
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y,
    test_size=0.2, random_state=42, stratify=y,
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_temp, y_train_temp,
    test_size=0.5, random_state=42, stratify=y_train_temp,
)

def log_area(X):
    """Produce a copy of ``X`` in which 'Area' holds log(Area + 1e-6).

    Only the 'Area' column changes; the original frame is not mutated.
    """
    out = X.copy()
    out['Area'] = out['Area'].add(1e-6).pipe(np.log)
    return out

# Wrap log_area so it can be used as a pipeline step.
log_area_transformer = FunctionTransformer(log_area, validate=False)

# Columns fed to the models, using the dataset's own spellings
# ('AspectRation', 'roundness').
numeric_features = ['AspectRation', 'Eccentricity', 'roundness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'Area']

# The selected columns first pass through log_area (which rewrites 'Area'),
# then all seven are standardized.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[('log_area', log_area_transformer), ('scaler', StandardScaler())]), numeric_features)
    ]
)

# Candidate classifiers as (estimator, hyperparameter grid) pairs.
# The 'classifier__' prefix targets the pipeline step named 'classifier'.
models = {
    'DecisionTree': (DecisionTreeClassifier(random_state=42), {'classifier__max_depth': [None, 5, 10, 15]}),
    'KNN': (KNeighborsClassifier(), {'classifier__n_neighbors': [3, 5, 7, 9]}),
    # `multi_class` is intentionally not passed: it is deprecated in recent
    # scikit-learn releases, and with the lbfgs solver a multiclass target is
    # already fitted with the multinomial loss by default.
    'LogisticRegression': (LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42), {'classifier__C': [0.1, 1, 10]})
}

# Running record of the best cross-validated candidate seen so far.
best_accuracy, best_model, best_params = 0, None, None

for model_name, (model, params) in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])
    # 5-fold grid search scored by accuracy, fitted on the training split.
    grid_search = GridSearchCV(pipeline, params, scoring='accuracy', cv=5).fit(X_train, y_train)

    if grid_search.best_score_ > best_accuracy:
        best_accuracy, best_model, best_params = (
            grid_search.best_score_,
            grid_search.best_estimator_,
            grid_search.best_params_,
        )

# Score the selected pipeline on each split, keyed by split name.
accuracies = {}
for split, (X_split, y_split) in (
    ('train', (X_train, y_train)),
    ('valid', (X_valid, y_valid)),
    ('test', (X_test, y_test)),
):
    accuracies[split] = best_model.score(X_split, y_split)

print(accuracies)



{'train': 0.9226671565025716, 'valid': 0.9208302718589273, 'test': 0.9081894968784429}
