In [1]:
# Peek at the dry-bean CSV: list its column headers, then preview the top rows.
import pandas as pd

data = pd.read_csv('./autograder/source/drybean.csv')

print("Column names:", list(data.columns))
# One call emits the blank line, the heading and the preview on separate lines.
print("\nFirst few rows:", data.head(), sep="\n")

Column names: ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4', 'Class']

First few rows:
    Area  Perimeter  MajorAxisLength  MinorAxisLength  AspectRation  \
0  28395    610.291       208.178117       173.888747      1.197191   
1  28734    638.018       200.524796       182.734419      1.097356   
2  29380    624.110       212.826130       175.931143      1.209713   
3  30008    645.884       210.557999       182.516516      1.153638   
4  30140    620.134       201.847882       190.279279      1.060798   

   Eccentricity  ConvexArea  EquivDiameter    Extent  Solidity  roundness  \
0      0.549812       28715     190.141097  0.763923  0.988856   0.958027   
1      0.411785       29172     191.272751  0.783968  0.984986   0.887034   
2      0.562727       29690     193.410904  0.778113  0.989559   

In [2]:
import openai
from dotenv import load_dotenv
import os

# Pull settings from a local .env file into the process environment, then hand
# the OpenAI key to the client library (None when the variable is not set).
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [3]:
def call_llm(prompt, model="gpt-4o-mini", system_prompt="You are a python coding assistant. "):
    """Send one prompt to the OpenAI chat-completions endpoint and return the reply text.

    Parameters
    ----------
    prompt : str
        The request, sent as the ``user`` message.
    model : str, optional
        Model name passed to the API. Defaults to ``"gpt-4o-mini"``.
    system_prompt : str, optional
        Instruction sent as the ``system`` message ahead of ``prompt``.

    Returns
    -------
    The ``content`` field of the first choice's message in the response.
    """
    response = openai.chat.completions.create(
        model=model,
        messages=[
            # The assistant instruction belongs in the "system" role; sending
            # it as a second "user" turn makes it part of the user's request.
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
    )
    return response.choices[0].message.content

In [22]:
# Read prompt from the text file.
# The encoding is pinned to UTF-8 so the decoded prompt does not depend on the
# platform's default text encoding.
with open('hw2_q2_prompt.txt', 'r', encoding='utf-8') as file:
    prompt = file.read()

In [28]:
# Send `prompt` to the LLM helper and print the text it returns.
generated_code = call_llm(prompt)
print(generated_code)

```python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin

# Load the data
data = pd.read_csv('/autograder/source/housing.csv')

# Split the data
X = data.drop('median_house_value', axis=1)
y = data['median_house_value']

X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.25, random_state=42)

class NearestAnchorDistance(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.anchors = np.array([[37.38, -122.21], [33.99, -118.50

In [10]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin

class NearestAnchorDistance(BaseEstimator, TransformerMixin):
    """Map each two-column coordinate row to its distance from the closest anchor.

    Three fixed anchor points are stored on the instance. The distance is the
    plain Euclidean norm of the coordinate difference.
    """

    def __init__(self):
        # One row per anchor point.
        self.anchors = np.array([
            [37.38, -122.21],
            [33.99, -118.50],
            [32.82, -117.31],
        ])

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return an (n_rows, 1) array with each row's smallest anchor distance."""
        # Unwrap objects that expose `.values`; anything else is used as given.
        coords = getattr(X, 'values', X)
        # (n, 1, 2) minus (3, 2) broadcasts to per-row, per-anchor offsets.
        offsets = coords[:, np.newaxis, :] - self.anchors
        per_anchor = np.linalg.norm(offsets, axis=2)
        return per_anchor.min(axis=1).reshape(-1, 1)

# Read the housing table and separate the regression target from the features
data = pd.read_csv('./autograder/source/housing.csv')
y = data['median_house_value']
X = data.drop(columns='median_house_value')

# Two-stage split: hold out 20% for test, then 25% of the remainder for
# validation, which leaves 60% / 20% / 20% overall
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

# One preprocessing branch per column group.
# latitude / longitude: nearest-anchor distance, then standardize
lat_lon_transformer = Pipeline([
    ('nearest_anchor', NearestAnchorDistance()),
    ('scaler', StandardScaler()),
])

# total_rooms: KNN imputation, log1p, then standardize
total_rooms_transformer = Pipeline([
    ('imputer', KNNImputer()),
    ('log', FunctionTransformer(np.log1p, validate=True)),
    ('scaler', StandardScaler()),
])

# housing_median_age / median_income: KNN imputation, then standardize
numeric_transformer = Pipeline([
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler()),
])

# ocean_proximity: one-hot encode, ignoring categories unseen during fit
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its branch; unlisted columns are dropped
preprocessor = ColumnTransformer(
    [
        ('lat_lon', lat_lon_transformer, ['latitude', 'longitude']),
        ('total_rooms', total_rooms_transformer, ['total_rooms']),
        ('numeric', numeric_transformer, ['housing_median_age', 'median_income']),
        ('cat', categorical_transformer, ['ocean_proximity']),
    ],
    remainder='drop',
)

# Preprocessing followed by a 5-nearest-neighbours regressor
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=5)),
])
model.fit(X_train, y_train)

# Mean absolute error on each split
mae_train = mean_absolute_error(y_train, model.predict(X_train))
mae_valid = mean_absolute_error(y_valid, model.predict(X_valid))
mae_test = mean_absolute_error(y_test, model.predict(X_test))

results = dict(
    train=round(mae_train, 2),
    valid=round(mae_valid, 2),
    test=round(mae_test, 2),
)
print(results)


{'train': 38028.78, 'valid': 45945.26, 'test': 46600.01}


In [13]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin

# Read the housing CSV
data = pd.read_csv('./autograder/source/housing.csv')

# Features / target, then a 60/20/20 train/valid/test partition
# (20% test first, then 25% of the remaining 80% for validation)
X = data.drop(columns='median_house_value')
y = data['median_house_value']
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

class NearestAnchorDistance(BaseEstimator, TransformerMixin):
    """Stateless transformer: distance from each location to its nearest anchor.

    Each input row is a (latitude, longitude) pair. The output is one column
    holding the Euclidean distance, in the input's coordinate units, to the
    closest of the three anchor points stored in ``self.anchors``.
    """

    def __init__(self):
        self.anchors = np.array([[37.38, -122.21], [33.99, -118.50], [32.82, -117.31]])

    def fit(self, X, y=None):
        """No fitting is required; returns ``self``."""
        return self

    def transform(self, X):
        """Compute the nearest-anchor distance for every row of ``X``.

        Parameters
        ----------
        X : DataFrame with ``latitude`` and ``longitude`` columns, or a 2-D
            array-like whose two columns are latitude and longitude in that order.

        Returns
        -------
        numpy.ndarray of shape (n_rows, 1).
        """
        if hasattr(X, 'columns'):
            # Frame input: select by name so column order in X does not matter.
            coords = X[['latitude', 'longitude']].values
        else:
            # Plain array input (e.g. the output of an upstream transformer).
            coords = np.asarray(X)
        # (n, 1, 2) - (3, 2) broadcasts to (n, 3, 2) per-anchor offsets.
        distances = np.sqrt(((coords[:, np.newaxis] - self.anchors) ** 2).sum(axis=2))
        return np.min(distances, axis=1).reshape(-1, 1)

# Column-wise preprocessing; columns not listed here are dropped
preprocessor = ColumnTransformer(
    [
        # coordinates: nearest-anchor distance, then standardize
        ('anchors',
         Pipeline(steps=[('nearest_anchor', NearestAnchorDistance()), ('scaler', StandardScaler())]),
         ['latitude', 'longitude']),
        # total_rooms: impute, log1p, then standardize
        ('rooms',
         Pipeline(steps=[('imputer', KNNImputer()), ('log1p', FunctionTransformer(np.log1p)), ('scaler', StandardScaler())]),
         ['total_rooms']),
        # age and income: impute, then standardize
        ('age_income',
         Pipeline(steps=[('imputer', KNNImputer()), ('scaler', StandardScaler())]),
         ['housing_median_age', 'median_income']),
        # categorical proximity flag: one-hot, unseen categories ignored
        ('ocean_proximity', OneHotEncoder(handle_unknown='ignore'), ['ocean_proximity']),
    ],
    remainder='drop',
)

# Preprocessing feeding a 5-neighbour KNN regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=5)),
])
pipeline.fit(X_train, y_train)

# Mean absolute error per split, rounded to two decimals for reporting
mae_train = mean_absolute_error(y_train, pipeline.predict(X_train))
mae_valid = mean_absolute_error(y_valid, pipeline.predict(X_valid))
mae_test = mean_absolute_error(y_test, pipeline.predict(X_test))

mae_results = dict(
    train=round(mae_train, 2),
    valid=round(mae_valid, 2),
    test=round(mae_test, 2),
)
print(mae_results)


{'train': 38028.78, 'valid': 45945.26, 'test': 46600.01}


In [15]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin

# Read the housing CSV
data = pd.read_csv('./autograder/source/housing.csv')

# Partition whole rows first (20% test, then 25% of the rest for validation),
# and only afterwards peel the target column off each partition
train_val, test = train_test_split(data, test_size=0.2, random_state=42)
train, valid = train_test_split(train_val, test_size=0.25, random_state=42)

X_train, y_train = train.drop(columns='median_house_value'), train['median_house_value']
X_valid, y_valid = valid.drop(columns='median_house_value'), valid['median_house_value']
X_test, y_test = test.drop(columns='median_house_value'), test['median_house_value']

class NearestAnchorDistance(BaseEstimator, TransformerMixin):
    """Replace (latitude, longitude) with the distance to the nearest anchor point.

    The three anchors are fixed at construction. Distance is the Euclidean norm
    of the coordinate difference, so it is expressed in the input's own units.
    """

    def __init__(self):
        self.anchors = np.array([(37.38, -122.21), (33.99, -118.50), (32.82, -117.31)])

    def fit(self, X, y=None):
        """Stateless: nothing to learn, returns ``self``."""
        return self

    def transform(self, X):
        """Return an (n_rows, 1) array of nearest-anchor distances.

        ``X`` may be a DataFrame containing ``latitude`` and ``longitude``
        columns, or a 2-D array-like with those two columns in that order.
        """
        if hasattr(X, 'columns'):
            # Frame input: pick the coordinate columns by name.
            lat_lon = X[['latitude', 'longitude']].values
        else:
            # Array input has no column labels; use it positionally.
            lat_lon = np.asarray(X)
        # Broadcasting (n, 1, 2) against (3, 2) gives one offset per row and anchor.
        distances = np.linalg.norm(lat_lon[:, None] - self.anchors, axis=2)
        return np.min(distances, axis=1).reshape(-1, 1)

# Per-column preprocessing; any column not named below is discarded
preprocessor = ColumnTransformer(
    [
        (
            'anchors',
            Pipeline(steps=[
                ('nearest_anchor', NearestAnchorDistance()),
                ('scaler', StandardScaler()),
            ]),
            ['latitude', 'longitude'],
        ),
        (
            'rooms',
            Pipeline(steps=[
                ('imputer', KNNImputer()),
                ('log_transform', FunctionTransformer(np.log1p)),
                ('scaler', StandardScaler()),
            ]),
            ['total_rooms'],
        ),
        (
            'age_income',
            Pipeline(steps=[
                ('imputer', KNNImputer()),
                ('scaler', StandardScaler()),
            ]),
            ['housing_median_age', 'median_income'],
        ),
        ('ocean_proximity', OneHotEncoder(handle_unknown='ignore'), ['ocean_proximity']),
    ],
    remainder='drop',
)

# Full model: preprocessing, then KNN regression with k = 5
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=5)),
])
pipeline.fit(X_train, y_train)

# Score each split with mean absolute error
mae_train = mean_absolute_error(y_train, pipeline.predict(X_train))
mae_valid = mean_absolute_error(y_valid, pipeline.predict(X_valid))
mae_test = mean_absolute_error(y_test, pipeline.predict(X_test))

results = dict(
    train=round(mae_train, 2),
    valid=round(mae_valid, 2),
    test=round(mae_test, 2),
)
print(results)


{'train': 38028.78, 'valid': 45945.26, 'test': 46600.01}


In [19]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin

class NearestAnchorDistance(BaseEstimator, TransformerMixin):
    """Turn two-column coordinates into one column: distance to the closest anchor.

    Distances are Euclidean norms against the three points in ``self.anchors``.
    """

    def __init__(self):
        self.anchors = np.array([
            [37.38, -122.21],
            [33.99, -118.50],
            [32.82, -117.31],
        ])

    def fit(self, X, y=None):
        """No state to fit; hands back the same instance."""
        return self

    def transform(self, X):
        """Return an (n_rows, 1) array of each row's minimum anchor distance."""
        # ndarrays are used directly; other inputs are unwrapped through `.values`.
        if isinstance(X, np.ndarray):
            coords = X
        else:
            coords = X.values
        # Insert an anchor axis so the subtraction broadcasts to (n, 3, 2).
        gaps = np.linalg.norm(coords[:, None] - self.anchors, axis=2)
        return np.min(gaps, axis=1).reshape(-1, 1)

# Load the table and split features from the target
data = pd.read_csv('./autograder/source/housing.csv')
y = data['median_house_value']
X = data.drop(columns='median_house_value')

# 20% test, then 25% of the remaining rows for validation
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

# Column-wise preprocessing; unlisted columns are dropped
preprocessor = ColumnTransformer(
    [
        (
            'anchors',
            Pipeline(steps=[
                ('nearest_anchor', NearestAnchorDistance()),
                ('scaler', StandardScaler()),
            ]),
            ['latitude', 'longitude'],
        ),
        (
            'total_rooms',
            Pipeline(steps=[
                ('imputer', KNNImputer()),
                ('log1p', FunctionTransformer(np.log1p)),
                ('scaler', StandardScaler()),
            ]),
            ['total_rooms'],
        ),
        (
            'age_income',
            Pipeline(steps=[
                ('imputer', KNNImputer()),
                ('scaler', StandardScaler()),
            ]),
            ['housing_median_age', 'median_income'],
        ),
        ('ocean_proximity', OneHotEncoder(handle_unknown='ignore'), ['ocean_proximity']),
    ],
    remainder='drop',
)

# Preprocess, then regress with 5 nearest neighbours
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=5)),
])
pipeline.fit(X_train, y_train)

# Mean absolute error for every split
mae_train = mean_absolute_error(y_train, pipeline.predict(X_train))
mae_valid = mean_absolute_error(y_valid, pipeline.predict(X_valid))
mae_test = mean_absolute_error(y_test, pipeline.predict(X_test))

mae_results = dict(
    train=round(mae_train, 2),
    valid=round(mae_valid, 2),
    test=round(mae_test, 2),
)
print(mae_results)


{'train': 38028.78, 'valid': 45945.26, 'test': 46600.01}


In [24]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin

# Read the housing CSV
data = pd.read_csv('./autograder/source/housing.csv')

# Target column vs. everything else
y = data['median_house_value']
X = data.drop('median_house_value', axis=1)

# Hold out 20% for test, then 25% of what is left for validation
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

class NearestAnchorDistance(BaseEstimator, TransformerMixin):
    """Compute, for each row, the Euclidean distance to the nearest stored anchor.

    Only the first two input columns are used as coordinates.
    """

    def __init__(self):
        self.anchors = np.array([
            [37.38, -122.21],
            [33.99, -118.50],
            [32.82, -117.31],
        ])

    def fit(self, X, y=None):
        """Stateless transformer; returns itself."""
        return self

    def transform(self, X):
        """Return an (n_rows, 1) array of minimum anchor distances."""
        # Non-ndarray inputs are unwrapped through `.values`.
        if isinstance(X, np.ndarray):
            coords = X
        else:
            coords = X.values
        # Keep the leading two columns, then add an anchor axis for broadcasting.
        first_two = coords[:, :2]
        offsets = first_two[:, np.newaxis] - self.anchors
        per_anchor = np.linalg.norm(offsets, axis=2)
        return per_anchor.min(axis=1).reshape(-1, 1)

# Preprocessing by column group; columns not named here are dropped
preprocessor = ColumnTransformer(
    [
        (
            'anchors',
            Pipeline(steps=[
                ('nearest_anchor', NearestAnchorDistance()),
                ('scaler', StandardScaler()),
            ]),
            ['latitude', 'longitude'],
        ),
        (
            'rooms',
            Pipeline(steps=[
                ('imputer', KNNImputer()),
                ('log1p', FunctionTransformer(np.log1p)),
                ('scaler', StandardScaler()),
            ]),
            ['total_rooms'],
        ),
        (
            'age_income',
            Pipeline(steps=[
                ('imputer', KNNImputer()),
                ('scaler', StandardScaler()),
            ]),
            ['housing_median_age', 'median_income'],
        ),
        ('ocean', OneHotEncoder(handle_unknown='ignore'), ['ocean_proximity']),
    ],
    remainder='drop',
)

# Preprocessing followed by 5-neighbour KNN regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=5)),
])
pipeline.fit(X_train, y_train)

# Mean absolute error on train / validation / test
mae_train = mean_absolute_error(y_train, pipeline.predict(X_train))
mae_valid = mean_absolute_error(y_valid, pipeline.predict(X_valid))
mae_test = mean_absolute_error(y_test, pipeline.predict(X_test))

mae_results = dict(
    train=round(mae_train, 2),
    valid=round(mae_valid, 2),
    test=round(mae_test, 2),
)
print(mae_results)



{'train': 38028.78, 'valid': 45945.26, 'test': 46600.01}


In [26]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin

# Read the housing CSV
data = pd.read_csv('./autograder/source/housing.csv')

# Separate predictors from the target, then carve out test (20%) and
# validation (25% of the remainder) sets
X = data.drop('median_house_value', axis=1)
y = data['median_house_value']
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42
)

class NearestAnchorDistance(BaseEstimator, TransformerMixin):
    """Distance from each row's first two columns to the closest of three anchors."""

    def __init__(self):
        self.anchors = np.array([
            [37.38, -122.21],
            [33.99, -118.50],
            [32.82, -117.31],
        ])

    def fit(self, X, y=None):
        """Nothing to estimate; returns the instance."""
        return self

    def transform(self, X):
        """Return an (n_rows, 1) array of Euclidean distances to the nearest anchor."""
        # Use ndarrays directly; unwrap anything else through `.values`.
        coords = X if isinstance(X, np.ndarray) else X.values
        # Slice to two columns and add the anchor axis in a single indexing step.
        offsets = coords[:, np.newaxis, :2] - self.anchors[np.newaxis]
        nearest = np.linalg.norm(offsets, axis=2).min(axis=1)
        return nearest.reshape(-1, 1)

# Column-group preprocessing; everything not listed is dropped
preprocessor = ColumnTransformer(
    [
        # coordinates: nearest-anchor distance, then standardize
        ('anchors',
         Pipeline([('nearest_anchor', NearestAnchorDistance()), ('scaler', StandardScaler())]),
         ['latitude', 'longitude']),
        # total_rooms: impute, log1p, then standardize
        ('rooms',
         Pipeline([('imputer', KNNImputer()), ('log', FunctionTransformer(np.log1p)), ('scaler', StandardScaler())]),
         ['total_rooms']),
        # age and income: impute, then standardize
        ('age_income',
         Pipeline([('imputer', KNNImputer()), ('scaler', StandardScaler())]),
         ['housing_median_age', 'median_income']),
        # ocean_proximity: one-hot, unseen categories ignored
        ('ocean_proximity', OneHotEncoder(handle_unknown='ignore'), ['ocean_proximity']),
    ],
    remainder='drop',
)

# Preprocess, then predict with a 5-neighbour KNN regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=5)),
])
pipeline.fit(X_train, y_train)

# Mean absolute error per split
mae_train = mean_absolute_error(y_train, pipeline.predict(X_train))
mae_valid = mean_absolute_error(y_valid, pipeline.predict(X_valid))
mae_test = mean_absolute_error(y_test, pipeline.predict(X_test))

mae_results = dict(
    train=round(mae_train, 2),
    valid=round(mae_valid, 2),
    test=round(mae_test, 2),
)
print(mae_results)



{'train': 38028.78, 'valid': 45945.26, 'test': 46600.01}


In [29]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin

# Read the housing CSV
data = pd.read_csv('./autograder/source/housing.csv')

# Predictors and target
y = data['median_house_value']
X = data.drop(columns='median_house_value')

# 20% held out for test; 25% of the remaining rows become the validation set
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid, y_train_valid, test_size=0.25, random_state=42
)

class NearestAnchorDistance(BaseEstimator, TransformerMixin):
    """Collapse coordinate pairs into a single nearest-anchor distance feature.

    The first two columns of the input are compared against the three points
    in ``self.anchors`` using the Euclidean norm.
    """

    def __init__(self):
        self.anchors = np.array([
            [37.38, -122.21],
            [33.99, -118.50],
            [32.82, -117.31],
        ])

    def fit(self, X, y=None):
        """No parameters are learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return an (n_rows, 1) array holding each row's smallest anchor distance."""
        # ndarray inputs pass through; other inputs are unwrapped via `.values`.
        if isinstance(X, np.ndarray):
            coords = X
        else:
            coords = X.values
        # Explicit axes: (n, 1, 2) rows against (1, 3, 2) anchors.
        leading_pair = coords[:, :2]
        offsets = leading_pair[:, None, :] - self.anchors[None, :, :]
        per_anchor = np.linalg.norm(offsets, axis=2)
        return per_anchor.min(axis=1).reshape(-1, 1)

# Column-group preprocessing; unlisted columns are dropped
preprocessor = ColumnTransformer(
    [
        (
            'anchors',
            Pipeline([
                ('nearest_anchor', NearestAnchorDistance()),
                ('scaler', StandardScaler()),
            ]),
            ['latitude', 'longitude'],
        ),
        (
            'rooms',
            Pipeline([
                ('imputer', KNNImputer()),
                ('log_transform', FunctionTransformer(np.log1p)),
                ('scaler', StandardScaler()),
            ]),
            ['total_rooms'],
        ),
        (
            'age_income',
            Pipeline([
                ('imputer', KNNImputer()),
                ('scaler', StandardScaler()),
            ]),
            ['housing_median_age', 'median_income'],
        ),
        ('ocean_proximity', OneHotEncoder(handle_unknown='ignore'), ['ocean_proximity']),
    ],
    remainder='drop',
)

# Preprocessing chained into a 5-neighbour KNN regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=5)),
])
pipeline.fit(X_train, y_train)

# Predictions for every split are kept so they can be inspected afterwards
y_train_pred = pipeline.predict(X_train)
y_valid_pred = pipeline.predict(X_valid)
y_test_pred = pipeline.predict(X_test)

# Mean absolute error per split, rounded to two decimals
mae_scores = dict(
    train=round(mean_absolute_error(y_train, y_train_pred), 2),
    valid=round(mean_absolute_error(y_valid, y_valid_pred), 2),
    test=round(mean_absolute_error(y_test, y_test_pred), 2),
)
print(mae_scores)


{'train': 38028.78, 'valid': 45945.26, 'test': 46600.01}
