In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
df = pd.read_csv('../datasets/insurance.csv')

# Prepare the data
categorical_features = ['sex', 'smoker', 'region']
numerical_features = ['age', 'bmi', 'children']
target = 'charges'

# Encode categorical features
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_categorical_features = encoder.fit_transform(df[categorical_features])
encoded_categorical_feature_names = encoder.get_feature_names(categorical_features)
df_encoded = pd.concat([df[numerical_features], pd.DataFrame(encoded_categorical_features, columns=encoded_categorical_feature_names)], axis=1)

# Scale numerical features
scaler = StandardScaler()
df_encoded[numerical_features] = scaler.fit_transform(df_encoded[numerical_features])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df_encoded.drop(target, axis=1), df_encoded[target], test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean squared error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

# Use the model to make predictions
new_data = pd.DataFrame({
    'age': [40, 25],
    'sex': ['male', 'female'],
    'bmi': [30.0, 22.0],
    'children': [1, 0],
    'smoker': ['yes', 'no'],
    'region': ['northwest', 'southeast']
})

# Encode categorical features of new data
encoded_categorical_features = encoder.transform(new_data[categorical_features])
encoded_categorical_feature_names = encoder.get_feature_names(categorical_features)
new_data_encoded = pd.concat([new_data[numerical_features], pd.DataFrame(encoded_categorical_features, columns=encoded_categorical_feature_names)], axis=1)

# Scale numerical features of new data
new_data_encoded[numerical_features] = scaler.transform(new_data_encoded[numerical_features])

# Make predictions on new data
predictions = model.predict(new_data_encoded)
print(f"Predictions: {predictions}")

# balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
print(f"Balanced accuracy score: {balanced_accuracy_score(y_test, y_pred):.2f}")




KeyError: "['charges'] not found in axis"