In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
df = pd.read_csv('../datasets/insurance.csv')

# Prepare the data
categorical_features = ['sex', 'smoker', 'region']
numerical_features = ['age', 'bmi', 'children']
target = 'charges'


In [4]:
# Encode categorical features
le = LabelEncoder()
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])

# Scale numerical features
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(target, axis=1), df[target], test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [6]:
# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean squared error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")


Mean squared error: 33635210.43
R-squared: 0.78


In [10]:
df['charges']

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [8]:
# balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


ValueError: continuous is not supported

In [7]:
# Use the model to make predictions
new_data = pd.DataFrame({
    'age': [40, 25],
    'sex': ['male', 'female'],
    'bmi': [30.0, 22.0],
    'children': [1, 0],
    'smoker': ['yes', 'no'],
    'region': ['northwest', 'southeast']
})

new_data[categorical_features] = new_data[categorical_features].apply(le.transform)
new_data[numerical_features] = scaler.transform(new_data[numerical_features])

predictions = model.predict(new_data)
print(f"Predictions: {predictions}")

ValueError: y contains previously unseen labels: 'male'