In [1]:
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.model_selection import cross_val_score

In [2]:
# Load the data from the csv file
df = pd.read_csv('insurance.csv')

# Split the data into features (X) and target (y)
X = df.drop(columns='charges')
y = df['charges']

# Change strings to numbers as follows
# Sex: male -> 1, female -> 2
# Smoker: yes -> 1, no -> 0
# Region: northeast -> 1, northwest -> 2, southeast -> 3, southwest -> 4
# NOTE(review): region is a nominal category; the 1-4 codes impose an ordering
# on it for the linear model. One-hot encoding would avoid that, but it changes
# the feature columns that later cells rely on, so it is left as-is here.
sex_mapping = {'male': 1, 'female': 2}
smoker_mapping = {'yes': 1, 'no': 0}
region_mapping = {'northeast': 1, 'northwest': 2, 'southeast': 3, 'southwest': 4}
column_mappings = {'sex': sex_mapping, 'smoker': smoker_mapping, 'region': region_mapping}

# Fail early, with a readable message, on any label the mappings do not cover.
# Series.map would otherwise turn such labels into NaN without notice.
for column, mapping in column_mappings.items():
    unknown_labels = set(X[column].dropna().unique()) - set(mapping)
    if unknown_labels:
        raise ValueError(
            "Unexpected values in column %r: %s" % (column, sorted(map(str, unknown_labels)))
        )

# Encode with Series.map rather than DataFrame.replace: replacing strings with
# integers in object columns relies on silent downcasting, which pandas 2.2+
# deprecates (FutureWarning). map yields integer columns directly.
X = X.assign(**{column: X[column].map(mapping) for column, mapping in column_mappings.items()})

In [3]:
# Build the linear regression and train it on every row of X / y.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)

# Assemble a single hand-written record, keyed by feature name, and score it
pred_inst = pd.DataFrame({
    'age': [30],
    'sex': [1],
    'bmi': [25],
    'children': [2],
    'smoker': [0],
    'region': [3],
})
predictions = model.predict(pred_inst)
print(predictions)

[4337.79140492]


In [4]:
# Evaluate the model with 10-fold cross-validation.
# One score is produced per fold; no `scoring` is passed, so the
# estimator's default scorer is used.
scores = cross_val_score(estimator=model, X=X, y=y, cv=10)
print(scores)

[0.78522549 0.73213924 0.73490618 0.67241306 0.7716613  0.78358383
 0.79318159 0.67054582 0.74216232 0.76222839]


In [5]:
# The scores come from cross_val_score on a LinearRegression with no `scoring`
# argument, so each value is the regressor's default score: the coefficient of
# determination (R^2), not a classification accuracy.
print("%0.2f mean R^2 with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.74 accuracy with a standard deviation of 0.04
