In [1]:
import pandas as pd

df  = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [2]:
df.drop_duplicates(inplace=True)

In [3]:
from sklearn.model_selection import train_test_split

X = df.drop('charges', axis=1)
y = df['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first', sparse_output=False), 
                                      ['sex', 'smoker', 'region']),
                                      ('scaler', MinMaxScaler(), ['age', 'bmi'])], remainder='passthrough')

X_train_prep = ct.fit_transform(X_train)
X_test_prep = ct.transform(X_test)

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

lr = LinearRegression()
scores = cross_val_score(lr, X_train_prep, y_train, cv=5)
scores.mean()

0.725757353610811

In [15]:
# SVR (Kernel, epsilon, C, gamma, degree)

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

svr = SVR()
param_grid = [
    {'kernel': ['linear'], 'C': [0.01, 0.1, 1, 10], 'epsilon': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [ 0.01, 0.1, 1, 10], 'gamma': [0.1, 1, 10]},
    {'kernel': ['poly'], 'C': [0.01, 0.1, 1, 10], 'degree': [2, 3, 4], 'epsilon': [0.1, 1, 10]}
]
gs = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error')
gs.fit(X_train_prep, y_train)
gs.best_params_

{'C': 10, 'degree': 4, 'epsilon': 0.1, 'kernel': 'poly'}

In [16]:
best_svr = gs.best_estimator_
best_svr.score(X_train_prep, y_train)

0.10750368638112451

In [21]:
# Random Forest

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

rf = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [5, 10, 20]}

gs = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error')
gs.fit(X_train_prep, y_train)
gs.best_params_

{'max_depth': 5, 'max_features': None, 'n_estimators': 200}

In [22]:
best_rf = gs.best_estimator_
best_rf.score(X_train_prep, y_train)

0.8793596650416469

In [24]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=42)

param_grid = {
    'max_depth': [5, 10, 20],
    'max_features': [None, 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
}

gs = GridSearchCV(dt, param_grid, cv=5, scoring='neg_mean_squared_error')
gs.fit(X_train_prep, y_train)
gs.best_params_


{'max_depth': 5, 'max_features': None, 'min_samples_split': 10}

In [25]:
best_dt = gs.best_estimator_
best_dt.score(X_train_prep, y_train)

0.8668692913700565

# Final Pipeline

In [26]:
ct

In [27]:
best_rf

In [29]:
from sklearn.pipeline import Pipeline

rf_pipeline = Pipeline(steps=[('preprocessor', ct),
                            ('random_forest', best_rf)])
rf_pipeline.fit(X_train, y_train)

In [31]:
print('Final Score on Training Set:', rf_pipeline.score(X_train, y_train))
print('Final Score on Test Set:', rf_pipeline.score(X_test, y_test))

Final Score on Training Set: 0.8793596650416469
Final Score on Test Set: 0.9014489806269765


In [32]:
import joblib

joblib.dump(rf_pipeline, 'rf_model.joblib')

['rf_model.joblib']