In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score
from sklearn.preprocessing import LabelEncoder

In [51]:
df = pd.read_excel("./medical-charges-dataset.xlsx")

In [52]:
print("Number of rows", df.shape[0])
print("Number of columns", df.shape[1])
print("Number of duplicate rows", df.duplicated().sum())
print("Number of null rows", df.isna().sum().sum())

Number of rows 1338
Number of columns 7
Number of duplicate rows 1
Number of null rows 0


In [53]:
df.drop_duplicates(inplace=True)

In [54]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,18,male,53.13,0,no,southeast,1163.4627
1,22,male,52.58,1,yes,southeast,44501.3982
2,23,male,50.38,1,no,southeast,2438.0552
3,58,male,49.06,0,no,southeast,11381.3254
4,46,female,48.07,2,no,northeast,9432.9253


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int64  
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
 6   charges   1337 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


In [56]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1337.0,1337.0,1337.0,1337.0
mean,39.222139,30.664637,1.095737,13279.121487
std,14.044333,6.100204,1.205571,12110.359656
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29,0.0,4746.344
50%,39.0,30.4,1.0,9386.1613
75%,51.0,34.7,2.0,16657.71745
max,64.0,53.13,5.0,63770.42801


In [57]:
df = df[["age", "bmi", "children", "charges"]]

In [60]:
X = df.drop("charges", axis=1)
y = df["charges"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [61]:
rndm_cv = RandomizedSearchCV(RandomForestRegressor(), param_distributions={"n_estimators": np.arange(100,500,100),
                                                                            "max_depth":np.arange(2,10,2)},
                                                                              cv=3).fit(X_train, y_train)

print("Best params", rndm_cv.best_params_)
print("Best score", rndm_cv.best_score_)
model = rndm_cv.best_estimator_

Best params {'n_estimators': 100, 'max_depth': 2}
Best score 0.06362926350077976


In [65]:
import joblib

joblib.dump(model, 'random_forest_model.joblib')

['random_forest_model.joblib']