In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [62]:
data=pd.read_csv('insurance.csv')

In [63]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [64]:
data.shape

(1338, 7)

In [65]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [66]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


In [67]:
import pandas_profiling
pandas_profiling.ProfileReport(data, title="Pandas Profiling Report", explorative=True)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [68]:
#missing value
data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [69]:
#duplicate data
data.duplicated().sum()

1

In [70]:
for col in data.columns.tolist():
    print('_________________________')
    print('column:',col)
    print(data[col].unique())
    print('_________________________')

_________________________
column: age
[19 18 28 33 32 31 46 37 60 25 62 23 56 27 52 30 34 59 63 55 22 26 35 24
 41 38 36 21 48 40 58 53 43 64 20 61 44 57 29 45 54 49 47 51 42 50 39]
_________________________
_________________________
column: sex
['female' 'male']
_________________________
_________________________
column: bmi
[27.9   33.77  33.    22.705 28.88  25.74  33.44  27.74  29.83  25.84
 26.22  26.29  34.4   39.82  42.13  24.6   30.78  23.845 40.3   35.3
 36.005 32.4   34.1   31.92  28.025 27.72  23.085 32.775 17.385 36.3
 35.6   26.315 28.6   28.31  36.4   20.425 32.965 20.8   36.67  39.9
 26.6   36.63  21.78  30.8   37.05  37.3   38.665 34.77  24.53  35.2
 35.625 33.63  28.    34.43  28.69  36.955 31.825 31.68  22.88  37.335
 27.36  33.66  24.7   25.935 22.42  28.9   39.1   36.19  23.98  24.75
 28.5   28.1   32.01  27.4   34.01  29.59  35.53  39.805 26.885 38.285
 37.62  41.23  34.8   22.895 31.16  27.2   26.98  39.49  24.795 31.3
 38.28  19.95  19.3   31.6   25.46  30.115 29

In [71]:
numeric_col=data.select_dtypes(exclude='object') #numeric columns
category_col=data.select_dtypes(include='object')#categorical comulns

In [72]:
#outliers
for col in numeric_col:
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    upper = q3 + 1.5 * iqr
    lower = q1 - 1.5 * iqr
    count_outlier = ((data[col] < lower) | (data[col] > upper)).sum()
    total = data.shape[0]
    percentage = count_outlier * 100 / total
    print(f'Outliers in {col} is: {int(percentage)}%')

Outliers in age is: 0%
Outliers in bmi is: 0%
Outliers in children is: 0%
Outliers in charges is: 10%


In [73]:
#encoding data
from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()

for col in category_col:
    data[col] = lb.fit_transform(data[col])

In [74]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [75]:
#split data
X=data.drop('charges',axis=1)
y=data['charges']

In [76]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # or MinMaxScaler

# Perform log transformation on the target variable
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# Initialize the scaler
scaler = StandardScaler()  # or MinMaxScaler()

# Fit and transform the scaler on the training data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a dictionary to store results
results = {
    'Model': [],
    'MAE': [],
    'MSE': [],
    'RMSE': [],
    'R2': []
}

# List of regression models to compare
models = [
    ('Linear Regression', LinearRegression()),
    ('Decision Tree Regression', DecisionTreeRegressor(random_state=42)),
    ('Random Forest Regression', RandomForestRegressor(random_state=42))
]

# Loop through models, train, evaluate, and store results
for model_name, model in models:
    # Train the model on scaled data and log-transformed target
    model.fit(X_train_scaled, y_train_log)
    
    # Make predictions on the test data
    y_pred_log = model.predict(X_test_scaled)
    
    # Inverse log transformation to get back the original scale
    y_pred = np.expm1(y_pred_log)
    
    # Calculate evaluation metrics on the original scale
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    # Store results in the dictionary
    results['Model'].append(model_name)
    results['MAE'].append(mae)
    results['MSE'].append(mse)
    results['RMSE'].append(rmse)
    results['R2'].append(r2)

# Create a DataFrame from the results dictionary
results_df = pd.DataFrame(results)

# Display the results
print(results_df)

            Model                MAE           MSE         RMSE         R2   
0         Linear Regression  3909.135871  6.163912e+07  7851.058562  0.602965
1  Decision Tree Regression  2512.916689  3.266052e+07  5714.938304  0.789625
2  Random Forest Regression  2036.911688  1.916606e+07  4377.906003  0.876546


In [77]:
#best model
model=RandomForestRegressor(random_state=42)
model.fit(X_train_scaled, y_train_log)

# Make predictions on the test data
y_pred_log = model.predict(X_test_scaled)

# Inverse log transformation to get back the original scale
y_pred = np.expm1(y_pred_log)

# Calculate evaluation metrics on the original scale
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(r2)

0.8765461025862535


In [78]:
import pickle
file_name='insurance.sav'
pickle.dump(model,open(file_name,'wb'))