<h1 style="font-family: 'Poppins', sans-serif; color: #393E41; font-size: 36px; font-weight: 600; text-align: center;">Medical Costs Predictions</h1>

In [88]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file under the Kaggle input directory and print its full path.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style="font-family: 'Poppins', sans-serif; color: #393E41; font-size: 20px; font-weight: 600; text-align: left; background-color: #E5F2C9; padding: 15px; border-radius: 5px;">Exploratory Data Analysis</h1>

In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [90]:
# Load the insurance dataset from its Kaggle input location into a DataFrame.
INSURANCE_CSV = '/kaggle/input/insurance/insurance.csv'
df = pd.read_csv(INSURANCE_CSV)

In [91]:
# Preview the first five rows to check that the file was parsed as expected.
df.head(n=5)

In [92]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the dataset.
summary_stats = df.describe()
summary_stats

In [93]:
# Number of missing values in each column.
df.isna().sum()

<h1 style="font-family: 'Poppins', sans-serif; color: #393E41; font-size: 20px; font-weight: 600; text-align: left; background-color: #E5F2C9; padding: 15px; border-radius: 5px;">Data Visualizations</h1>

In [94]:
# Count plots for the discrete features, laid out on a 2x2 grid.
columns = ['sex', 'children', 'smoker', 'region']

fig = plt.figure(figsize=(12, 10))
sns.set(font_scale=1.2)
sns.set_style('ticks')

# Subplot positions are 1-based, so start the enumeration at 1.
for position, feature in enumerate(columns, start=1):
    ax = fig.add_subplot(2, 2, position)
    sns.countplot(data=df, x=feature, palette='mako', ax=ax)

# Remove the top and right spines from every axes in the figure.
sns.despine(fig=fig)

In [95]:
# Mean charges per category for sex (left panel) and smoker status (right panel).
fig = plt.figure(figsize=(16, 7))
title_font = {'fontsize': 20, 'fontweight': 'bold'}
panels = [('sex', "Sex vs Charges"), ('smoker', "Smoker vs Charges")]

for position, (column, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    ax.set_title(title, fontdict=title_font)
    sns.barplot(x=column, y='charges', data=df, palette="mako", ax=ax)
    sns.despine()

In [96]:
# Scatter plots of charges (x-axis) against age and against BMI (y-axis).
fig = plt.figure(figsize=(16, 5))
title_font = {'fontsize': 20, 'fontweight': 'bold'}
panels = [('age', "Age vs Charges"), ('bmi', "BMI vs Charges")]

for position, (column, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    ax.set_title(title, fontdict=title_font)
    sns.scatterplot(x='charges', y=column, data=df, color='#388697', ax=ax)

# A single call at the end strips the top/right spines from both panels.
sns.despine(fig=fig)

In [97]:
# BMI against charges, coloured by sex (left panel) and by smoker status (right panel).
# Each panel names its hue variable in the title; previously both panels were
# titled "BMI vs Charges" and could not be told apart by their headings.
fig = plt.figure(figsize=(16, 5))
title_font = {'fontsize': 20, 'fontweight': 'bold'}
panels = [('sex', "BMI vs Charges by Sex"), ('smoker', "BMI vs Charges by Smoker")]

for position, (hue_column, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    ax.set_title(title, fontdict=title_font)
    sns.scatterplot(x='charges', y='bmi', data=df, hue=hue_column, palette="magma", ax=ax)
    sns.despine()

In [98]:
# Mean charges by number of children (left panel) and by region (right panel).
fig = plt.figure(figsize=(16, 7))
title_font = {'fontsize': 20, 'fontweight': 'bold'}
panels = [('children', "Number of Children vs Charges"), ('region', "Region vs Charges")]

for position, (column, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    ax.set_title(title, fontdict=title_font)
    sns.barplot(x=column, y='charges', data=df, palette="magma", ax=ax)
    sns.despine()

<h1 style="font-family: 'Poppins', sans-serif; color: #393E41; font-size: 20px; font-weight: 600; text-align: left; background-color: #E5F2C9; padding: 15px; border-radius: 5px;">Data Preparation and Models</h1>

In [99]:
from sklearn.preprocessing import LabelEncoder
# Replace each text-valued column with integer codes.
# A fresh encoder is fitted per column; the fitted encoders are not kept.
non_num = ['sex', 'smoker', 'region']

for feature in non_num:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[feature])
    df[feature] = encoded_values

In [100]:
from sklearn.model_selection import train_test_split
# Separate the target from the predictors, then hold out 30% of the rows for testing.
# The fixed random_state keeps the split reproducible across runs.
y = df['charges']
X = df.drop(columns='charges')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

for label, split in (("Shape of training set:", X_train), ("Shape of test set:", X_test)):
    print(label, split.shape)

In [101]:
from sklearn.preprocessing import StandardScaler
# Standardise the features to zero mean and unit variance.
# The scaler is fitted on the training split only and then reused, unchanged,
# on the test split. Re-fitting on the test split (fit_transform) would scale the
# two splits with different statistics and leak test data into preprocessing.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

<h1 style="font-family: 'Poppins', sans-serif; color: #393E41; font-size: 20px; font-weight: 600; text-align: left; ">Linear Regression</h1>

In [102]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split and predict the held-out rows.
# fit() returns the estimator itself, so construction and fitting can be chained.
lin_model = LinearRegression().fit(X_train, y_train)
prediction1 = lin_model.predict(X_test)

In [103]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
# Test-set error metrics for the linear model, displayed as a one-column table.
MAE_li_reg = metrics.mean_absolute_error(y_test, prediction1)
MSE_li_reg = metrics.mean_squared_error(y_test, prediction1)
RMSE_li_reg = np.sqrt(MSE_li_reg)  # RMSE is the square root of the MSE

li_reg_metrics = pd.Series(
    {
        'MAE_li_reg': MAE_li_reg,
        'MSE_li_reg': MSE_li_reg,
        'RMSE_li_reg': RMSE_li_reg,
    },
    name='Metrics',
)
li_reg_metrics.to_frame()

In [104]:
# 5-fold cross-validated RMSE of the linear model on the training split.
# With no `scoring` argument, cross_val_score uses the estimator's own score,
# which is R^2 for a regressor; the square root of R^2 is not an error metric
# and is NaN for any fold with a negative score. Request the negated MSE
# explicitly and flip its sign before taking the root to obtain per-fold RMSE.
scores = cross_val_score(lin_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(np.sqrt(-scores))

In [105]:
# Coefficient of determination (R^2) of the linear model on the test split.
test_predictions = lin_model.predict(X_test)
r2_score(y_test, test_predictions)