In [None]:
# Importing the dependencies
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

In [None]:
# Read the insurance CSV file into a pandas DataFrame
dataset = pd.read_csv(filepath_or_buffer='../input/insurance/insurance.csv')

In [None]:
# Preview the five leading rows of the raw dataset
dataset.head(n=5)

In [None]:
# Preview the five trailing rows of the raw dataset
dataset.tail(n=5)

In [None]:
# Number of rows and columns in the dataset, as a (rows, columns) tuple
dataset.shape

In [None]:
# Getting summary information about the dataset (printed by DataFrame.info)
dataset.info()

In [None]:
# Count the missing entries in every column
dataset.isna().sum()

In [None]:
# Statistical measures of the dataset (DataFrame.describe with its default settings)
dataset.describe()

In [None]:
# Distribution of age values.
# `sns.distplot` is deprecated in seaborn; `histplot` with `kde=True` and
# `stat='density'` draws the equivalent density histogram plus KDE curve.
fig, ax = plt.subplots()
sns.histplot(data=dataset, x='age', kde=True, stat='density', ax=ax)
# Labels are applied after plotting so seaborn's own axis labels cannot replace them.
ax.set(title='Age distribution',
       xlabel='Age',
       ylabel='Density',  # the y axis is a probability density, not a percentage
       )
plt.show()

In [None]:
# Gender column: number of records per value of `sex`.
fig, ax = plt.subplots()
# Draw on an explicit Axes instead of relying on pyplot's "current axes" state.
sns.countplot(x='sex', data=dataset, ax=ax)
# Labels are applied after plotting: depending on the seaborn version, the
# plotting call writes its own axis labels and would replace earlier ones.
ax.set(title='Gender distribution',
       xlabel='Gender',
       ylabel='Gender counts',
       )
plt.show()

In [None]:
# Region column: number of records per value of `region`.
fig, ax = plt.subplots()
# Draw on an explicit Axes instead of relying on pyplot's "current axes" state.
sns.countplot(x='region', data=dataset, ax=ax)
# Labels are applied after plotting: depending on the seaborn version, the
# plotting call writes its own axis labels and would replace earlier ones.
ax.set(title='Region distribution',
       xlabel='Region',
       ylabel='Region counts',
       )
plt.show()

In [None]:
# Distribution of BMI values.
# `sns.distplot` is deprecated in seaborn; `histplot` with `kde=True` and
# `stat='density'` draws the equivalent density histogram plus KDE curve.
fig, ax = plt.subplots()
sns.histplot(data=dataset, x='bmi', kde=True, stat='density', ax=ax)
# Labels are applied after plotting so seaborn's own axis labels cannot replace them.
ax.set(title='BMI distribution',
       xlabel='BMI',
       ylabel='Density',  # the y axis is a probability density, not a percentage
       )
plt.show()

In [None]:
# Children column: number of records per value of `children`.
fig, ax = plt.subplots()
# Draw on an explicit Axes instead of relying on pyplot's "current axes" state.
sns.countplot(x='children', data=dataset, ax=ax)
# Labels are applied after plotting: depending on the seaborn version, the
# plotting call writes its own axis labels and would replace earlier ones.
ax.set(title='Children distribution',
       xlabel='Children',
       ylabel='Children counts',
       )
plt.show()

In [None]:
# Number of records for each distinct value of the children column
dataset['children'].value_counts()

In [None]:
# Smoker column: number of records per value of `smoker`.
fig, ax = plt.subplots()
# Draw on an explicit Axes instead of relying on pyplot's "current axes" state.
sns.countplot(x='smoker', data=dataset, ax=ax)
# Labels are applied after plotting: depending on the seaborn version, the
# plotting call writes its own axis labels and would replace earlier ones.
ax.set(title='Smoker distribution',
       xlabel='Smoker',
       ylabel='Smoker counts',
       )
plt.show()

# Number of records per smoker value (displayed as the cell's result)
dataset['smoker'].value_counts()

In [None]:
# Encoding the categorical (object-dtype) features as integer codes
le = LabelEncoder()

# `.copy()` makes `obj_dat` independent of `dataset`, so writing the encoded
# columns below never targets a view of the raw data and cannot raise
# SettingWithCopyWarning.
obj_dat = dataset.select_dtypes(include='object').copy()
num_dat = dataset.select_dtypes(exclude='object')

# Original labels per column, in code order: code k <-> label_classes[col][k].
label_classes = {}
for col in obj_dat.columns:
    obj_dat[col] = le.fit_transform(obj_dat[col])
    # `le` is re-fitted for every column, so record its classes before the
    # next fit replaces them; otherwise the code -> label mapping is lost.
    label_classes[col] = list(le.classes_)

# Labelling encoder
# le = preprocessing.LabelEncoder()
# for i in range(obj_data.shape[1]):
#     obj_data.iloc[:,i] = le.fit_transform(obj_data.iloc[:,i])
# excludedata = data.select_dtypes(exclude=['object'])
# data = pd.concat([obj_data, excludedata], axis=1)

In [None]:
# Join the encoded categorical columns and the numeric columns side by side
final_dataset = pd.concat(objs=[obj_dat, num_dat], axis='columns')

In [None]:
# Preview the five leading rows of the encoded dataset
final_dataset.head(n=5)

In [None]:
# Visualise the pairwise correlation between the columns of the encoded dataset
corrx = final_dataset.corr()
sns.heatmap(
    data=corrx,
    square=True,
    cmap='plasma',
    annot=True,
)

In [None]:
# Pairwise relationships and per-column distributions of the encoded dataset
sns.pairplot(data=final_dataset)

In [None]:
# Separate the target column ('charges') from the feature columns
y = final_dataset['charges']
x = final_dataset.drop(columns='charges')

In [None]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=2,
)

# Shapes of the full, training and testing feature frames, in that order
for part in (x, x_train, x_test):
    print(part.shape)

In [None]:
# Loading the model (uncomment exactly one alternative to try another regressor)
# model = LinearRegression()
# `random_state` is fixed because scikit-learn permutes the features at every
# split and breaks ties between equally good splits at random; without a seed
# the fitted tree, and therefore the reported scores, can differ between runs.
model = DecisionTreeRegressor(max_depth=2, random_state=2)
# model = Ridge()
# model = Lasso() 
# model = ElasticNet()
# model = SGDRegressor()

In [None]:
# Train the selected model on the training split
model.fit(X=x_train, y=y_train)

In [None]:
# Predict on the rows the model was trained on
train_pred = model.predict(x_train)

# R squared of those predictions against the true training targets
score_train = metrics.r2_score(y_true=y_train, y_pred=train_pred)
print('R squared value : ', score_train)

In [None]:
# Prediction on testing data
test_pred = model.predict(x_test)

# Testing the model accuracy on testing data (the held-out split, not the training rows)
score_test = metrics.r2_score(y_test, test_pred)
print('R squared value : ', score_test)

In [None]:
# Building a predictive system
# One sample, with values listed in the column order of the training features `x`
# (presumably sex, smoker, region, age, bmi, children; confirm against x.columns).
input_data = (0,0,1,37,27.74,3)

# Making input data as numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshaping the input data into a single row (1 sample x n features)
reshaped_data = input_data_as_numpy_array.reshape(1, -1)

# The model was fitted on a DataFrame, so predict on one too: reusing the
# training column names avoids scikit-learn's missing-feature-names warning
# and raises immediately if the number of values does not match the features.
input_frame = pd.DataFrame(reshaped_data, columns=x.columns)

# predicting the input data
prediction = model.predict(input_frame)
print('The insurance cost ', prediction)