#### Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

##### Data Pre-processing


In [None]:
# Read the insurance CSV into a pandas DataFrame
dataset_path = 'data/insurance.csv'
insurance_dataset = pd.read_csv(dataset_path)

# Preview the first five records
insurance_dataset.head(5)

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes
insurance_dataset.info()

##### Data Visualization

In [None]:
# Distribution of the age values.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot with a
# KDE overlay and density normalisation draws the equivalent figure.
plt.figure(figsize=(5,5))
sns.histplot(data=insurance_dataset, x='age', kde=True, stat='density')
plt.title('Age Distribution')

In [None]:
# Number of records per gender, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=insurance_dataset, x='sex', hue='sex', ax=ax)
ax.set_title('Gender Distribution')

In [None]:
# Number of records for each distinct value of the 'sex' column
insurance_dataset['sex'].value_counts()

In [None]:
# Number of records per smoker status.
# A title is set so the figure stands alone, matching the age and gender plots.
plt.figure(figsize=(5,5))
sns.countplot(data=insurance_dataset, x='smoker', hue='smoker')
plt.title('Smoker Distribution')

In [None]:
# Number of records for each distinct value of the 'smoker' column
insurance_dataset['smoker'].value_counts()

In [None]:
# Distribution of the BMI values.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot with a
# KDE overlay and density normalisation draws the equivalent figure.
plt.figure(figsize=(5,5))
sns.histplot(data=insurance_dataset, x='bmi', kde=True, stat='density')
plt.title('BMI Distribution')

Normal (healthy-weight) BMI range: 18.5 – 24.9 kg/m²

In [None]:
# Number of records per value of the 'children' column.
# A title is set so the figure stands alone, matching the age and gender plots.
plt.figure(figsize=(5,5))
sns.countplot(data=insurance_dataset, x='children', hue='children')
plt.title('Children Distribution')

In [None]:
# Number of records for each distinct value of the 'children' column
insurance_dataset['children'].value_counts()

In [None]:
# Number of records per region (the previous comment was copied from the
# children plot and described the wrong column).
# A title is set so the figure stands alone, matching the age and gender plots.
plt.figure(figsize=(5,5))
sns.countplot(data=insurance_dataset, x='region', hue='region')
plt.title('Region Distribution')

In [None]:
# Number of records for each distinct value of the 'region' column
insurance_dataset['region'].value_counts()

In [None]:
# Distribution of the charges (target) column.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot with a
# KDE overlay and density normalisation draws the equivalent figure.
plt.figure(figsize=(5,5))
sns.histplot(data=insurance_dataset, x='charges', kde=True, stat='density')
plt.title('Charges Distribution')

LABEL ENCODING 

Encoding the categorical features (sex, smoker and region)

In [None]:
# Integer codes for the categorical columns (sex, smoker, region).
# NOTE(review): these are arbitrary label codes; the 0-3 ordering of `region` carries
# no real meaning, so one-hot encoding may suit a linear model better - confirm intent.
category_codes = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'yes': 0, 'no': 1},
    'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3},
}

# One non-inplace replace covers all three columns. Re-running the cell is safe:
# already-encoded values are not keys of the mapping, so they are left untouched.
# The explicit astype fixes the resulting dtype instead of relying on the deprecated
# silent downcasting inside replace(), and raises if an unmapped category is left over.
insurance_dataset = insurance_dataset.replace(category_codes).astype(
    {column: 'int64' for column in category_codes}
)

In [None]:
# Separate the predictors (x) from the target column (y)
target_column = 'charges'
y = insurance_dataset[target_column]
x = insurance_dataset.drop(columns=[target_column])

#### Model training and Evaluation

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Create the linear regression estimator and fit it on the training split
# (fit returns the estimator itself, so construction and fitting can be chained)
model = LinearRegression().fit(x_train, y_train)
model

In [None]:
# Model evaluation on the training data
train_data_predction = model.predict(x_train)

# R-squared score. r2_score expects (y_true, y_pred) and is NOT symmetric, so the
# ground-truth values must come first; the arguments were previously swapped.
error_score = metrics.r2_score(y_train, train_data_predction)
# R-squared is a goodness-of-fit score (higher is better), not an error, and the
# label now says which split it refers to.
print("R squared score on training data : ", error_score)


In [None]:
# Actual vs predicted values on the training data
plt.figure(figsize=(3,3))
plt.scatter(y_train, train_data_predction)
# Label both axes so the reader can tell which one holds the ground truth
plt.xlabel('Actual prices')
plt.ylabel('Predicted prices')
plt.title('Actual prices vs predicted prices')

In [None]:
# Model evaluation on the held-out test data
test_data_predction = model.predict(x_test)

# R-squared score. r2_score expects (y_true, y_pred) and is NOT symmetric, so the
# ground-truth values must come first; the arguments were previously swapped.
error_score = metrics.r2_score(y_test, test_data_predction)
# R-squared is a goodness-of-fit score (higher is better), not an error, and the
# label now says which split it refers to.
print("R squared score on test data : ", error_score)


In [None]:
# Actual vs predicted values on the test data
plt.figure(figsize=(3,3))
plt.scatter(y_test, test_data_predction)
# Label both axes so the reader can tell which one holds the ground truth
plt.xlabel('Actual prices')
plt.ylabel('Predicted prices')
plt.title('Actual prices vs predicted prices')

Building a predictive system

In [None]:
# One already-encoded sample. The values must follow the column order of x_train -
# presumably (age, sex, bmi, children, smoker, region); verify against x_train.columns.
input_data = (19,1,27.900,0,0,1)

# Wrap the sample in a one-row DataFrame carrying the training column names. The model
# was fitted on a DataFrame, and predicting on a bare ndarray makes scikit-learn warn
# that X does not have valid feature names. A length mismatch raises here.
input_frame = pd.DataFrame([input_data], columns=x_train.columns)

prediction = model.predict(input_frame)
# predict returns a length-1 array; print the scalar instead of the array repr
print('The insurance cost is USD', prediction[0])

