In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#let's import 
import matplotlib.pyplot as plt
import seaborn as sns

# what to do

**A medical insurance company has released data for almost 1000 customers. Create a model that predicts the yearly medical cover cost.**


In [None]:
# Location of the premium dataset inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/medical-insurance-premium-prediction/Medicalpremium.csv"

# Read the CSV into the DataFrame that the rest of the notebook works on.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

In [None]:
# Print the summary of df produced by pandas' DataFrame.info().
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

**So there are no null values.**

In [None]:
# Show the column labels of df.
df.columns

In [None]:
# Show the descriptive statistics table produced by DataFrame.describe().
df.describe()

# Data Visualization

In [None]:
# Distribution of the 'Age' column, with a kernel-density estimate overlaid (kde=True).
plt.figure(figsize=(10,7))
sns.histplot(df['Age'],kde=True)
# The title names the column that is actually plotted.
plt.title('Histogram of Age')


In [None]:

# Scatter every predictor column against the premium to eyeball the relationships.
predictor_columns = [
    'Age',
    'Diabetes',
    'BloodPressureProblems',
    'AnyTransplants',
    'AnyChronicDiseases',
    'Height',
    'Weight',
    'KnownAllergies',
    'HistoryOfCancerInFamily',
    'NumberOfMajorSurgeries',
]
sns.pairplot(
    df,
    x_vars=predictor_columns,
    y_vars=['PremiumPrice'],
    kind='scatter',
)

Notice that three columns (Age, Height and Weight) appear more related to PremiumPrice than the others.

**let's calculate correlation**

How strongly is each column correlated with the target column, PremiumPrice?


In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df.
plt.figure(figsize=(10, 7))
correlation = df.corr()
sns.heatmap(data=correlation, annot=True, cmap='PuBuGn')
plt.title('correlation matrix')

The heatmap shows that Age is the column most strongly correlated with the target column.

In [None]:
# Separate the target column from the predictors.
target_column = 'PremiumPrice'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Display the feature matrix (df without the 'PremiumPrice' column).
X

In [None]:
# Display the target series (the 'PremiumPrice' column).
y

# train the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
#import libraries
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor


# Creating Pipeline for preprocessing and model.

In [None]:
# Preprocessing + model pipeline: standardise the features, then fit a random forest.
# random_state is fixed so the forest, and every metric derived from it, is
# reproducible across "Restart & Run All"; 42 matches the seed of the train/test split.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(random_state=42)),
])

In [None]:
# Fit the whole pipeline (the 'scaler' step, then the 'model' step) on the training split.
pipeline.fit(X_train,y_train)

Let's predict on the test data.

In [None]:
# Predict with the fitted pipeline on the held-out test features.
y_pred = pipeline.predict(X_test)

# Evaluate the model

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

# Baseline metrics on the held-out test set.
# scikit-learn metrics take their arguments as (y_true, y_pred), in that order.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean absolute error :',mae)
# Report the MSE itself under the MSE label (not r2).
print('Mean squared error :',mse)
print('R^2 :', r2)

# Let's use hyperparameter tuning to improve the model's accuracy

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter search space. Every key starts with 'model__', the name of the
# random-forest step in the pipeline followed by a double underscore.
param_grid = dict([
    # Number of trees in the forest
    ('model__n_estimators', [50, 100, 200]),
    # Maximum depth of the trees
    ('model__max_depth', [None, 10, 20]),
    # Minimum number of samples required to split a node
    ('model__min_samples_split', [2, 5, 10]),
    # Minimum number of samples required at a leaf node
    ('model__min_samples_leaf', [1, 2, 4]),
])

In [None]:
# Grid search over param_grid with 5-fold cross-validation, scored by
# negative mean squared error, using all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search and the parameter combination it uses.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Score the tuned pipeline on the held-out test set.
y_pred_2 = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_2)
mae = mean_absolute_error(y_test, y_pred_2)
r2 = r2_score(y_test, y_pred_2)

# Print each metric, then the winning hyperparameters.
report = (
    ("Best Model Mean Squared Error:", mse),
    ("Mean absolute error :", mae),
    ("r2 :", r2),
    ("Best Model Hyperparameters:", best_params),
)
for label, value in report:
    print(label, value)

# Conclusion

**Overall, the model performs better than the base model after hyperparameter tuning: the mean absolute error is 1014, which is low, and $R^2$ is around 90%, so the model is performing well.**