## import libraries

In [74]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

from sklearn.base import BaseEstimator, RegressorMixin

In [75]:
# Load the insurance dataset; the path is relative to the kernel's working directory.
df = pd.read_csv('insurance.csv')

### *some insights into and details of the dataset*

In [76]:
# Preview the first rows to see the column names and value formats.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [77]:
# (rows, columns) of the raw frame.
df.shape

(1338, 7)

### *we must check whether there are any missing values or duplicated records*

In [78]:
# Per-column count of missing values.
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [79]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

1

In [80]:
# Remove the repeated rows, keeping one copy of each. The index is not reset here,
# so row labels are no longer guaranteed to be contiguous afterwards.
df = df.drop_duplicates()

## *encode the Ordinal and Nominal  columns*
---
- Ordinal columns (here the binary `smoker` flag) via label encoding

In [81]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder assigns integer codes to the sorted unique values, so for the
# yes/no values shown in df.head() this gives 'no' -> 0 and 'yes' -> 1.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; for an input
# feature, OrdinalEncoder or an explicit .map({'no': 0, 'yes': 1}) is the usual choice.
encoder = LabelEncoder()
df['smoker'] = encoder.fit_transform(df['smoker'])

- Nominal columns (`sex`, `region`) via one-hot encoding

In [82]:
# One-hot encode the nominal columns. dtype=int pins the dummies to 0/1 integers:
# pandas >= 2.0 defaults to bool, and a frame that mixes bool and float columns
# converts to an object-dtype array when it is handed to NumPy, which breaks or
# slows the float arithmetic in the hand-written gradient descent further down.
encoded = pd.get_dummies(df[['sex', 'region']], dtype=int)
# Splice the dummy columns in just before the last column (charges) so the target
# stays last, then drop the original string columns.
df = pd.concat([df.iloc[:, :-1], encoded, df.iloc[:, -1]], axis=1)
df = df.drop(columns=['sex', 'region'])

-  In practice, we work with three separate sets of data:
     - Training set, 
     - Validation set, 
     - Test set 
 ---
 
- The Validation and Test sets are called hold-out sets
---
- There’s no optimal proportion to split the dataset into these three subsets. 
     - In the past: 70/15/15
     - With big datasets: 95/2.5/2.5
---
- We use the validation set to 
     - Choose the learning algorithm
     - find the best values of hyper-parameters



___

### Here, because our model is fixed, we don't need a validation set

>First, because of the distribution of `charges`, we apply the log function to this column. Since the log transform does not depend on any characteristic of the training set, we can apply it before splitting the dataset into training and test sets.

> Second, we split the data into training and test sets only and fit the linear regression model. We then use scikit-learn's cross-validation function and pass X and y to it; it splits the data into k folds, training on k-1 folds and using the remaining fold as the validation (here: test) set, rotating through all folds.



In [83]:
# Log-transform the target. The transform has no fitted parameters, so it is applied
# before the split; every metric computed later is therefore on the log(charges) scale.
# NOTE: this overwrites df['charges'], so re-running the cell takes the log a second time.
df['charges'] = np.log(df['charges'])

In [84]:
# Separate the features from the (log-transformed) target.
X = df.drop('charges', axis=1)
y = df['charges']

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=42)

## *get profile report of training set and preprocess the data*
---

Before delving into modeling, it is essential to normalize/standardize:
  - The Training Set
  - Test Set with Respect to Training Set Parameters

In [85]:
# Re-attach the target column to the training features so the profile report covers both.
target_frame = y_train.to_frame('charges')
training_set = pd.concat([X_train, target_frame], axis=1)

In [86]:
from ydata_profiling import ProfileReport

# Profile the training split only (training_set holds X_train plus y_train).
# The report is written as an HTML file relative to the current working directory.
profile = ProfileReport(training_set, title='insurance report')
profile.to_file('insurance report.html')

Summarize dataset: 100%|████████████████████████████████████████████████████| 36/36 [00:01<00:00, 19.61it/s, Completed]
Generate report structure: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.51s/it]
Render HTML: 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.55it/s]
Export report to file: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 335.04it/s]


- Based on the distribution of `bmi`, we apply the standard scaler to that feature, while `age` and `children` are scaled with the min-max scaler


In [87]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Min-max scale age and children: fit on the training split only, then reuse the
# fitted parameters on the test split.
mm_scaler = MinMaxScaler()
X_train[['age','children']] = mm_scaler.fit_transform(X_train[['age','children']])
X_test[['age','children']] = mm_scaler.transform(X_test[['age','children']])


# Standardize bmi with statistics fitted on the training split and apply the same
# fitted scaler to the test split.
std_scaler = StandardScaler()
X_train['bmi'] = std_scaler.fit_transform(X_train[['bmi']])
X_test['bmi'] = std_scaler.transform(X_test[['bmi']])

In [88]:
class LinearRegressionClassifier:
    """Linear regression fitted with full-batch gradient descent.

    Despite the ``Classifier`` suffix (kept so existing cells keep working),
    this is a regressor: it predicts the continuous value ``X @ weights + bias``.

    Parameters
    ----------
    learning_rate : float, default=0.03
        Step size applied to every gradient update.
    n_iterations : int, default=10000
        Number of gradient-descent updates performed by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features,) or None
        Learned coefficients; ``None`` until ``fit`` has been called.
    bias : float or None
        Learned intercept; ``None`` until ``fit`` has been called.
    """

    def __init__(self, learning_rate=0.03, n_iterations=10000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Fit the coefficients by minimizing the mean squared error.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (ndarray, DataFrame or nested list).
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Training targets.

        Returns
        -------
        self
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional, is empty, or its row count does
            not match the length of ``y``.
        """
        # Coerce to plain float arrays: bool/object columns (e.g. one-hot dummies
        # coming from a DataFrame) would otherwise yield object-dtype arithmetic.
        features = np.asarray(X, dtype=float)
        # Flatten so a column-vector target cannot broadcast the error to (n, n).
        targets = np.asarray(y, dtype=float).ravel()

        n_samples, n_features = features.shape
        if n_samples == 0:
            raise ValueError("cannot fit on an empty dataset")
        if targets.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} rows but y has {targets.shape[0]} values"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        # The step size is loop-invariant, so compute it once.
        step = self.learning_rate / n_samples

        # Gradient Descent
        for _ in range(self.n_iterations):
            error = features @ self.weights + self.bias - targets
            self.weights -= step * (features.T @ error)
            self.bias -= step * error.sum()

        return self

    def predict(self, X):
        """Return ``X @ weights + bias`` for the given samples.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.weights is None:
            raise RuntimeError("fit must be called before predict")
        return np.dot(np.asarray(X, dtype=float), self.weights) + self.bias

In [89]:
# Train the gradient-descent model on the scaled training split.
model = LinearRegressionClassifier()
model.fit(X_train, y_train)

# Predict log(charges) for the held-out test rows.
y_pred = model.predict(X_test)

In [90]:
# Both metrics are computed on log(charges), because the target was
# log-transformed before the train/test split.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 0.15828783932410606
R-squared: 0.8294642961668092


### with sklearn linear regression and cross validation

In [91]:
from sklearn.linear_model import LinearRegression
# 10-fold cross-validation on X and y as defined before the split (the scalers above
# were applied to X_train/X_test, not to X). With no `scoring` argument the
# estimator's own score method is used — R^2 for LinearRegression.
scores = cross_val_score(LinearRegression(), X, y, cv=10)

In [92]:
# Average score across the folds.
scores.mean()

0.7607924685763823

### polynomial regression 

In [93]:
class PolynomialRegression(LinearRegressionClassifier):
    """Gradient-descent regression on per-feature polynomial terms.

    Every input column ``x`` is expanded to ``x, x**2, ..., x**degree`` and the
    expanded matrix is handed to the parent linear model. No cross terms
    (``x_i * x_j``) are generated. Powers of a 0/1 indicator column equal the
    column itself, so one-hot features only gain duplicate columns.

    Parameters
    ----------
    degree : int, default=2
        Highest power added per feature. Values below 2 add no columns, so the
        model is then equivalent to the parent linear model.
    learning_rate : float, default=0.03
        Step size forwarded to the parent class.
    n_iterations : int, default=10000
        Number of gradient-descent updates forwarded to the parent class.
    """

    def __init__(self, degree=2, learning_rate=0.03, n_iterations=10000):
        super().__init__(learning_rate, n_iterations)
        self.degree = degree

    def _add_polynomial_features(self, X):
        """Return a new float ndarray ``[X, X**2, ..., X**degree]`` stacked column-wise.

        ``X`` is expected to be two-dimensional array-like
        (ndarray, DataFrame or nested list).
        """
        # Coerce once so lists and mixed-dtype frames work, and so the return
        # type is always an ndarray regardless of ``degree``.
        base = np.asarray(X, dtype=float)
        blocks = [base]
        blocks.extend(base ** power for power in range(2, self.degree + 1))
        # concatenate always allocates, so the caller's data is never aliased.
        return np.concatenate(blocks, axis=1)

    def fit(self, X, y):
        """Expand ``X`` and fit the parent model on it; returns ``self``."""
        super().fit(self._add_polynomial_features(X), y)
        return self

    def predict(self, X):
        """Expand ``X`` the same way as in ``fit`` and predict with the parent model."""
        X_poly = self._add_polynomial_features(X)
        return super().predict(X_poly)

# Create a degree-2 PolynomialRegression and fit it on the scaled training split
poly_model = PolynomialRegression(degree=2)
poly_model.fit(X_train, y_train)

# Make predictions on the held-out test set (no separate validation set is used here)
y_pred_poly = poly_model.predict(X_test)

# Evaluate the polynomial regression model on the test set
mse_poly = mean_squared_error(y_test, y_pred_poly)
r2_poly = r2_score(y_test, y_pred_poly)

print(f'Polynomial Regression Mean Squared Error: {mse_poly}')
print(f'Polynomial Regression R-squared: {r2_poly}')


Polynomial Regression Mean Squared Error: 0.15878588200446794
Polynomial Regression R-squared: 0.828927716355011
