### Create a Linear Regression model with PyTorch Components

1. Data gathering
2. Data preprocessing
3. Feature engineering
4. Model training
5. Testing

In [104]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.10.0+cu126
False


In [105]:
import os
import kagglehub
path = kagglehub.dataset_download("mirichoi0218/insurance")
print("Path to dataset files:", path)

Path to dataset files: /home/thinkpad/.cache/kagglehub/datasets/mirichoi0218/insurance/versions/1


In [106]:
import pandas as pd
print(os.listdir(path))
df=pd.read_csv(os.path.join(path, "insurance.csv"))
print(df.info())

['insurance.csv']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None


In [107]:
print(df.head())

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [108]:
print(df.describe())

               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010


In [109]:
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

In [110]:
# split dataset before encoding
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# encode categorical features
label_encoder = {}
categorical_features = ['sex', 'smoker', 'region']
for feature in categorical_features:
    le = LabelEncoder()
    train_df[feature] = le.fit_transform(train_df[feature])
    test_df[feature] = le.transform(test_df[feature])
    label_encoder[feature] = le

In [111]:
# features and target
X_train = train_df.drop(columns=['charges'])
y_train = train_df['charges']
X_test = test_df.drop(columns=['charges'])
y_test = test_df['charges']

In [112]:
print(X_train.head())
print(y_train.head())

      age  sex    bmi  children  smoker  region
560    46    0  19.95         2       0       1
1285   47    0  24.32         0       0       0
1142   52    0  24.86         0       0       2
969    39    0  34.32         5       0       2
486    54    0  21.47         3       0       1
560      9193.83850
1285     8534.67180
1142    27117.99378
969      8596.82780
486     12475.35130
Name: charges, dtype: float64


In [113]:
# normalize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [114]:
print(X_train)
print(y_train)

[[ 0.47222651 -1.0246016  -1.75652513  0.73433626 -0.50874702 -0.45611589]
 [ 0.54331294 -1.0246016  -1.03308239 -0.91119211 -0.50874702 -1.35325561]
 [ 0.8987451  -1.0246016  -0.94368672 -0.91119211 -0.50874702  0.44102382]
 ...
 [ 1.3252637   0.97598911 -0.89153925 -0.91119211 -0.50874702 -1.35325561]
 [-0.16755139 -1.0246016   2.82086429  0.73433626  1.96561348  1.33816354]
 [ 1.1120044   0.97598911 -0.10932713 -0.91119211 -0.50874702  1.33816354]]
560      9193.83850
1285     8534.67180
1142    27117.99378
969      8596.82780
486     12475.35130
           ...     
1095     4561.18850
1130     8582.30230
1294    11931.12525
860     46113.51100
1126    10214.63600
Name: charges, Length: 1070, dtype: float64


In [115]:
X_train_tensor=torch.tensor(X_train, dtype=torch.float32)
y_train_tensor=torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor=torch.tensor(X_test, dtype=torch.float32)
y_test_tensor=torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [116]:
print(y_train_tensor)
print(y_train_tensor.shape)

tensor([[ 9193.8389],
        [ 8534.6719],
        [27117.9941],
        ...,
        [11931.1250],
        [46113.5117],
        [10214.6357]])
torch.Size([1070, 1])


In [117]:
# define neural network model
class SimpleNNRegression(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        return self.network(x)

In [118]:
print(X_train_tensor.shape)
X_train_tensor.shape[1]

torch.Size([1070, 6])


6

In [119]:
model = SimpleNNRegression(input_size=X_train_tensor.shape[1])
print(model)

SimpleNNRegression(
  (network): Sequential(
    (0): Linear(in_features=6, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)


In [120]:
# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [121]:
# training loop
epochs = 50000
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    predictions = model(X_train_tensor)
    loss = criterion(predictions, y_train_tensor)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 100==0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

Epoch 100/50000, Loss: 320772448.0000
Epoch 200/50000, Loss: 307184928.0000
Epoch 300/50000, Loss: 267343488.0000
Epoch 400/50000, Loss: 199529808.0000
Epoch 500/50000, Loss: 125031224.0000
Epoch 600/50000, Loss: 73518040.0000
Epoch 700/50000, Loss: 50417896.0000
Epoch 800/50000, Loss: 41650476.0000
Epoch 900/50000, Loss: 37707528.0000
Epoch 1000/50000, Loss: 35675728.0000
Epoch 1100/50000, Loss: 34553748.0000
Epoch 1200/50000, Loss: 33862000.0000
Epoch 1300/50000, Loss: 33357552.0000
Epoch 1400/50000, Loss: 32933908.0000
Epoch 1500/50000, Loss: 32532636.0000
Epoch 1600/50000, Loss: 32132098.0000
Epoch 1700/50000, Loss: 31714344.0000
Epoch 1800/50000, Loss: 31264830.0000
Epoch 1900/50000, Loss: 30772780.0000
Epoch 2000/50000, Loss: 30223924.0000
Epoch 2100/50000, Loss: 29600640.0000
Epoch 2200/50000, Loss: 28905346.0000
Epoch 2300/50000, Loss: 28159830.0000
Epoch 2400/50000, Loss: 27382398.0000
Epoch 2500/50000, Loss: 26584640.0000
Epoch 2600/50000, Loss: 25786896.0000
Epoch 2700/50000

In [122]:
# model evaluation
model.eval()
y_pred = model(X_test_tensor).detach().numpy()

In [123]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
y_test_numpy=y_test_tensor.numpy()


# calculate metrics
mse = mean_squared_error(y_test_numpy, y_pred)
rmse=mse**0.5
mae = mean_absolute_error(y_test_numpy, y_pred)
r2 = r2_score(y_test_numpy, y_pred)

print(f"Test MSE: {mse:.4f}")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")
print(f"Test R²: {r2:.4f}")

Test MSE: 47672744.0000
Test RMSE: 6904.5452
Test MAE: 4811.9199
Test R²: 0.6929


In [124]:
def predict_charges(age, sex, bmi, children, smoker, region):
    # create a DataFrame for the input
    input_df = pd.DataFrame({
        'age': [age],
        'sex': [sex],
        'bmi': [bmi],
        'children': [children],
        'smoker': [smoker],
        'region': [region]
    })

    for feature in categorical_features:
        input_df[feature] = label_encoder[feature].transform(input_df[feature])

    input_df = scaler.transform(input_df)
    input_tensor = torch.tensor(input_df, dtype=torch.float32)
    predicted_charge = model(input_tensor).item()
    return predicted_charge

In [130]:
predicted=predict_charges(19, 'male', 27.9, 0, 'yes', 'southwest')
print(f"Predicted insurance charge: ${predicted:.2f}")

Predicted insurance charge: $14531.51
