In [9]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_squared_error,
    accuracy_score,
    confusion_matrix,
    roc_auc_score
)

# Location of the actuarial workbook, relative to the notebook's working directory.
# NOTE(review): the spelling "Sythetic" is kept as-is; it presumably matches the file on disk.
file_path = "Actuarial Table Sythetic to discuss Feature And TestTrain Split.xlsx"

# Read the workbook into a DataFrame
df = pd.read_excel(io=file_path)

# Preview the first five rows
df.head(n=5)


Unnamed: 0,PersonID,AgeOfF,AgeOfM,ZipCode,Salary,Smoker,Drinker,Married,Weight,AgeAtDeath,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,1,92,85,154,5416,2,3,0,-1,86.5,,,
1,2,103,40,177,4546,1,2,1,-1,83.5,,,
2,3,48,78,165,1996,3,1,1,1,50.0,,,AgeOf = Age of Parents
3,4,40,47,149,4783,0,3,0,1,16.5,,,Smoker --> No of cig per day
4,5,105,50,131,5450,2,1,1,1,63.5,,,Drinker --> No of drinks


In [10]:
# Drop the annotation columns whose label starts with "Unnamed".
# Labels are cast to str first so that a non-string header (e.g. a numeric
# header cell) cannot raise AttributeError during the prefix check.
is_unnamed = df.columns.astype(str).str.startswith("Unnamed")
df = df.drop(columns=df.columns[is_unnamed])

print(df.columns)
df.head()


Index(['PersonID', 'AgeOfF', 'AgeOfM', 'ZipCode', 'Salary', 'Smoker',
       'Drinker', 'Married', 'Weight', 'AgeAtDeath'],
      dtype='object')


Unnamed: 0,PersonID,AgeOfF,AgeOfM,ZipCode,Salary,Smoker,Drinker,Married,Weight,AgeAtDeath
0,1,92,85,154,5416,2,3,0,-1,86.5
1,2,103,40,177,4546,1,2,1,-1,83.5
2,3,48,78,165,1996,3,1,1,1,50.0
3,4,40,47,149,4783,0,3,0,1,16.5
4,5,105,50,131,5450,2,1,1,1,63.5


In [11]:
# Target is the age at death; features are every remaining column except the ID
y = df["AgeAtDeath"]
X = df.drop(columns=["AgeAtDeath", "PersonID"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the linear model on the training rows (fit() returns the estimator itself)
linreg = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred = linreg.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE derived from the MSE

print(
    "Regression model performance:",
    f"R²:   {r2:.3f}",
    f"MAE:  {mae:.2f} years",
    f"RMSE: {rmse:.2f} years",
    sep="\n",
)

Regression model performance:
R²:   0.945
MAE:  3.83 years
RMSE: 4.66 years


In [12]:
# Hypothetical prospective customer, described with the same feature columns
# (and in the same order) as the training data
new_customer = dict(
    AgeOfF=60,      # age of father when he died
    AgeOfM=70,      # age of mother when she died
    ZipCode=150,    # some area code
    Salary=3000,    # monthly salary in Euro
    Smoker=1,       # cigarettes per day
    Drinker=1,      # drinks per day
    Married=1,      # 1 = married, 0 = not married
    Weight=0,       # -1 underweight, 0 normal, 1 overweight
)

# One-row frame so the fitted regressor receives named feature columns
new_customer_df = pd.DataFrame.from_records([new_customer])

# predict() returns one value per row; unpack the single prediction
(predicted_age,) = linreg.predict(new_customer_df)

print("New customer details:")
print(new_customer_df)
print(f"\nPredicted age at death: {predicted_age:.1f} years")


New customer details:
   AgeOfF  AgeOfM  ZipCode  Salary  Smoker  Drinker  Married  Weight
0      60      70      150    3000       1        1        1       0

Predicted age at death: 67.9 years


I used the provided dataset of 100 deceased customers to build a regression model that predicts the possible age at death (AgeAtDeath) of a new prospective customer. As predictors I used: Age of father at death (AgeOfF), age of mother at death (AgeOfM), area code (ZipCode), monthly salary (Salary), number of cigarettes per day (Smoker), number of drinks per day (Drinker), marital status (Married, 0/1) and weight class (Weight, −1 underweight, 0 normal, 1 overweight).

I removed the ID column and split the data into a training set (80%) and a test set (20%) using train_test_split with random_state=42. I then fitted a multiple linear regression model using scikit-learn’s LinearRegression.

On the test set the model achieved an R² of about 0.94, meaning it explains roughly 94% of the variation in age at death. The mean absolute error is around 3.8 years and the root mean squared error about 4.7 years, so on average the prediction is within about 4 years of the true age at death.

As an example, for a hypothetical customer with parents dying at ages 60 and 70, living in area 150, salary 3000 €, smoking 1 cigarette and drinking 1 drink per day, married and with normal weight, the model predicts an age at death of approximately 67.9 years.

**QUESTION 2:**

In [13]:
# Binary label derived from the regression target:
# 1 when AgeAtDeath is strictly greater than 86, otherwise 0
y_cls = df["AgeAtDeath"].gt(86).astype(int)

# Class balance of the new label
y_cls.value_counts()


Unnamed: 0_level_0,count
AgeAtDeath,Unnamed: 1_level_1
0,82
1,18


In [14]:
# 80/20 split for the classifier, stratified on the label so both sets keep
# the same class proportions. NOTE: this rebinds X_train / X_test.
X_train, X_test, y_train_cls, y_test_cls = train_test_split(
    X,
    y_cls,
    test_size=0.2,
    random_state=42,
    stratify=y_cls,
)

# Fit the logistic regression classifier (fit() returns the estimator itself)
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train_cls)

# Hard labels plus the probability assigned to class 1 (beyond 86)
y_pred_cls = logreg.predict(X_test)
class_probs = logreg.predict_proba(X_test)
y_proba_cls = class_probs[:, 1]

# Test-set metrics
acc = accuracy_score(y_test_cls, y_pred_cls)
cm = confusion_matrix(y_test_cls, y_pred_cls)
auc = roc_auc_score(y_test_cls, y_proba_cls)

print("Classification model performance:")
print(f"Accuracy: {acc:.3f}")
print("Confusion matrix:\n", cm)
print(f"ROC AUC: {auc:.3f}")


Classification model performance:
Accuracy: 0.850
Confusion matrix:
 [[16  0]
 [ 3  1]]
ROC AUC: 0.859


In [15]:
# Score the example customer from the regression section with the classifier
customer_probs = logreg.predict_proba(new_customer_df)
proba_beyond_86 = customer_probs[0][1]  # first (only) row, class-1 column
pred_label = logreg.predict(new_customer_df)[0]

# Translate the predicted label into a yes/no answer
if pred_label == 1:
    answer = "YES"
else:
    answer = "NO"

print("New customer details:")
print(new_customer_df)

print(f"\nPredicted probability of living beyond 86: {proba_beyond_86:.3f}")
print(f"Predicted answer (live beyond 86?): {answer}")


New customer details:
   AgeOfF  AgeOfM  ZipCode  Salary  Smoker  Drinker  Married  Weight
0      60      70      150    3000       1        1        1       0

Predicted probability of living beyond 86: 0.008
Predicted answer (live beyond 86?): NO


In the second part, the company no longer needs the exact age at death. Instead, they only want to know whether a prospective customer will live beyond age 86.

I kept the same predictors (AgeOfF, AgeOfM, ZipCode, Salary, Smoker, Drinker, Married, Weight) and again split the data into training (80%) and test (20%) sets, this time stratifying by the new label because only a minority of customers live beyond 86.

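For this classification problem I used logistic regression (LogisticRegression in scikit-learn, with max_iter=1000). On the test set, the model obtained an accuracy of about 85% and a ROC AUC of about 0.86, indicating good separation between the two classes. The confusion matrix shows that most customers are correctly classified: all 16 test customers who did not live beyond 86 were predicted correctly. However, only 1 of the 4 test customers who actually lived beyond 86 was identified, so the high accuracy mainly reflects the majority class.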
