In [43]:
import os
import numpy as np
import pandas as pd
from scipy.optimize import minimize_scalar

In [44]:
# Location of the raw CSV. Set the LUNG_CANCER_DATA_CSV environment variable to
# point at your own copy of the dataset; the fallback is the original local path.
data_path = os.environ.get(
    'LUNG_CANCER_DATA_CSV',
    '/Users/adityaamehra/Desktop/ML projects/Comprehensive Lung Cancer Dataset/data.csv',
)

# Columns that are not used as model inputs.
unused_columns = [
    'Patient_ID', 'Pack_Years', 'Treatment_Type', 'Comorbidities',
    'Air_Quality_Index', 'Occupation', 'Year_of_Diagnosis',
]

# Load, drop the unused columns, then drop every row with a missing value.
# The trailing .copy() yields an independent frame, so the column assignments
# below never write into an intermediate result.
data = pd.read_csv(data_path).drop(columns=unused_columns).dropna().copy()

# Boolean Columns
bool_columns = [
    'Metastasis_Status', 'Previous_Cancer_Diagnosis', 'Weight_Loss', 'Chronic_Cough',
    'Shortness_of_Breath', 'Chest_Pain_Symptoms', 'Exposure_to_Toxins', 'Family_History_Cancer'
]

# Integer code for every categorical level, keyed by column name.
category_codes = {
    # Demographic / clinical categories
    'Gender': {'Male': 1, 'Female': -1, 'Other': 0},
    'Symptom_Progression': {'Stable': 0, 'Worsening': 1, 'Improving': -1},
    'Medication_Response': {'Good': -1, 'Moderate': 0, 'Poor': 1},
    'Stage_of_Cancer': {'I': 1, 'II': 2, 'III': 3, 'IV': 4},
    # Lifestyle factors
    'Dietary_Habits': {'Good': -1, 'Average': 0, 'Poor': 1},
    'Physical_Activity_Level': {'High': -1, 'Moderate': 0, 'Low': 1},
    'Residential_Area': {'Urban': 1, 'Suburban': 0, 'Rural': -1},
    'Smoking_History': {'Never': -1, 'Former': 0, 'Current': 1},
}
# NOTE(review): boolean flags are coded True -> -1 / False -> 1, the opposite
# sign convention to e.g. 'Poor' -> 1 above. Kept as-is; confirm it is intended.
category_codes.update({flag: {True: -1, False: 1} for flag in bool_columns})

# Encode categorical variables
for col, codes in category_codes.items():
    data[col] = data[col].map(codes)

# Series.map turns any value that is absent from the mapping into NaN. Rows with
# missing values were already dropped above, so a NaN at this point means the
# file contains a category level the tables do not cover. Fail loudly instead
# of letting NaN flow into the feature matrix.
unmapped = [col for col in category_codes if data[col].isna().any()]
if unmapped:
    raise ValueError(f'Unmapped category values found in columns: {unmapped}')

# Rescale numerical columns by a fixed factor of 100 (note: this also rescales
# the target column Tumor_Size_cm, so model outputs are in cm / 100).
num_cols = ['Lung_Function_Test_Result', 'BMI', 'Years_Smoked', 'Age','Tumor_Size_cm']
data[num_cols] = data[num_cols] / 100.0

print(data.head())  # Check final dataset

    Age  Gender  Smoking_History  Years_Smoked  Family_History_Cancer  \
0  0.69       1               -1          0.30                      1   
1  0.32      -1                0          0.06                      1   
2  0.89       1               -1          0.02                     -1   
3  0.78      -1               -1          0.11                      1   
4  0.38       1                0          0.11                      1   

   Exposure_to_Toxins  Residential_Area    BMI  Lung_Function_Test_Result  \
0                   1                 1  0.278                      0.381   
1                   1                 1  0.163                      0.951   
2                  -1                -1  0.181                      0.624   
3                  -1                 1  0.223                      0.629   
4                   1                -1  0.283                      0.706   

   Chest_Pain_Symptoms  ...  Physical_Activity_Level  Dietary_Habits  \
0                    1  ..

In [45]:
# Split the prepared frame into the regression target (scaled tumor size)
# and the matrix of all remaining columns used as features.
y = data['Tumor_Size_cm'].to_numpy()  # Target variable
X = data.drop(columns=['Tumor_Size_cm']).to_numpy()  # Features

In [46]:
# Number of training examples (y is one-dimensional, one entry per row of X).
m = len(y)

In [47]:
def nm(X):
    """Standardize every column of ``X`` to zero mean and unit standard deviation.

    Parameters
    ----------
    X : array_like, shape (n_samples, n_features)
        Raw feature matrix. It is not modified.

    Returns
    -------
    X_norm : ndarray
        ``(X - mu) / sigma`` computed column-wise.
    mu : ndarray
        Per-column mean of ``X``.
    sigma : ndarray
        Per-column population standard deviation of ``X``. A column with zero
        spread reports a sigma of 1 so that it maps to all zeros instead of
        NaN/inf, and so the returned statistics can be reused safely to
        normalize new samples.
    """
    X = np.asarray(X, dtype=float)
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    # Constant columns have sigma == 0; dividing by it would yield NaN (0/0).
    sigma = np.where(sigma == 0, 1.0, sigma)
    X_norm = (X - mu) / sigma
    return X_norm, mu, sigma

In [48]:
# Standardize the features and keep the per-column statistics, which are
# needed again to normalize new samples at prediction time.
X_norm, mu, sigma = nm(X)

for label, values in (('Computed mean:', mu), ('Computed standard deviation:', sigma)):
    print(label, values)

Computed mean: [ 5.34504033e-01 -6.30000000e-04 -9.90466667e-02  2.45064433e-01
  3.99180000e-01  2.01226667e-01  2.00630000e-01  2.80085407e-01
  6.49543343e-01  6.00846667e-01  4.01440000e-01  3.00586667e-01
  6.99200000e-01  3.00143333e-01  2.00780000e-01  8.01793333e-01
  4.97820000e-01  2.45026667e+00  9.50222667e+00  2.44880400e+01
 -3.00716667e-01  2.00346667e-01]
Computed standard deviation: [ 0.20781023  0.97982801  0.83118174  0.14432934  0.91687258  0.9795447
  0.87160633  0.06931683  0.20196888  0.7993643   0.91588532  0.95375451
  0.71492612  0.78053228  0.74825623  0.59760141  0.86728037  1.07169333
  5.76240127 14.43416792  0.7806874   0.74858169]


In [49]:
# Prepend the intercept (bias) column of ones to the standardized features.
# The guard keeps the cell safe to re-run: the bias column is considered
# present only when X is exactly one column wider than X_norm AND its first
# COLUMN is all ones. (Testing the first row, X[0], says nothing about the
# column and could skip this step on the raw, un-normalized matrix.)
if X.shape[1] == X_norm.shape[1] + 1 and np.all(X[:, 0] == 1):
    print('The first column is all ones')
else:
    X = np.concatenate([np.ones((X_norm.shape[0], 1)), X_norm], axis=1)

In [50]:
def cost(X, y, theta):
    """Return the halved mean squared error of the linear model ``X @ theta``.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_params)
        Design matrix.
    y : ndarray, shape (n_samples,)
        Observed targets.
    theta : ndarray, shape (n_params,)
        Model coefficients.

    Returns
    -------
    float
        ``sum((X @ theta - y) ** 2) / (2 * n_samples)``.
    """
    n_samples = y.shape[0]
    residual = np.dot(X, theta) - y
    squared = np.square(residual)
    return (1 / (2 * n_samples)) * squared.sum()

In [51]:
def mgd(X, y, theta, alpha, n):
    """Run ``n`` steps of batch gradient descent for linear regression.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_params)
        Design matrix.
    y : ndarray, shape (n_samples,)
        Observed targets.
    theta : ndarray, shape (n_params,)
        Starting coefficients. A copy is updated; the argument is left intact.
    alpha : float
        Learning rate applied to every step.
    n : int
        Number of update steps.

    Returns
    -------
    weights : ndarray
        Coefficients after the last step.
    J_history : list
        Value of ``cost`` recorded after each step (length ``n``).
    """
    weights = theta.copy()
    scale = 1 / y.shape[0]
    J_history = []
    for _ in range(n):
        residual = np.dot(X, weights) - y
        gradient = scale * np.dot(X.T, residual)
        weights -= alpha * gradient
        # The cost is recorded after the update, so entry k is the cost at step k+1.
        J_history.append(cost(X, y, weights))
    return weights, J_history


In [52]:
def cfa(alpha, X, y, theta, n):
    """Return the final cost reached by ``mgd`` with learning rate ``alpha``.

    Descent always starts from an all-zero coefficient vector sized to
    ``X.shape[1]``; the ``theta`` argument is accepted but its value is not
    used. ``n`` is the number of gradient-descent steps.
    """
    start = np.zeros(X.shape[1])
    _, costs = mgd(X, y, start, alpha, n)
    return costs[-1]

In [54]:
# Flatten the target to one dimension, then show both shapes as a sanity check.
y = y.ravel()
print(y.shape, X.shape, sep='\n')

(300000,)
(300000, 23)


In [55]:
# Number of gradient-descent steps used both for the search and the final fit.
n = 10
# Zero starting point for the final fit below.
theta = np.zeros(X.shape[1])

# Pick the learning rate in (1e-6, 1) that gives the lowest cost after n steps.
res = minimize_scalar(
    lambda step: cfa(step, X, y, theta, n),
    method='bounded',
    bounds=(1e-6, 1),
)
alpha = res.x
print(alpha)

# Fit the model with the selected learning rate.
theta, J_history = mgd(X, y, theta, alpha, n)
print('theta computed from gradient descent: {:s}'.format(str(theta)))

0.8966774004357938
theta computed from gradient descent: [ 7.49139053e-02 -1.83880984e-05  2.61627408e-05  8.77161147e-05
  3.79841272e-05  1.00561704e-04  1.60673759e-04 -5.06247431e-05
  1.15961632e-04  9.34640030e-05 -4.42640826e-05  1.56159974e-04
  4.37457433e-05  1.29121964e-04 -4.85003909e-05  1.65406622e-04
  7.53916430e-05 -2.73774067e-05 -6.17082773e-05  2.20662118e-05
  2.55583692e-05 -7.09309431e-05 -1.63819879e-05]


In [62]:
# Hand-written samples: a leading 1 for the bias term followed by 22 feature values.
# NOTE(review): the feature values must follow the column order of the training
# matrix. Several rows hold -1/0/1 codes in leading positions where the training
# frame shows fractional columns such as Age - confirm the ordering is intended.
samples = np.array([
    [1, -1, 1, 0, 1, -1, -1, -1, -1, 0, -1, -1, 1, -1, -1, 0, 0, 0.85, 0.22, 0.0,1,1,1],  # Sample 1
    [1, -1, 0, -1, 1, 0, 1, -1, 1, -1, -1, 0, -1, -1, 2, 1, 1, 0.70, 0.26, 10.0,1,1,1],   # Sample 2
    [1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 0, 1, 1, 1, 4, -1, -1, 0.55, 0.32, 30.0,1,1,1],     # Sample 3
    [1, 0, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, -1, 3, 0, 1, 0, 0.68, 0.28, 5.0,1,1,1],     # Sample 4
    [1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 0, 1, 0.92, 0.20, 0.0,1,1,1] # Sample 5
])

# Work on a float copy so `samples` keeps the raw, un-normalized values.
X_test = np.array(samples, dtype=float)
print(f"X_test shape: {X_test.shape}")  # Should match the number of columns in training data

# Fail early with a clear message if the sample width does not match the model.
if X_test.shape[1] != theta.size:
    raise ValueError(
        f"samples have {X_test.shape[1]} columns but theta has {theta.size} entries"
    )
if X_test.shape[1] - 1 != mu.size:
    raise ValueError(
        f"samples have {X_test.shape[1] - 1} features but mu has {mu.size} entries"
    )

# Ensure mu and sigma have the same shape as X_test (excluding bias)
mu_features = mu.reshape(1, -1)
sigma_features = sigma.reshape(1, -1)
# Normalize only the non-bias features
X_test[:, 1:] = (X_test[:, 1:] - mu_features) / sigma_features
print(theta.size)

# The model was trained on Tumor_Size_cm / 100, so raw predictions are in
# units of cm / 100 and must be multiplied by this factor for display.
size_scale = 100.0

# Compute predictions (scaled units). The name `price` is kept for compatibility.
price = X_test @ theta

# Clip predictions to the range [0, 100] cm. The bound is expressed in the
# scaled units of `price`; clipping the scaled values at 100 would actually
# allow up to 10000 cm.
price = np.clip(price, 0, 100 / size_scale)

# Display results
# NOTE(review): 23 decimal places exceed float64 precision; consider `.2f`.
for i, pred in enumerate(price):
    print(f"Sample {i+1} - Predicted Tumor Size: {pred * size_scale:.23f} cm")

X_test shape: (5, 23)
23
Sample 1 - Predicted Tumor Size: 7.19764933224653624677103 cm
Sample 2 - Predicted Tumor Size: 7.54453700037685237589358 cm
Sample 3 - Predicted Tumor Size: 7.21608978309906401449325 cm
Sample 4 - Predicted Tumor Size: 7.27322485127091944434596 cm
Sample 5 - Predicted Tumor Size: 7.10987486069304974733996 cm
