In [9]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
from lifelines.utils import k_fold_cross_validation

#Load the match data; exit with a readable message when the CSV is absent
try:
    df = pd.read_csv('novice.csv')
except FileNotFoundError:
    raise SystemExit("Error: File 'novice.csv' not found in current directory.")
else:
    #Only reached when the read succeeded
    print("Columns in dataset:", df.columns.tolist())


#Check that every column the survival analysis reads is present

required_cols = [
    'Stage', 'Match Length', 'Closure Reason',
    'Big Birthdate', 'Little Birthdate',
    'Big Gender', 'Little Gender',
]
available_cols = set(df.columns)
#Preserve the order of required_cols so the message lists them predictably
missing = [name for name in required_cols if name not in available_cols]
if missing:
    raise SystemExit(f"Missing required columns: {missing}")

#Duration column is the recorded match length
df['time'] = df['Match Length']

#Event indicator: 0 = censored/success, 1 = event/failure
#A match counts as a success when any one of the three conditions holds
still_active = df['Stage'].eq('Active')
reached_threshold = df['time'].ge(24)
closed_successfully = df['Closure Reason'].isin(
    ['Child: Graduated', 'Successful Match Closure']
)
success_conditions = still_active | reached_threshold | closed_successfully
df['event'] = np.where(success_conditions, 0, 1)


#Calculate Age Difference
try:
    #Parse birthdates; values that cannot be parsed become NaT instead of raising
    df['Big Birthdate'] = pd.to_datetime(df['Big Birthdate'], errors='coerce')
    df['Little Birthdate'] = pd.to_datetime(df['Little Birthdate'], errors='coerce')

    #Absolute gap in years, computed from the full dates.
    #Subtracting only the calendar years ignores month and day, so two people
    #born on 31 Dec and 1 Jan of consecutive years would be a full year apart.
    birth_gap = (df['Big Birthdate'] - df['Little Birthdate']).abs()
    df['age_diff'] = birth_gap.dt.days / 365.25

    #Rows with a missing/unparseable birthdate get the median gap
    median_age = df['age_diff'].median()
    df['age_diff'] = df['age_diff'].fillna(median_age)

except Exception as e:
    #Keep the original exit message; chain the cause so the traceback survives
    raise SystemExit(f"Error calculating age difference: {str(e)}") from e

#Feature Engineering with Validation

def calculate_alignment(big_col, little_col):
    """Build a row-wise Jaccard similarity function for two interest columns.

    Each cell is read as a ';'-separated list of interests. Jaccard
    similarity is the size of the intersection divided by the size of the
    union of the two interest sets.

    Args:
        big_col: Label of the column holding the Big's interests.
        little_col: Label of the column holding the Little's interests.

    Returns:
        A function taking one row (anything indexable by column label) and
        returning the similarity as a float. It returns 0.0 when both sets
        are empty or when the row cannot be read.
    """
    def _interests(value):
        #Missing cells contribute no interests
        if pd.isna(value):
            return set()
        #Strip each token and drop blanks, so an empty cell or a trailing ';'
        #never shows up as a shared '' interest
        tokens = {token.strip() for token in str(value).split(';')}
        tokens.discard('')
        #'None/Other' is excluded from the comparison
        tokens.discard('None/Other')
        return tokens

    def _similarity(row):
        try:
            big = _interests(row[big_col])
            little = _interests(row[little_col])
        except (KeyError, IndexError, TypeError, ValueError):
            #Best effort: a row that cannot be read scores as no overlap
            return 0.0
        union = big | little
        return len(big & little) / len(union) if union else 0.0
    return _similarity

#Build one alignment feature per interest category, skipping any category
#whose Big or Little column is absent from the dataset
alignment_features = []
for interest in ['Sports', 'Hobbies']:
    big_col = f'Big Contact: Interest Finder - {interest}'
    little_col = f'Little Contact: Interest Finder - {interest}'

    if not (big_col in df.columns and little_col in df.columns):
        print(f"Warning: Skipping {interest} alignment - columns missing")
        continue

    feature_name = f'{interest.lower()}_alignment'
    row_scores = df.apply(calculate_alignment(big_col, little_col), axis=1)
    df[feature_name] = row_scores.fillna(0)
    alignment_features.append(feature_name)

#Gender alignment: 1 when Big and Little genders are equal, otherwise 0
df['gender_alignment'] = df['Big Gender'].eq(df['Little Gender']).astype(int)

#Candidate features for the model
features = ['gender_alignment', 'age_diff'] + list(alignment_features)

#Keep complete rows only, then keep features whose variance exceeds 0.02
model_columns = ['time', 'event']
analysis_df = df[model_columns + features].dropna()
variances = analysis_df[features].var()
valid_features = [name for name, value in variances.items() if value > 0.02]

if not valid_features:
    raise SystemExit("No valid features remaining after variance filtering")

analysis_df = analysis_df[model_columns + valid_features]

#Fit a penalized Cox proportional hazards model and report the results

cph = CoxPHFitter(penalizer=10)
cph.fit(analysis_df, duration_col='time', event_col='event')

observation_count = len(analysis_df)
event_count = analysis_df['event'].sum()
#Rows ordered by ascending exp(coef); the first ten are shown
ranked_summary = cph.summary.sort_values('exp(coef)')

print("\nSurvival Analysis Results:")
print("=" * 50)
print(f"Number of observations: {observation_count}")
print(f"Number of events: {event_count}")
print("\nTop predictive features:")
print(ranked_summary.head(10))


Columns in dataset: ['Match ID 18Char', 'Stage', 'Little ID', 'Big ID', 'Big County', 'Big Age', 'Big Occupation', 'Big: Military', 'Big Approved Date', 'Big Level of Education', 'Big Languages', 'Big Gender', 'Big Birthdate', 'Big Employer', 'Program', 'Program Type', 'Big Race/Ethnicity', 'Closure Reason', 'Closure Details', 'Match Activation Date', 'Match Closure Meeting Date', 'Rationale for Match', 'Big Enrollment: Record Type', 'Big Assessment Uploaded', 'Big Acceptance Date', 'Big Car Access', 'Big Days Acceptance to Match', 'Big Days Interview to Acceptance', 'Big Days Interview to Match', 'Big Open to Cross-Gender Match', 'Big Re-Enroll', 'Big Contact: Preferred Communication Type', 'Big Contact: Former Big/Little', 'Big Contact: Interest Finder - Sports', 'Big Contact: Interest Finder - Places To Go', 'Big Contact: Interest Finder - Hobbies', 'Big Contact: Interest Finder - Entertainment', 'Big Contact: Created Date', 'Big Enrollment: Created Date', 'Big Contact: Volunteer Av

KeyError: 'E'