# Old SwanData Analysis (no economic SDOH)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

swandf = pd.read_csv('/Users/aj/Downloads/swandata.csv')

numeric_features = ['Age', 'LangInt10']
categorical_features = ['Race', 'MenoStatus']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

model = RandomForestClassifier(n_estimators=100, random_state=0)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

X = swandf[['Race', 'MenoStatus', 'Age', 'LangInt10']]

targets = ['Estr110', 'EstrInjec110', 'EstrInjec210', 'EstrProgComb110', 'EstrProgComb210', 
           'OtherHormone1', 'OtherHormone2', 'OtherHormone3', 'OtherHormone4']

models = {}

for target in targets:
    y = swandf[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.999, random_state=42)

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f'Test accuracy for {target}: {accuracy}')
    
    models[target] = pipeline

# new_data = pd.DataFrame({'Race': ['Race1'], 'MenoStatus': ['Status1'], 'Age': [50], 'LangInt10': [5]})
# new_pred = models['Estr110'].predict(new_data)
# print(f'Prediction for new data on Estr110: {new_pred}')


Test accuracy for Estr110: 0.9634030418250951
Test accuracy for EstrInjec110: 0.9923954372623575
Test accuracy for EstrInjec210: 0.9923954372623575
Test accuracy for EstrProgComb110: 0.9757604562737643
Test accuracy for EstrProgComb210: 0.9757604562737643
Test accuracy for OtherHormone1: 0.9605513307984791
Test accuracy for OtherHormone2: 0.9605513307984791
Test accuracy for OtherHormone3: 0.9966730038022814
Test accuracy for OtherHormone4: 0.998574144486692
Prediction for new data on Estr110: [-1]


In [2]:
from sklearn.metrics import classification_report
targets = ['Estr110', 'EstrInjec110', 'EstrInjec210', 'EstrProgComb110', 'EstrProgComb210', 
           'OtherHormone1', 'OtherHormone2', 'OtherHormone3', 'OtherHormone4']

swandf = pd.read_csv('/Users/aj/Downloads/swandata.csv')

swandf['has_2'] = swandf[targets].apply(lambda x: 2 in x.values, axis=1).astype(int)

X = swandf[['Race', 'MenoStatus', 'Age', 'LangInt10']]
y = swandf['has_2']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestClassifier(n_estimators=100, random_state=0))])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print(report)


Accuracy: 0.8908659549228944
              precision    recall  f1-score   support

           0       0.90      0.99      0.94       757
           1       0.20      0.02      0.04        86

    accuracy                           0.89       843
   macro avg       0.55      0.51      0.49       843
weighted avg       0.83      0.89      0.85       843



In [3]:
new_subject = pd.DataFrame({
    'Race': ['Caucasian/ White Non-Hispanic'], 
    'MenoStatus': ['Natural Post'], 
    'Age': [53], 
    'LangInt10': [1]
})

new_prediction = pipeline.predict(new_subject)

print("Prediction:", new_prediction)


Prediction: [0]


In [4]:
from sklearn.metrics import classification_report

targets = ['Estr110', 'EstrInjec110', 'EstrProgComb110', 'OtherHormone1']

swandf_filtered = swandf[swandf[targets].apply(lambda x: 2 in x.values, axis=1)]

for target in targets:
    swandf_filtered[target] = (swandf_filtered[target] == 2).astype(int)

X = swandf_filtered[['Race', 'MenoStatus', 'Age', 'LangInt10']]
y = swandf_filtered[targets]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestClassifier(n_estimators=100, random_state=0))])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

report = classification_report(y_test, y_pred, target_names=targets)
print(report)


                 precision    recall  f1-score   support

        Estr110       0.35      0.24      0.29        25
   EstrInjec110       0.50      0.17      0.25         6
EstrProgComb110       0.50      0.29      0.36        14
  OtherHormone1       0.52      0.45      0.48        33

      micro avg       0.46      0.33      0.39        78
      macro avg       0.47      0.29      0.35        78
   weighted avg       0.46      0.33      0.38        78
    samples avg       0.35      0.33      0.34        78



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  swandf_filtered[target] = (swandf_filtered[target] == 2).astype(int)
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

targets = ['Estr110', 'EstrInjec110', 'EstrProgComb110', 'OtherHormone1']

swandf_filtered = swandf[swandf[targets].apply(lambda x: 2 in x.values, axis=1)]

for target in targets:
    swandf_filtered[target] = (swandf_filtered[target] == 2).astype(int)

X = swandf_filtered[['Race', 'MenoStatus', 'Age', 'LangInt10']]
y = swandf_filtered[targets]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

models = [
    ('RandomForestClassifier', RandomForestClassifier(random_state=0), {'model__n_estimators': [50, 100, 200],
                                                                         'model__max_depth': [None, 10, 20],
                                                                         'model__min_samples_split': [2, 5, 10]}),
    ('SVC', OneVsRestClassifier(SVC(random_state=0)), {'model__estimator__C': [0.1, 1.0, 10.0], 
                                                        'model__estimator__gamma': [0.1, 1.0, 10.0]})
]

for model_name, model, param_grid in models:
    print(f"\n{model_name} Grid Search:")
    
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])

    grid_search = GridSearchCV(pipeline, param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    print("\nBest Parameters from Grid Search: ", grid_search.best_params_)

    y_pred = grid_search.predict(X_test)

    report = classification_report(y_test, y_pred, target_names=targets)
    print("\nClassification Report:")
    print(report)



RandomForestClassifier Grid Search:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  swandf_filtered[target] = (swandf_filtered[target] == 2).astype(int)



Best Parameters from Grid Search:  {'model__max_depth': 10, 'model__min_samples_split': 2, 'model__n_estimators': 200}

Classification Report:
                 precision    recall  f1-score   support

        Estr110       0.36      0.30      0.33        27
   EstrInjec110       0.50      0.14      0.22         7
EstrProgComb110       0.33      0.28      0.30        18
  OtherHormone1       0.48      0.42      0.45        38

      micro avg       0.42      0.33      0.37        90
      macro avg       0.42      0.28      0.33        90
   weighted avg       0.42      0.33      0.37        90
    samples avg       0.31      0.34      0.31        90


SVC Grid Search:


  _warn_prf(average, modifier, msg_start, len(result))



Best Parameters from Grid Search:  {'model__estimator__C': 10.0, 'model__estimator__gamma': 0.1}

Classification Report:
                 precision    recall  f1-score   support

        Estr110       0.47      0.30      0.36        27
   EstrInjec110       0.50      0.14      0.22         7
EstrProgComb110       0.50      0.28      0.36        18
  OtherHormone1       0.40      0.26      0.32        38

      micro avg       0.44      0.27      0.33        90
      macro avg       0.47      0.25      0.32        90
   weighted avg       0.45      0.27      0.33        90
    samples avg       0.27      0.27      0.27        90



  _warn_prf(average, modifier, msg_start, len(result))


# NEW ANALYSIS (includes economic SDOH

In [8]:
fullswan = pd.read_csv('/Users/aj/Downloads/32961-0001-Data.tsv',sep='\t')

  fullswan = pd.read_csv('/Users/aj/Downloads/32961-0001-Data.tsv',sep='\t')


In [9]:
fullswan

Unnamed: 0,SWANID,VISIT,AGE10,INTDAY10,LANGINT10,PRGNAN10,ANTICO110,ACOATW110,ANTICO210,ACOATW210,...,TBWNHAN10,FFMNHAN10,TBFNHAN10,PBFNHAN10,MISSCON10,MISSPHY10,FLAGSRP10,FLGBIOV10,CAMDAY10,RACE
0,10046,10,62,3768,1,1,1,-1,-1,-1,...,37.948159459459,49.867257297297,39.532742702703,44.220070137251,0,0,0,0,3768,2
1,10056,10,61,3660,1,1,1,-1,-1,-1,...,29.231053187919,39.060639597315,14.339360402685,26.852734836488,0,0,0,0,3660,4
2,10153,10,61,3635,1,1,1,-1,-1,-1,...,35.292138407494,45.621390070258,23.978609929742,34.452025761124,0,0,0,0,3635,3
3,10196,10,56,3634,1,1,1,-1,-1,-1,...,26.18823317757,35.091765981308,18.108234018692,34.038033869721,0,0,0,0,3634,2
4,10245,10,57,3612,1,1,1,-1,-1,-1,...,34.079366176471,46.688702352941,25.411297647059,35.244518234478,0,0,0,0,3612,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2240,99805,10,52,3661,1,1,1,-1,-1,-1,...,32.674415544041,43.928784041451,36.871215958549,45.632693018006,0,0,0,0,3661,1
2241,99809,10,53,3624,1,1,1,-1,-1,-1,...,37.695115384615,49.913391794872,31.586608205128,38.75657448482,0,0,0,0,3624,4
2242,99888,10,58,3661,1,1,1,-1,-1,-1,...,31.425286725664,41.864915469027,25.335084530973,37.701018647282,0,0,0,0,3661,3
2243,99898,10,55,3626,1,1,1,-1,-1,-1,...,53.326535211268,71.943321126761,59.556678873239,45.290250093718,0,0,0,0,3626,4


In [4]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

swannew = pd.read_csv('/Users/aj/Downloads/yale-main/SWANnew.csv')

In [12]:
targets = ['Estr110', 'EstrInjec110', 'EstrProgComb110', 'OtherHormone1']

# Predictor variables
X = swannew[['Race', 'MenoStatus', 'Age', 'Language', 'data.INSURAN10', 'data.NOTAFFR10', 'data.INCOME10', 'data.HOTFLAS10']]

# Preprocessing
numeric_features = ['Age', 'data.INSURAN10', 'data.NOTAFFR10', 'data.INCOME10', 'data.HOTFLAS10']
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_features = ['Race', 'MenoStatus', 'Language']
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Loop over the targets to train individual models
for target in targets:
    y = swannew[target]
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

    # Create pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', RandomForestClassifier(n_estimators=100, random_state=0))])

    # Fit model
    pipeline.fit(X_train, y_train)

    # Predict test set
    y_pred = pipeline.predict(X_test)

    # Compute accuracy and classification report
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Results for {target}:")
    print("Accuracy:", accuracy)
    print(report)
    print("\n" + "=" * 50 + "\n")

Results for Estr110:
Accuracy: 0.888030888030888
              precision    recall  f1-score   support

       False       0.77      0.59      0.66       147
        True       0.91      0.96      0.93       630

    accuracy                           0.89       777
   macro avg       0.84      0.77      0.80       777
weighted avg       0.88      0.89      0.88       777



Results for EstrInjec110:
Accuracy: 0.8854568854568855
              precision    recall  f1-score   support

       False       0.76      0.59      0.67       150
        True       0.91      0.96      0.93       627

    accuracy                           0.89       777
   macro avg       0.83      0.77      0.80       777
weighted avg       0.88      0.89      0.88       777



Results for EstrProgComb110:
Accuracy: 0.8918918918918919
              precision    recall  f1-score   support

       False       0.79      0.59      0.67       148
        True       0.91      0.96      0.94       629

    accuracy    