# Import

In [56]:
from google.colab import drive
import pandas as pd

# The CSVs live on Google Drive, so the drive has to be mounted before reading.
drive.mount('/content/drive')

# Load the main articles table and the separate sample table from Drive.
main_df = pd.read_csv('/content/drive/MyDrive/nyt_articles_04-24_final.csv')
sample_df = pd.read_csv('/content/drive/MyDrive/sample_test.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
# Preview the first two rows of the loaded articles table.
main_df.head(2)


Unnamed: 0,year,month,headline,lead_paragraph,news_desk,keywords
0,2004,1,FreeMarkets Agrees to Buy Auction Unit of Covi...,"FreeMarkets, which makes Internet-auction soft...",Business/Financial Desk,"General Motors Corp, Ford Motor Co, DaimlerChr..."
1,2004,1,National Briefing | South: Arkansas: Assembly ...,Both houses of the General Assembly recessed u...,National Desk,


# Classification


In [58]:
# Work on a copy so main_df itself is left unchanged.
df = main_df.copy()

# Build one space-separated text field from headline, lead paragraph and
# keywords; missing values are replaced by '' so the result is never NaN.
text_columns = ['headline', 'lead_paragraph', 'keywords']
combined_text = df[text_columns[0]].fillna('')
for column_name in text_columns[1:]:
    combined_text = combined_text + ' ' + df[column_name].fillna('')
df['new'] = combined_text

df.head(1)

Unnamed: 0,year,month,headline,lead_paragraph,news_desk,keywords,new
0,2004,1,FreeMarkets Agrees to Buy Auction Unit of Covi...,"FreeMarkets, which makes Internet-auction soft...",Business/Financial Desk,"General Motors Corp, Ford Motor Co, DaimlerChr...",FreeMarkets Agrees to Buy Auction Unit of Covi...


In [59]:
import os
import re

# Name of the DataFrame column holding the combined article text to scan.
TEXT_COLUMN = 'new'
# Minimum number of distinct keyword hits required to flag an article.
KEYWORD_THRESHOLD = 1

# Lowercase words and phrases used for the first-pass keyword match.
# NOTE(review): 'period' and 'pid' look ambiguous outside a medical context
# (e.g. "time period") and may inflate the match count — confirm they are wanted.
WOMENS_HEALTH_KEYWORDS = {
    # Pregnancy & Reproduction
    'pregnancy', 'pregnant', 'prenatal', 'postpartum', 'postnatal',
    'obstetrics', 'obstetric', 'gynecology', 'gynecologist',
    'fertility', 'infertility', 'ivf', 'iui', 'contraception',
    'birth control', 'iud', 'fetus', 'fetal', 'embryo',
    'miscarriage', 'stillbirth', 'childbirth',
    'breastfeeding', 'lactation', 'maternity', 'c-section', 'cesarean',
    'amniotic', 'placenta', 'midwifery', 'doula', 'ob-gyn',
    'egg freezing', 'oocyte',

    # Cancers & Screenings
    'breast cancer', 'ovarian cancer', 'cervical cancer',
    'uterine cancer', 'vaginal cancer', 'vulvar cancer',
    'mammogram', 'pap smear', 'pap test', 'hpv', 'human papillomavirus',
    'lumpectomy', 'mastectomy', 'colposcopy', 'brca',

    # Conditions & Syndromes
    'endometriosis', 'pcos', 'polycystic ovary syndrome',
    'menopause', 'perimenopause',
    'uterine fibroids', 'pelvic inflammatory disease', 'pid', 'osteoporosis',
    'menstrual', 'menstruation', 'period',
    'premenstrual syndrome', 'pms', 'pmdd',
    'yeast infection', 'bacterial vaginosis', 'vulvodynia', 'adenomyosis',
    'toxic shock syndrome', 'pelvic organ prolapse', 'cystitis',

    # Procedures & Anatomy
    'uterus', 'uterine', 'ovary', 'ovarian', 'cervix',
    'vagina', 'vaginal', 'vulva',
    'pelvic floor', 'fallopian', 'hysterectomy', 'oophorectomy',
    'hormone replacement therapy', 'hrt',
}

# STEP 1
def count_keywords_in_text(text, keywords):
    """
    Counts the number of unique keywords from a given set found in a text string.

    Matching is case-insensitive and whole-token: a keyword only counts when it
    is not directly preceded or followed by a word character, so 'pid' does not
    match inside 'rapid'. Each keyword counts at most once, however many times
    it occurs.

    Args:
        text: The text to scan. Non-string values are converted with str();
            None and float NaN are treated as empty text.
        keywords: Iterable of keyword strings (single words or phrases).

    Returns:
        int: Number of distinct (case-folded) keywords found in the text.
    """
    # Missing values carry no text. Without this guard str() would turn them
    # into the literal words 'none' / 'nan', which could match a keyword.
    if text is None or (isinstance(text, float) and text != text):
        return 0

    text_lower = str(text).lower()

    # The text is lowercased, so the keywords must be too or any keyword with
    # an uppercase letter could never match. The set also de-duplicates
    # keywords that differ only by case.
    normalized_keywords = {str(keyword).lower() for keyword in keywords}
    # An empty keyword would match almost any text; it is never a real hit.
    normalized_keywords.discard('')

    # Lookarounds behave like \b for keywords that start and end with a word
    # character, and still work for keywords edged by punctuation (e.g. 'c++').
    return sum(
        1 for keyword in normalized_keywords
        if re.search(r'(?<!\w)' + re.escape(keyword) + r'(?!\w)', text_lower)
    )

# STEP 2
def classify_dataframe(df, keywords, text_column, threshold):
    """
    Label every row of a DataFrame by the number of keywords found in its text.

    Rows whose ``text_column`` is missing are dropped. Two columns are added to
    the returned copy: 'keyword_match_count' (distinct keywords found) and
    'is_womens_health' (True when the count reaches ``threshold``). The input
    frame is not modified. If ``text_column`` does not exist, an error message
    is printed and the input frame is returned as-is.
    """
    # Guard clause: nothing can be classified without the text column.
    if text_column not in df.columns:
        print(f"Error: Text column '{text_column}' not found in the DataFrame. Aborting.")
        return df

    def meets_threshold(hit_count):
        return bool(hit_count >= threshold)

    # dropna returns a new frame; the explicit copy keeps later column
    # assignments from touching (or warning about) the caller's data.
    labelled = df.dropna(subset=[text_column]).copy()

    match_counts = labelled[text_column].apply(
        lambda article_text: count_keywords_in_text(article_text, keywords)
    )
    labelled['keyword_match_count'] = match_counts
    labelled['is_womens_health'] = labelled['keyword_match_count'].apply(meets_threshold)

    classified_count = labelled['is_womens_health'].sum()
    print(f"Found {classified_count} matching articles in the DataFrame.")

    return labelled

# Run the keyword classifier over the combined-text column; the result gains
# 'keyword_match_count' and 'is_womens_health' columns.
processed_df = classify_dataframe(df, WOMENS_HEALTH_KEYWORDS, TEXT_COLUMN, KEYWORD_THRESHOLD)


Found 18464 matching articles in the DataFrame.


In [60]:
# Save the keyword-labelled frame (without the index) to the working directory.
output_filename = 'classified.csv'
processed_df.to_csv(output_filename, index=False)

# Manual Review


In [61]:
# Draw 1000 rows from each is_womens_health group (True and False) with a
# fixed seed, giving a balanced sample for manual review.
# Sampling is without replacement, so each group needs at least 1000 rows.
random_sample_df = processed_df.groupby('is_womens_health').sample(n=1000, random_state=42)
output_filename = '2k_sample_manual.csv'
random_sample_df.to_csv(output_filename, index=False)

In [62]:
# The sampled articles were reviewed by hand in this spreadsheet:
# https://docs.google.com/spreadsheets/d/1fRYUzsd30QlCdr9i8KzNpNreB_ThlQdM7k-ttzNSrCs/edit?gid=0#gid=0
# The reviewed data is read back into the notebook here from manual_review.csv.
# NOTE(review): presumably is_womens_health in this file holds the hand-corrected
# labels rather than the raw keyword result — confirm before training on it.

# Imports and mount are repeated so this cell can be run on its own after the
# manual step.
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')
manual_review = pd.read_csv('/content/drive/MyDrive/manual_review.csv')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [63]:
# Preview the first rows of the reviewed sample.
manual_review.head()

Unnamed: 0,year,month,headline,lead_paragraph,news_desk,keywords,new,keyword_match_count,is_womens_health
0,2021,6,‘12 Mighty Orphans’ Review: A Team Effort,Inspired by a true story of parentless teenage...,Weekend,"Movies, Roberts, Ty (1974- ), Sheen, Martin, W...",‘12 Mighty Orphans’ Review: A Team Effort Insp...,0,False
1,2018,6,‘A Catharsis Sculpture’: An Artist Makes a Mon...,"For the last two years or so, the artist Prune...",Culture,"Nourry, Prune, Sculpture, Breast Cancer, Art, ...",‘A Catharsis Sculpture’: An Artist Makes a Mon...,1,True
2,2021,10,‘A Mouthful of Air’ Review: Depression Clouds ...,A young mother battles postpartum depression i...,Weekend,"Movies, Seyfried, Amanda, Wittrock, Finn, Kopp...",‘A Mouthful of Air’ Review: Depression Clouds ...,1,True
3,2015,9,‘A Watchful Eye on Farm Families’ Health’,This is the eighth in a series of videos about...,OpEd,"Agriculture and Farming, Children and Childhoo...",‘A Watchful Eye on Farm Families’ Health’ This...,0,False
4,2015,10,"‘A Wild Swan,’ by Michael Cunningham",My high-minded parents discouraged the Saturda...,BookReview,"Benfey, Christopher, Books and Literature, Cun...","‘A Wild Swan,’ by Michael Cunningham My high-m...",0,False


# Building the Model

In [64]:
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report


# Inputs: the combined article text and the label column of the reviewed sample.
# fillna('') keeps TfidfVectorizer from failing on text cells that read_csv
# parsed as NaN (it rejects NaN documents).
X_text = manual_review['new'].fillna('')
y = manual_review['is_womens_health']

# 70/30 split, stratified on the label so both sets keep the class proportions.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.3, random_state=42, stratify=y
)

#Vectorize w/ TFIDF
# Fitted on the training text only; the test text is transformed with the
# vocabulary and IDF weights learned from training.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2)
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)


#Hyperparameter

# Ratio of negative to positive training labels, passed to XGBoost to
# re-weight the positive class.
class_counts = y_train.value_counts()
scale_pos_weight = class_counts[False] / class_counts[True]

# Candidate values sampled by the randomized search below.
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

#XGBoost
# use_label_encoder is intentionally not passed: the flag is deprecated and
# newer XGBoost releases no longer use it (they only warn that it is unused).
xgb_model = xgb.XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='logloss'
)

#Randomized Search with cross-validation
# 25 sampled configurations, 5-fold CV each, ranked by F1 on the positive class.
# NOTE(review): the TF-IDF matrix was fitted on the whole training split before
# CV, so each validation fold has influenced the vocabulary/IDF. The effect is
# probably small, but wrapping vectorizer + model in a Pipeline would remove it.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=25,
    scoring='f1',
    n_jobs=-1,
    cv=5,
    random_state=42
)

random_search.fit(X_train_tfidf, y_train)

print(f"\nBest parameters: {random_search.best_params_}")

# Evaluate the refitted best model on the held-out 30%.
best_classifier = random_search.best_estimator_
y_pred = best_classifier.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


Best parameters: {'subsample': 1.0, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.2, 'colsample_bytree': 0.9}
              precision    recall  f1-score   support

       False       0.94      0.99      0.96       571
        True       0.98      0.88      0.93       329

    accuracy                           0.95       900
   macro avg       0.96      0.94      0.95       900
weighted avg       0.95      0.95      0.95       900



In [66]:
#Saving the Model

import joblib

# Persist the tuned classifier together with the fitted vectorizer: the model
# can only score text transformed by this exact vectorizer.
# Both paths are relative, so the files land in the current working directory.
# NOTE(review): confirm the working directory persists across sessions, or copy
# these files to Drive if they need to be kept.
joblib.dump(best_classifier, 'womens_health_classifier.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

print("Model and vectorizer saved successfully!")

Model and vectorizer saved successfully!


# Applying to original dataset

In [67]:
# Reload the saved classifier and vectorizer from the working directory.
# joblib.load unpickles the files, so only load .pkl files you created or trust.
loaded_classifier = joblib.load('womens_health_classifier.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')
print("Model and vectorizer loaded.")

Model and vectorizer loaded.


In [68]:
# Preview the original articles table before scoring it.
main_df.head()

Unnamed: 0,year,month,headline,lead_paragraph,news_desk,keywords
0,2004,1,FreeMarkets Agrees to Buy Auction Unit of Covi...,"FreeMarkets, which makes Internet-auction soft...",Business/Financial Desk,"General Motors Corp, Ford Motor Co, DaimlerChr..."
1,2004,1,National Briefing | South: Arkansas: Assembly ...,Both houses of the General Assembly recessed u...,National Desk,
2,2004,1,Clark Courts Veterans in Swing Through South,Winding up several days of campaigning in the ...,National Desk,"Southern States (US), Health Insurance and Man..."
3,2004,1,"Paid Notice: Deaths PAONE, NICOLA","PAONE--Nicola. On December 25, 2003 at age 88....",Classified,"PAONE, NICOLA"
4,2004,1,The Time We Thought We Knew,"It was an unlikely place to be at 4:30 a.m., s...",Editorial Desk,"Newton, Isaac, Einstein, Albert, Time, RELATIV..."


In [69]:
# Start again from a fresh copy of the original articles table.
df = main_df.copy()

# Rebuild the combined text field (headline + lead paragraph + keywords,
# space-separated, missing values as '') that the vectorizer was trained on.
text_columns = ['headline', 'lead_paragraph', 'keywords']
combined_text = df[text_columns[0]].fillna('')
for column_name in text_columns[1:]:
    combined_text = combined_text + ' ' + df[column_name].fillna('')
df['new'] = combined_text

df.head(1)

Unnamed: 0,year,month,headline,lead_paragraph,news_desk,keywords,new
0,2004,1,FreeMarkets Agrees to Buy Auction Unit of Covi...,"FreeMarkets, which makes Internet-auction soft...",Business/Financial Desk,"General Motors Corp, Ford Motor Co, DaimlerChr...",FreeMarkets Agrees to Buy Auction Unit of Covi...


In [70]:
# Vectorize the full corpus with the already-fitted vectorizer (transform only,
# no refit), then get a hard class prediction for every article.
X_full_dataset_tfidf = loaded_vectorizer.transform(df['new'])
predictions = loaded_classifier.predict(X_full_dataset_tfidf)

# Per-class probabilities; column index 1 is kept as the confidence score.
# NOTE(review): presumably column 1 is the probability of the positive (True)
# class — confirm against loaded_classifier.classes_.
probabilities = loaded_classifier.predict_proba(X_full_dataset_tfidf)
confidence_scores = probabilities[:, 1]

# Attach both outputs to the working frame and preview the result.
df['is_womens_health_pred'] = predictions
df['confidence_score'] = confidence_scores
df.head()

Unnamed: 0,year,month,headline,lead_paragraph,news_desk,keywords,new,is_womens_health_pred,confidence_score
0,2004,1,FreeMarkets Agrees to Buy Auction Unit of Covi...,"FreeMarkets, which makes Internet-auction soft...",Business/Financial Desk,"General Motors Corp, Ford Motor Co, DaimlerChr...",FreeMarkets Agrees to Buy Auction Unit of Covi...,0,0.056229
1,2004,1,National Briefing | South: Arkansas: Assembly ...,Both houses of the General Assembly recessed u...,National Desk,,National Briefing | South: Arkansas: Assembly ...,0,0.024317
2,2004,1,Clark Courts Veterans in Swing Through South,Winding up several days of campaigning in the ...,National Desk,"Southern States (US), Health Insurance and Man...",Clark Courts Veterans in Swing Through South W...,0,0.004694
3,2004,1,"Paid Notice: Deaths PAONE, NICOLA","PAONE--Nicola. On December 25, 2003 at age 88....",Classified,"PAONE, NICOLA","Paid Notice: Deaths PAONE, NICOLA PAONE--Nic...",0,0.145481
4,2004,1,The Time We Thought We Knew,"It was an unlikely place to be at 4:30 a.m., s...",Editorial Desk,"Newton, Isaac, Einstein, Albert, Time, RELATIV...",The Time We Thought We Knew It was an unlikely...,0,0.010241


In [71]:
# Show the five highest-confidence rows among the articles predicted positive.
# The prediction column holds 0/1 integers, which still compare equal to True/False.
display(df[df['is_womens_health_pred'] == True].sort_values(by='confidence_score', ascending=False).head())

Unnamed: 0,year,month,headline,lead_paragraph,news_desk,keywords,new,is_womens_health_pred,confidence_score
25610,2004,3,A Misleading Fetal Violence Law,The law that Congress passed last week making ...,Editorial Desk,"United States, ABORTION, Pregnancy and Obstetr...",A Misleading Fetal Violence Law The law that C...,1,1.0
953138,2012,2,"The ‘Safe, Legal, Rare’ Illusion",AMID the sound and fury of the latest culture-...,Editorial,"United States, Liberalism (US Politics), Teena...","The ‘Safe, Legal, Rare’ Illusion AMID the soun...",1,1.0
947798,2012,1,"Pregnant, and Pushed Out of a Job",FEW people realize that getting pregnant can m...,OpEd,"Disabilities, Pregnancy and Obstetrics, Women ...","Pregnant, and Pushed Out of a Job FEW people r...",1,1.0
947390,2012,1,Required Delay Between Sonogram and Abortion C...,Moments after Amy Hagstrom Miller heard the ne...,National,"Texas, ABORTION, Pregnancy and Obstetrics, Law...",Required Delay Between Sonogram and Abortion C...,1,1.0
1490782,2020,3,Getting Pregnant Was a Challenge. Then the Cor...,After nearly two years of trying to have a bab...,Parenting,"Infertility, Coronavirus (2019-nCoV), Egg Dona...",Getting Pregnant Was a Challenge. Then the Cor...,1,1.0


In [74]:
# Write the fully scored dataset (without the index) back to Google Drive.
# NOTE(review): this path spells the Drive folder 'My Drive', while the read_csv
# calls in this notebook use 'MyDrive' — confirm both resolve to the same folder.
df.to_csv('/content/drive/My Drive/final_articles_classified.csv', index=False)