# Step 2: Detection of cancer and LGBTQ+ campaigns in GoFundMe (GFM) data
## Author: Caleb Easterly

In [1]:
import os
import pandas as pd
import re
import csv

# change directory
# NOTE(review): machine-specific absolute path; every relative path used
# below is resolved against this directory.
os.chdir("C:\\Users\\caleb\\OneDrive - University of North Carolina at Chapel Hill\\Documents\\Projects\\Cancer care crowdfunding")

# campaign table, indexed by campaign URL
medcamp = pd.read_csv("GoFundMeUU/data/filtered_full_samp.csv",
    lineterminator='\n', encoding='utf-8', index_col="URL")

# for testing
# medcamp = medcamp.sample(frac=0.05, replace=True, random_state=1498)

# combine title and description (look in both)
# Missing values are replaced with '' first: in pandas, NaN + str is NaN, so
# a campaign lacking either field would otherwise end up with no searchable
# text at all, even when the other field contains a term.
medcamp["alltext"] = medcamp.Title.fillna('') + ' ' + medcamp.Description.fillna('')
medcamp.alltext = medcamp.alltext.str.lower()

# function to detect a specific term
def detect_term(term, df, prefix, textcol='alltext'):
    """Flag the rows of ``df`` whose text contains ``term`` as a whole word.

    ``term`` is treated as a regular expression and is wrapped in regex
    word-boundary anchors on both sides before matching; matching ignores
    case.

    Parameters
    ----------
    term : str
        Regex fragment to search for.
    df : pandas.DataFrame
        Frame holding the text to search.
    prefix : str
        Prepended to the cleaned term to build the output column name.
    textcol : str
        Name of the column in ``df`` to search.

    Returns
    -------
    pandas.DataFrame
        One column, named ``prefix`` + the stripped, lower-cased term with
        every non-alphanumeric character replaced by ``_``, carrying the
        per-row result of ``str.contains`` on ``df[textcol]``.

    Notes
    -----
    Because of the trailing word-boundary anchor, a term that ends in a
    non-word character (for example an escaped period) can only match when
    that character is immediately followed by a word character.
    """
    # output column name: prefix + term with non-alphanumerics turned into "_"
    safe_name = re.sub('[^0-9A-Za-z]', '_', term.strip().lower())
    column = prefix + safe_name

    # anchor the term at word boundaries on both ends
    pattern = '\\b' + term + '\\b'
    hits = df[textcol].str.contains(pattern, regex=True, case=False)
    return pd.DataFrame({column: hits})

# get terms from text file
def get_terms(termpath):
    """Read search terms from a plain-text term list.

    Every line is stripped of surrounding whitespace and lower-cased.
    Blank lines and lines whose first character is ``#`` are skipped; each
    remaining line becomes one term.

    Parameters
    ----------
    termpath : str
        Path to the term list file, one term per line.

    Returns
    -------
    list of str
        The terms, in file order.
    """
    terms = []
    with open(termpath, 'r') as tfile:
        for raw_line in tfile:
            line = raw_line.strip().lower()
            # Check for emptiness before looking at the first character:
            # the previous `(line[0] != "#") & (line != "")` evaluated
            # line[0] unconditionally and raised IndexError on blank lines.
            if line and not line.startswith("#"):
                terms.append(line)
    return terms

## Cancer Campaign Identification

In [2]:
cancer_terms_fpath = "GoFundMeUU/termlists/final_cancer.txt"
# load through the path variable (it was previously defined but never used,
# with the same literal repeated in the call)
cterms = get_terms(cancer_terms_fpath)
print(cterms)

# one single-column frame of term hits per cancer term, placed side by side
# NOTE(review): column names replace every non-alphanumeric character with
# "_", so terms differing only in punctuation/spacing (e.g.
# "non-hodgkins lymphoma" vs "non hodgkins lymphoma") yield identically named
# columns in cdf -- confirm downstream steps tolerate duplicate column names.
cdf_list = [detect_term(t, medcamp, prefix='c_') for t in cterms]
cdf = pd.concat(cdf_list, axis=1, ignore_index=False)

# summary variables - number of term hits and any term hits
cdf["c_total"] = cdf.sum(axis=1)
cdf["c_any"] = cdf.c_total > 0

# join back to main dataset
medcamp = medcamp.join(cdf)

['malignan.*', 'cancer', 'carcinoma', 'radiation therap.*', 'radiotherap.*', 'immune therap.*', 'immunotherap.*', 'chemo', 'chemotherap.*', 'mastectomy', 'lumpectomy', 'lukemia', 'leukemia', 'luekemia', 'lymphoma', 'melanoma', 'glioblastoma', 'myeloma', 'myloma', 'neuroblastoma', 'neurblastoma', 'neruoblastoma', 'nuroblastoma', 'nueroblastoma', 'astrocytoma', 'renal cell', 'squamous cell', 'medulloblastoma', 'adenocarcinoma', 'retinoblastoma', 'ductal carcinoma', 'neuroendocrine tumor', 'histiocytosis', 'carcinoid', 'germ cell tumor', 'desmoplastic', 'wilms tumor', "wilm\\'s tumor", 'seminoma', 'ependymoma', 'thymoma', 'langerhans', 'ductile carcinoma', 'oligodendroglioma', 'clear cell', 'non-hodgkins lymphoma', 'non hodgkins lymphoma', 'nonhodgkins lymphoma', 'rhadbdomyosarcoma', 'rhabdomyosaroma']


## LGBTQ+ Campaign Identification

In [3]:
# repeat for lgbtq+
lgbtq_terms_fpath = "GoFundMeUU/termlists/final_lgbtq.txt"
# load through the path variable (it was previously defined but never used,
# with the same literal repeated in the call)
qterms = get_terms(lgbtq_terms_fpath)
print(qterms)

# test potentially trickier terms to make sure this works
testdf = pd.DataFrame(
    {'alltext': ['drag king', 'drag on', 'they/them', 'they', 'he/him', '2 spirit']}
)
test_results = pd.concat([detect_term(t, testdf, prefix='q_') for t in qterms], axis=1, ignore_index=False)
# (result column, row of testdf, expected hit)
expected_hits = [
    ('q_drag_k__', 0, True),
    ('q_drag_k__', 1, False),
    ('q_they____', 2, True),
    ('q_they____', 3, False),
    ('q_they____', 4, False),
    ('q_2_spirit', 5, True),
]
# The outcome used to be computed with a bare all([...]) expression whose
# value was discarded, so a failing check went unnoticed. Stop here instead.
failed_checks = [
    (col, row) for col, row, want in expected_hits
    if bool(test_results[col][row]) != want
]
if failed_checks:
    raise AssertionError(
        "LGBTQ+ term detection self-test failed for (column, row): {}".format(failed_checks)
    )

# now do detection on whole dataset
qdf_list = [detect_term(t, medcamp, prefix='q_') for t in qterms]
qdf = pd.concat(qdf_list, axis=1, ignore_index=False)

# overall summary - number of term hits and any term hits
qdf["q_total"] = qdf.sum(axis=1)
qdf["q_any"] = qdf.q_total > 0

# merge back to main file
medcamp = medcamp.join(qdf)

['ace', 'aromantic', 'asexual', 'bisexual', 'demi', 'dyke', 'fag', 'gay', 'lesbian', 'lgbt.*', 'pansexual', 'poly', 'queer', 'same\\-gender loving', 'her girlfriend', 'her wife', 'his boyfriend', 'his husband', '2 spirit', '2\\-spirit', 'afab', 'agender', 'amab', 'assigned female at birth', 'assigned male at birth', 'bigender', 'drag p.*', 'drag k.*', 'drag q.*', 'enby', 'femme', 'ftm', 'gender aff.*', 'gender confirmation', 'gender dysphoria', 'gender euphoria', 'gender f.*', 'gender non.*', 'gender queer', 'gender transition', 'gender\\-aff.*', 'genderf.*', 'genderqueer', 'gnc', 'intersex', 'masc.*', 'mtf', 'mx\\.', 'nb', 'non-binary', 'nonbinary', 'omnigender', 'qtpoc', 'trans', 'transgender', 'transsexual', 'two spirit', 'two\\-spirit', 'ey\\/.*', 'ne\\/.*', 'they\\/.*', 've\\/.*', 'xe\\/.*', 'zie\\/.*', 'bottom surg.*', 'phalloplasty', 'sex reassignment', 'top surgery', 'vaginoplasty', 'fruity', 'sexual and gender minority', 'sgm']


## Write out datasets

In [4]:
# cleanup - drop the alltext column
medcamp = medcamp.drop(columns=['alltext'])

# NOTE: to_csv's `line_terminator` keyword was deprecated in pandas 1.5 and
# removed in pandas 2.0 (TypeError there); `lineterminator` is the supported
# spelling and requires pandas >= 1.5.

# test LGBTQ dataset for eyeballing success of terms
qmed = medcamp.query('q_any')
qmed.to_csv("GoFundMeUU/data/lgbtq_med_campaigns.csv",
    lineterminator='\n', index=True, encoding='utf-8', quoting=csv.QUOTE_ALL)

## write out dataset with cancer/lgbtq detection
medcamp.to_csv("GoFundMeUU/data/analytic_file.csv",
    lineterminator='\n', index=True, encoding='utf-8', quoting=csv.QUOTE_ALL)