In [2]:
# for connecting to a database 
import os
import psycopg2 as pg

# For data manipulation and analysis
import pandas as pd
import pandas.io.sql as psql
# Widen the per-column display limit to 200 characters so abstract previews are readable.
pd.set_option('display.max_colwidth', 200)

In [3]:
# Open a PostgreSQL connection. The user name and password are read from the
# db_user / db_pwd environment variables, so no credentials live in the notebook.
connection = pg.connect(host = 'postgis1', database = 'sdad',
                        user = os.environ.get('db_user'),
                        password = os.environ.get('db_pwd'))

# Keep the SQL text under its own name so that `diversity_abstracts` only ever
# refers to the resulting DataFrame (never to the query string).
diversity_abstracts_query = '''SELECT fk_pmid, year, abstract, diversity, soc_diversity
                         FROM pubmed_2021.soc_diversity_abstracts
                         WHERE diversity = 1 OR soc_diversity = 1'''

# Run the query into a DataFrame and always release the connection afterwards,
# even if the query raises. NOTE: `connection` is closed after this cell; call
# pg.connect(...) again if a later cell needs the database.
try:
    diversity_abstracts = pd.read_sql_query(diversity_abstracts_query, con=connection)
finally:
    connection.close()

# Preview the first rows of the downloaded abstracts.
diversity_abstracts.head()

Unnamed: 0,fk_pmid,year,abstract,diversity,soc_diversity
0,8956565,1996,"OBJECTIVE:\nDetermination of skeletal or bone age is often used in pediatrics and orthopedics. The most commonly used bone age standards in the United States, those published by Greulich and Pyle,...",1.0,1.0
1,30764487,2019,"Prenatal tobacco exposure is a significant, preventable cause of childhood morbidity, yet little is known about exposure risks for many race/ethnic subpopulations. We studied active smoking and en...",1.0,1.0
2,17015498,2006,"OBJECTIVE:\nAmong premature infants, formula feeding increases the risk for necrotizing enterocolitis, delayed brainstem maturation, decreased scoring on cognitive and developmental tests, and del...",1.0,1.0
3,29020025,2017,OBJECTIVE:\nThe causes of the large and persistent Black-White disparity in preterm birth (PTB) are unknown. It is biologically plausible that chronic stress across a woman's life course could be ...,1.0,1.0
4,21863024,2011,BACKGROUND:\nThis study aimed to examine the incidence and survival of lung cancer patients from several different ethnic groups in a large ethnically diverse population in the United Kingdom.\nME...,1.0,1.0


In [4]:
# Row count of the downloaded abstracts frame.
diversity_abstracts.shape[0]

18301

In [8]:
# Tally the abstracts by the value of their soc_diversity flag.
diversity_abstracts.loc[:, 'soc_diversity'].value_counts()

1.0    11789
0.0     6512
Name: soc_diversity, dtype: int64

In [12]:
# Assign each abstract to a sampling window by publication year:
# 1990-2000 -> 'early', 2010-2020 -> 'later'. Years outside both windows are
# not in the lookup, so they map to NaN and are filtered out below.
val = {
    **dict.fromkeys(range(1990, 2001), 'early'),
    **dict.fromkeys(range(2010, 2021), 'later'),
}
diversity_abstracts['sample'] = diversity_abstracts['year'].map(val, na_action = 'ignore')

# Keep only the rows that received a window label.
diversity_abstracts_output = diversity_abstracts.loc[diversity_abstracts['sample'].notna()]
diversity_abstracts_output

Unnamed: 0,fk_pmid,year,abstract,diversity,soc_diversity,sample
0,8956565,1996,"OBJECTIVE:\nDetermination of skeletal or bone age is often used in pediatrics and orthopedics. The most commonly used bone age standards in the United States, those published by Greulich and Pyle,...",1.0,1.0,early
1,30764487,2019,"Prenatal tobacco exposure is a significant, preventable cause of childhood morbidity, yet little is known about exposure risks for many race/ethnic subpopulations. We studied active smoking and en...",1.0,1.0,later
3,29020025,2017,OBJECTIVE:\nThe causes of the large and persistent Black-White disparity in preterm birth (PTB) are unknown. It is biologically plausible that chronic stress across a woman's life course could be ...,1.0,1.0,later
4,21863024,2011,BACKGROUND:\nThis study aimed to examine the incidence and survival of lung cancer patients from several different ethnic groups in a large ethnically diverse population in the United Kingdom.\nME...,1.0,1.0,later
7,29494587,2018,"BACKGROUND:\nMigrant and ethnic minority groups are often assumed to have poor health relative to the majority population. Few countries have the capacity to study a key indicator, mortality, by e...",1.0,1.0,later
...,...,...,...,...,...,...
18296,33296699,2020,Variants of bladder exstrophy are a rare but diverse spectrum of bladder exstrophy-epispadias complex. This case series describes a group of four unique exstrophy variant cases who had an intact p...,1.0,0.0,later
18297,33303767,2020,To better predict population evolution of invasive species in introduced areas it is critical to identify and understand the mechanisms driving genetic diversity and structure in their native rang...,1.0,0.0,later
18298,33303927,2020,"Attachment of microorganisms to natural or artificial surfaces and the development of biofilms are complex processes which can be influenced by several factors. Nevertheless, our knowledge on biof...",1.0,0.0,later
18299,33306734,2020,"Next-Generation Sequencing (NGS) technologies, by reducing the cost and increasing the throughput of sequencing, have opened doors to generate genomic data in a range of previously poorly studied ...",1.0,0.0,later


In [14]:
# A notebook cell only echoes its final expression, so the row count must be
# printed explicitly; as a bare expression it was computed and silently discarded.
print(len(diversity_abstracts_output))

# Breakdown of the filtered sample by the soc_diversity flag.
diversity_abstracts_output['soc_diversity'].value_counts()

1.0    9514
0.0    5556
Name: soc_diversity, dtype: int64

In [17]:
# Write the filtered sample to CSV without the index column. The destination
# directory defaults to the original project location but can be redirected via
# the BERT_DATA_DIR environment variable, so the notebook is not tied to one
# user's home directory.
bert_data_dir = os.environ.get('BERT_DATA_DIR', '/sfs/qumulo/qhome/kb7hp/git/diversity/data/bert_data')
diversity_abstracts_output.to_csv(os.path.join(bert_data_dir, 'diversity_test_abstracts.csv'), index=False)