# Import libraries

In [1]:
import os
import shutil
import json
import math
import random
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

# https://www.thepythoncode.com/article/extract-pdf-images-in-python
import fitz # PyMuPDF
import io
from PIL import Image

# https://github.com/joke2k/faker
from faker import Faker
fake = Faker()

# Load Bik data

In [2]:
# Load data into dataframe from file
# Bik_v2 is updated with First Author Affiliation completely filled
bik_df = pd.read_csv('Bik_v2.tsv', sep='\t', encoding='unicode-escape')

# Show all columns
pd.set_option('display.max_columns', None)

# Clean data

In [3]:
# Drop first column (indices), since pandas automatically provides indices - column 0
# Drop duplicate university name column - column 27
bik_df = bik_df.drop(bik_df.columns[[0,27]], axis=1)

In [4]:
# Reformat lab size as single integer (instead of list of integers)
bik_df["Lab Size"] = [len(i) for i in bik_df["Lab Size"]]

In [5]:
# Remove special characters from Affiliations
# Easier for LaTeX

fa_aff = bik_df["First Author Affiliation"].tolist()
fa_aff = [i.strip() if isinstance(i,str) else np.nan for i in fa_aff]

for i in range(len(fa_aff)):
    if fa_aff[i] == "UniversitÃ\xa0 degli Studi di Bari Aldo Moro":
        fa_aff[i] = "University of Bari Aldo Moro"
    elif fa_aff[i] == "Blood transfusion centre of Slovenia - Zavod Republike Slovenije za transfuzijsko medicino":
        fa_aff[i] = "Blood Transfusion Centre of Slovenia"
    elif fa_aff[i] == "Justus-Liebig-UniversitÃ¤t GieÃ\x9fen":
        fa_aff[i] = "Justus Liebig University Giessen"
    elif fa_aff[i] == "University of Wisconsinâ\x80\x93Madison":
        fa_aff[i] = "University of Wisconsin-Madison"
    elif fa_aff[i] == "University of Iowa, Iowa City, USA":
        fa_aff[i] = "University of Iowa"
    elif fa_aff[i] == "Universidad PolitÃ©cnica de Valencia-C.S.I.C":
        fa_aff[i] = "Technical University of Valencia"
    elif fa_aff[i] == "Concordia Universityâ\x80\x93Ann Arbor":
        fa_aff[i] = "Concordia University Ann Arbor"
    elif fa_aff[i] == "University of Maryland, College Park":
        fa_aff[i] = "University of Maryland"
    elif fa_aff[i] == "UniversitÃ¤tsklinikum Erlangen":
        fa_aff[i] = "University Hospital Erlangen"
    elif fa_aff[i] == "Universidad AndrÃ©s Bello":
        fa_aff[i] = "Andres Bello National University"
    elif fa_aff[i] == "Hospital Son Dureta and Instituto Universitario de InvestigaciÃ³n en Ciencias de la Salud":
        fa_aff[i] = "Hospital Son Dureta and Instituto Universitario de Investigacion en Ciencias de la Salud"
    elif fa_aff[i] == "Department of OncologyFaculty of MedicineMcGill UniversityLady Davis Institute-Segal Cancer Center from the Jewish General HospitalMontrealQuebecCanada":
        fa_aff[i] = "McGill University"
    elif fa_aff[i] == "Cancer Research LaboratoryFudan University Shanghai Cancer CenterShanghai China":
        fa_aff[i] = "Fudan University"
    elif fa_aff[i] == "Department of Biochemistry and Molecular BiophysicsBiochemistry and Molecular and Cellular Biology Graduate ProgramUniversity of ArizonaTucson AZ USA":
        fa_aff[i] = "University of Arizona"
    elif fa_aff[i] == "Department of Experimental and Clinical MedicineUniversitÃ\xa0 Politecnica delle MarcheVia Tronto 10/aAncona60020 Italy":
        fa_aff[i] = "Marche Polytechnic University Faculty of Medicine"
    elif fa_aff[i] == "UniversitÃ© de Strasbourg-CNRS" or fa_aff[i] == "CNRS UniversitÃ© de Strasbourg":
        fa_aff[i] = "University of Strasbourg"
    elif fa_aff[i] == "Divisiont of Urology Kobe University Graduate School of Medicine Kobe Hyogo 650-0017 Japan":
        fa_aff[i] = "Kobe University"
    elif fa_aff[i] == "School of Human Science and Environment University of Hyogo Himeji Hyogo 670-0092 Japan":
        fa_aff[i] = "University of Hyogo"
    elif fa_aff[i] == "Johns Hopkins University School of Medicine and the Sidney Kimmel Comprehensive Cancer Center at Johns Hopkins":
        fa_aff[i] = "Johns Hopkins University"
    elif fa_aff[i] == "UniversitÃ© Paris-Saclay":
        fa_aff[i] = "University of Paris-Saclay"
    elif fa_aff[i] == "INFN - Istituto Nazionale di Fisica Nuclear":
        fa_aff[i] = "Istituto Nazionale di Fisica Nuclear"
    elif fa_aff[i] == 'UniversitÃ\xa0 degli Studi della Campania "Luigi Vanvitelli':
        fa_aff[i] = "University of Campania Luigi Vanvitelli"
    elif fa_aff[i] == "UniversitÃ© CÃ´te d'Azur":
        fa_aff[i] = "Cote d'Azur University"
    elif fa_aff[i] == "Institut de CancÃ©rologie Gustave Roussy":
        fa_aff[i] = "Gustave Roussy Institute of Cancerology"
    elif fa_aff[i] == "Graduate School at Shenzhen, Tsinghua Univeristy":
        fa_aff[i] = "Tsinghua University"
    elif fa_aff[i] == "Ecole Normale SupÃ©rieure de Paris":
        fa_aff[i] = "Ecole Normale Superieure, Paris"   

In [6]:
bik_df = bik_df.drop(columns=["First Author Affiliation"])
bik_df.insert(22,"First Author Affiliation",fa_aff)

In [None]:
# bik_df

# Download PDFs

- **USC VPN required**: `Wiley` [119:121], `Cancer` [135:143], `SciDirect` [144:177], `Science` [178:180], `Nature` [181:187]

In [None]:
# Initialize dictionary of Elsevier IDs to use later for file renaming
elsevierID = {}

# Download PDFs
def download_pdf(df,url,indx):
    
    # Create PDFS folder to save PDFs to
    folder_location = r'PDFS'
    if not os.path.exists(folder_location):os.mkdir(folder_location)
    
    # Get PDF download link
    if df["Home Site"][indx] == "PLOS":
        pdf_link = "https://journals.plos.org/plosone/article/file?id=" + df["DOI"][indx] + "&type=printable"
    elif df["Home Site"][indx] == "ASM":
        pdf_link = "https://journals.asm.org/doi/pdf/" + df["DOI"][indx]
    elif df["Home Site"][indx] == "PubMed":
        # Scrape page
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")
        # Get DOI
        doi = soup.find("a",{"data-ga-action":"DOI"}).text.replace("\n","").strip()
        # Replace PMID with DOI in dataframe
        df.at[indx,'DOI'] = doi
        # PDF download link format for ASM Journals
        pdf_link = "https://journals.asm.org/doi/pdf/" + doi
    elif df["Home Site"][indx] == "Hindawi":
        if df["Citation"][indx][:3] == "Bio":
            pdf_link = "https://downloads.hindawi.com/journals/bmri/" + "/".join(df["DOI"][indx].split("/")[-2:]) + ".pdf"
        else: # == "Evi"
            pdf_link = "https://downloads.hindawi.com/journals/ecam/" + "/".join(df["DOI"][indx].split("/")[-2:]) + ".pdf"
    elif df["Home Site"][indx] == "Wiley":
        if df["DOI"][indx][3:7] == "1002":
            pdf_link = "https://onlinelibrary.wiley.com/doi/pdf/" + df["DOI"][indx]
        else: # == "1111"
            pdf_link = "https://sfamjournals.onlinelibrary.wiley.com/doi/pdf/" + df["DOI"][indx]
    elif df["Home Site"][indx] == "BioMed":
        if df["DOI"][indx][8:10] == "gb":
            pdf_link = "https://genomebiology.biomedcentral.com/track/pdf/" + df["DOI"][indx] + ".pdf"
        elif df["DOI"][indx][8:10] == "bc":
            pdf_link = "https://breast-cancer-research.biomedcentral.com/track/pdf/" + df["DOI"][indx] + ".pdf"
        else: # == "14"
            pdf_link = "https://bmcmicrobiol.biomedcentral.com/track/pdf/" + df["DOI"][indx] + ".pdf"
    elif df["Home Site"][indx] == "Cancer":
        if df["DOI"][indx][8:12] == "j.cc":
            # Scrape page
            s = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service = s)
            driver.get(url)
            html = driver.page_source
            soup = BeautifulSoup(html)
            # Choose PDF viewing option
            pdf_options = soup.find("li", class_="article-tools__item article-tools__pdf").find_all("a")
            for a in pdf_options:
                if pdf_options[0]["href"] == "#":
                    url_path = pdf_options[1]["href"]
                else:
                    url_path = pdf_options[0]["href"]
            url_domain = "https://www.cell.com"
            # Combine URL domain and scraped path
            pdf_link = url_domain + url_path
        else: # == S016 or j.lu
            s = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service = s)
            driver.get(url)
            time.sleep(5) # Wait while URL redirects
            driver.switch_to.window(driver.window_handles[-1])
            pdf_link = driver.current_url.split("?")[0] + "/pdfft?isDTMRedir=true&download=true"
            elsevierID[indx] = str(driver.current_url.split("?")[0].split("/")[-1])
    elif df["Home Site"][indx] == "SciDirect":
        # Scrape page
        s = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service = s)
        driver.get(url)
        time.sleep(5) # Wait while URL redirects
        driver.switch_to.window(driver.window_handles[-1])
        pdf_link = driver.current_url.split("?")[0] + "/pdfft?isDTMRedir=true&download=true"
        elsevierID[indx] = str(driver.current_url.split("?")[0].split("/")[-1]) 
    elif df["Home Site"][indx] == "Science":
        pdf_link = "https://www.science.org/doi/pdf/" + df["DOI"][indx]
    elif df["Home Site"][indx] == "Nature":
        if df["DOI"][indx][8:11] == "onc":
            pdf_link = "https://www.nature.com/articles/" + df["DOI"][indx].split("/")[1].replace(".","") + ".pdf"
        else:
            pdf_link = "https://www.nature.com/articles/" + df["DOI"][indx].split("/")[1] + ".pdf"
    elif df["Home Site"][indx] == "RUPress":
        pdf_link = "https://rupress.org/jcb/article-pdf/199/3/481/1357646/" + df["DOI"][indx].split("/")[1].replace(".","_") + ".pdf"
    elif df["Home Site"][indx] == "Spandidos":
        pdf_link = "https://www.spandidos-publications.com/" + df["DOI"][indx] + "/download"
    elif df["Home Site"][indx] == "PNAS":
        pdf_link = "https://www.pnas.org/doi/pdf/" + df["DOI"][indx]
    else:
        pdf_link = str(indx) + ": ERROR - Link not recognized."
    
    # Feedback
    print(pdf_link)
    
    # File name: Bik index
    filename = folder_location + "/" + str(indx) + ".pdf"
    
    # Download PDF from link
    # https://stackoverflow.com/questions/54616638/download-all-pdf-files-from-a-website-using-python
    try:
        # Not blocked by Cloudflare
        if df["Home Site"][indx] in ["PLOS","Hindawi","BioMed","Cancer","Nature","Spandidos"]:
            with open(filename, 'wb') as f:
                f.write(requests.get(pdf_link).content)
        # Bypass Cloudflare
        elif df["Home Site"][indx] in ["ASM"]:#,"PubMed","Wiley","Cancer","SciDirect","Science","RUPress","PNAS"]:
            s = Service(ChromeDriverManager().install())
            # Bypass Cloudflare: https://blog.m157q.tw/posts/2020/09/11/bypass-cloudflare-detection-while-using-selenium-with-chromedriver
            options = webdriver.ChromeOptions()
            options.add_experimental_option("excludeSwitches", ["enable-automation"])
            options.add_experimental_option('useAutomationExtension', False)
            options.add_argument("--disable-blink-features=AutomationControlled")
            # Download PDF with Selenium: https://stackoverflow.com/questions/43149534/selenium-webdriver-how-to-download-a-pdf-file-with-python
            options.add_experimental_option('prefs', {
            "download.default_directory": "/Users/auderoy/dsci550/Assignment 2/PDFS", # SET FILE PATH
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "plugins.always_open_pdf_externally": True
            })
            driver = webdriver.Chrome(service=s,options=options)
            driver.get(pdf_link)
            time.sleep(5)
        # Bypass Cloudflare, click to confirm download
        elif df["Home Site"][indx] == "Wiley":
            s = Service(ChromeDriverManager().install())
            # Bypass Cloudflare
            options = webdriver.ChromeOptions()
            options.add_experimental_option("excludeSwitches", ["enable-automation"])
            options.add_experimental_option('useAutomationExtension', False)
            options.add_argument("--disable-blink-features=AutomationControlled")
            # Download PDF with Selenium
            options.add_experimental_option('prefs', {
            "download.default_directory": "/Users/auderoy/dsci550/Assignment 2/PDFS", # SET FILE PATH
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "plugins.always_open_pdf_externally": True
            })
            driver = webdriver.Chrome(service=s,options=options)
            driver.get(pdf_link)
            time.sleep(5)
            # Click button in iframe to confirm download: https://www.selenium.dev/documentation/webdriver/browser/frames
            driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR,'body.pb-ui').find_element(By.TAG_NAME,'iframe'))
            driver.find_element(By.XPATH,'''/html/body/div/div/a/button''').click()
            time.sleep(5)    
    except:
        print("------------------------------")
        print(f"ERROR: Index - {indx}, Home Site - {df['Home Site'][indx]}, PDF link - {pdf_link}")
        print("------------------------------")

In [None]:
# Call download_pdf for each paper
for index, row in bik_df.iterrows():
    print("INDEX: ", index)
    print(row["Home Site"], ", ", row["DOI"])
    print(row["Title"])
    print(row["URL"])
    # download_pdf(bik_df,row["URL"],index) # PDFs already downloaded; uncomment to redownload

In [8]:
# PubMed DOI saved for convenience, derived from download_pdf
# Update PubMed PMID with DOI

bik_df.at[51,'DOI'] = "10.1128/iai.64.6.2282-2287.1996"
bik_df.at[52,'DOI'] = "10.1128/IAI.67.1.80-87.1999"
bik_df.at[80,'DOI'] = "10.1128/JCM.36.6.1666-1673.1998"

## Rename PDFs downloaded with Selenium &rarr; "{Bik index}.pdf"

In [None]:
# Dictionary saved for convenience, derived from download_pdf
# Matching Bik index to Elselvier ID
elsevierID = {135: 'S0169500201002124',
 136: 'S0169500203002393',
 137: 'S016950020600287X',
 138: 'S0169500206003308',
 139: 'S0169500208003085',
 140: 'S0169500209003687',
 141: 'S0169500209005418',
 142: 'S0169500211000481',
 143: 'S0169500211005150',
 144: 'S089684110500123X',
 145: 'S0896841104001234',
 146: 'S0896841106000357',
 147: 'S0896841107000182',
 148: 'S089684110800139X',
 149: 'S089684111400064X',
 150: 'S1043466603004423',
 151: 'S1043466605000323',
 152: 'S1043466605000128',
 153: 'S104346660600216X',
 154: 'S1043466607000038',
 155: 'S1043466606003450',
 156: 'S1043466606000020',
 157: 'S1043466607000828',
 158: 'S1043466608001853',
 159: 'S1043466608001816',
 160: 'S1043466608000021',
 161: 'S1043466608000409',
 162: 'S1043466609001896',
 163: 'S1043466609000271',
 164: 'S1043466609000325',
 165: 'S1043466609000179',
 166: 'S1043466609008813',
 167: 'S1043466611001839',
 168: 'S1043466611006806',
 169: 'S1043466611000573',
 170: 'S1043466610006897',
 171: 'S1043466611001827',
 172: 'S1043466612001597',
 173: 'S1043466612006667',
 174: 'S1043466612002177',
 175: 'S1043466613001749',
 176: 'S1043466613001531',
 177: 'S1043466614004748'}

In [None]:
# Rename ASM, PubMed, Science, RUPress, PNAS papers
# SET FILE PATH (x3)
for f in os.listdir('PDFS'):
    count = 0
    for c in bik_df["DOI"]:
        if c[12:] == f[4:-4]:
            shutil.move(os.path.join('PDFS',f),os.path.join('PDFS',f"{count}.pdf"))
        else:
            count += 1

In [None]:
# Rename Elsevier (Cancer, SciDirect) papers
# SET FILE PATH (x3)
for f in os.listdir('PDFS'):
    for key,value in elsevierID.items():
        try:
            if value in f.split("-")[2]:
                shutil.move(os.path.join('PDFS',f),os.path.join('PDFS',f"{key}.pdf"))
        except:
            continue
            
            

In [None]:
# Rename Wiley papers
# Manual: 111, 114, 116, 119
# SET FILE PATH (x3)
for f in os.listdir('PDFS'):
    count = 0
    try:
        filename = f.split("-")[3:][0].strip()
    except:
        filename = f
    for c in bik_df["Title"]:
        if c[:15] == filename[:15]:
            shutil.move(os.path.join('PDFS',f),os.path.join('PDFS',f"{count}.pdf"))
        else:
            count += 1

# Extract images from PDFs

- [The Python Code: Extract PDF images in Python](https://www.thepythoncode.com/article/extract-pdf-images-in-python)
- Different organizational methods for image generation tests

## Method 1: Save images to individual folders (per paper)

In [None]:
def extract_images(df,file,indx):
    
    print("------------------------------")
    print("INDEX: ", indx)
    
    # Open file
    pdf_file = fitz.open(file)
    
    # Create folder for images
    folder_location = "images/extracted_images/" + df["Title"][indx][:240].replace("/","-") + "-images"
    if not os.path.exists(folder_location):os.mkdir(folder_location)
        
    # Iterate over PDF pages
    for page_index in range(len(pdf_file)):
        # Get page
        page = pdf_file[page_index]
        image_list = page.get_images()
        # Print number of images found on page
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images on page {page_index}")
        else:
            print("[!] No images found on page", page_index)
        for image_index, img in enumerate(page.get_images(), start=1):
            try:
                # Get image XREF
                xref = img[0]
                # Extract image bytes
                base_image = pdf_file.extract_image(xref)
                image_bytes = base_image["image"]
                # Get image extension
                image_ext = base_image["ext"]
                # Load to PIL
                image = Image.open(io.BytesIO(image_bytes))
                # Save to local disk
                image.save(open(f"{folder_location}/image{page_index+1}_{image_index}.{image_ext}", "wb"))
            except:
                continue

In [None]:
# Call extract_images for each PDF
# PDFs named by index
for i in range(0,214):
    try:
        pdf_file = f"PDFS/{i}.pdf"
        extract_images(bik_df,pdf_file,i)
    except:
        continue

## Method 2: Save all images to one folder

In [None]:
def extract_images_new(df,file,indx):
    
    print("------------------------------")
    print("INDEX: ", indx)
    
    # Open file
    pdf_file = fitz.open(file)
    
    # Create folder for images
    folder_location = "images/extracted_images_new"
    if not os.path.exists(folder_location):os.mkdir(folder_location)
    
    # Iterate over PDF pages
    for page_index in range(len(pdf_file)):
        # Get page
        page = pdf_file[page_index]
        image_list = page.get_images()
        # Print number of images found on page
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images on page {page_index}")
        else:
            print("[!] No images found on page", page_index)
        for image_index, img in enumerate(page.get_images(), start=1):
            try:
                # Get image XREF
                xref = img[0]
                # Extract image bytes
                base_image = pdf_file.extract_image(xref)
                image_bytes = base_image["image"]
                # Get image extension
                image_ext = base_image["ext"]
                # Load to PIL
                image = Image.open(io.BytesIO(image_bytes))
                # Save to local disk
                image.save(open(f"{folder_location}/image{indx}_{page_index+1}_{image_index}.{image_ext}", "wb"))
            except:
                continue

In [None]:
# Call extract_images_new for each PDF
# PDFs named by index
for i in range(0,214):
    try:
        pdf_file = f"PDFS/{i}.pdf"
        extract_images_new(bik_df,pdf_file,i)
    except:
        continue

# Generate fake images

- [GitHub: Face Generator](https://github.com/gsurma/face_generator)
- If using your own dataset, generate text file below, then run `FaceGeneratorDCGAN.ipynb`

In [None]:
# Export image file names to extracted_images.txt for FaceGeneratorDCGAN.ipynb

imagepaths = []

for folder in os.listdir('images/extracted_images'):
    try:
        for image in os.listdir(f'images/extracted_images/{folder}'):
            imagepaths.append(f"{folder}/{image}")
    except:
        continue

with open('images/extracted_images.txt', 'w') as f:
    for imagepath in imagepaths:
        if imagepath[-9:] != ".DS_Store":
            f.write(imagepath)
            f.write('\n')

# Generate fake captions

- Build Docker
- Execute `python caption_generation/part6.py` on Terminal
- Output file: `out.out`
- Import below and parse

In [10]:
# Convert string with multiple dictionaries: https://stackoverflow.com/questions/20037630/how-to-convert-a-string-with-multiple-dictionaries-so-json-load-can-parse-it

# Load file
c = open("../caption_generation/out.out", "r")
caption_str = c.read()

# Put each output on a new line
caption_str = caption_str.replace("}}","}}\n")

# Make a list (of strings) from each line
caption_list = [d.strip() for d in caption_str.splitlines()]
caption_list

# Parse JSON strings as Python dictionaries
caption_dict = [json.loads(i) for i in caption_list]
caption_dict

# Choose caption with highest confidence
captions = []
for i in caption_dict:
    best_caption = max(i["captions"], key=lambda x:x["confidence"])
    captions.append(best_caption["sentence"])
captions
%store captions


Stored 'captions' (list)


# Generate fake features

## Author names

In [11]:
# List of all authors from all Bik papers
authors214 = bik_df["Authors"].tolist()

# List of number of authors per paper
authors_per_paper = []
for i in authors214:
    authors_per_paper.append(len(i.split(",")))
# authors_per_paper

In [12]:
# Generate list of author names (string type)
def generate_authors():
    # for i in range(0,random.randint(3,9)): # Most papers have 3-9 authors
    for i in range(0,random.choice(authors_per_paper)): # Weighted random
        name = fake.name()
        # First author
        if i == 0:
            authors = name
        # Subsequent authors require leading comma
        else:
            authors = authors + ", " + name
    return authors

## Affiliations

In [13]:
# Sample affiliation from Bik papers
# Remove empty floats
affs214 = bik_df["First Author Affiliation"].tolist()
sample_aff = [i for i in affs214 if isinstance(i,str)]

# Generate affiliation
def generate_aff():
    return random.choice(sample_aff)

## Other

In [14]:
# Sample all values from dataframe column to get weighted random sample

def weighted_random(df,col_name):
    values214 = df[col_name].tolist()
    return random.choice(values214)

In [15]:
# Year

def generate_year():
    return weighted_random(bik_df,"Year")

In [16]:
# Month

months214 = bik_df["Month"].tolist()
sample_months = [i for i in months214 if i>0] # Remove nan from list

def generate_month():
    return random.choice(sample_months)

months_abc = {1:"January",
             2:"February",
             3:"March",
             4:"April",
             5:"May",
             6:"June",
             7:"July",
             8:"August"}

In [17]:
# 0

def generate_0():
    return weighted_random(bik_df,"0")

In [18]:
# 1

def generate_1():
    return weighted_random(bik_df,"1")

In [19]:
# 2

def generate_2():
    return weighted_random(bik_df,"2")

In [20]:
# 3

def generate_3():
    return weighted_random(bik_df,"3")

In [21]:
# Findings

def generate_findings():
    return weighted_random(bik_df,"FINDINGS")

In [22]:
# Retraction

def generate_retraction():
    return weighted_random(bik_df,"Retraction")

In [23]:
# Correction Date

sample_correctionrange = []
for i in range(0,214):
    m = bik_df["Month"][i]
    y = bik_df["Year"][i]
    c = bik_df["Correction Date"][i]
    if isinstance(c,str):
        cm = int(c.split("/")[0])
        cy = int("20" + c.split("/")[2])
        if m > 0:  
            m = int(m)
            if m == cm:
                if cy - y == 0:
                    sample_correctionrange.append(6)
                else:
                    sample_correctionrange.append(12*(cy-y))
            elif m > cm:
                sample_correctionrange.append(12*(cy-y-1) + (12-m) + cm)
            else: # m < cm
                sample_correctionrange.append(12*(cy-y) + (cm-m))
        else: # m == nan
            if cy - y == 0:
                sample_correctionrange.append(6)
            else:
                sample_correctionrange.append(12*(cy-y))
sample_correctionrange

def generate_correctiondate(mo,yr):
    m = mo + random.choice(sample_correctionrange)
    return f"{m%12}/1/{yr + math.floor(m/12)}"

In [24]:
# Correction

def generate_correction():
    return weighted_random(bik_df,"Correction")

In [25]:
# No Action

def generate_noaction():
    return weighted_random(bik_df,"No Action")

In [26]:
# First Author Career Duration

def generate_duration():
    return weighted_random(bik_df,"First Author Career Duration")

In [27]:
# First Author Degree

def generate_degree():
    return random.choices(["PhD","MD","MSc","BSc"],weights=(33/39,4/39,1/39,1/39))

In [28]:
# First Author Degree Area

degreearea214 = bik_df["First Author Degree Area"].tolist()
sample_degreearea = [i for i in degreearea214 if isinstance(i,str)] # Remove nan from list

def generate_degreearea():
    return random.choice(sample_degreearea)

In [29]:
# Home Site

def generate_homesite():
    return weighted_random(bik_df,"Home Site")

In [30]:
# DOI

def generate_doi(hs,yr):
    if hs == "PLOS":
        x = random.choice(["00","10"])
        y = random.randint(0,99999)
        return f"10.1371/journal.pone.{x}{y}"
    elif hs == "ASM" or hs == "PubMed":
        x = random.choice(["mBio","IAI.","JCM."])
        if x == "mBio":
            y = f".00{random.randint(200,500)}-{random.randint(10,15)}"
        elif x == "IAI.":
            y = f"0{random.choice([0,1,5,6])}{random.randint(100,900)}-{random.randint(10,15)}"
        else: # x == "JCM."
            y = f"00{random.randint(100,500)}-0{random.randint(0,9)}"
        return f"10.1128/{x}{y}"
    elif hs == "Hindawi":
        return f"10.1155/{yr}/{random.randint(150000,999999)}"
    elif hs == "Wiley":
        x = random.choice(["10.1002/ijc.","10.1111/j."])
        if x == "10.1002/ijc.":
            y = f"{random.randint(27000,29000)}"
        else: # x == "10.1111/j."
            y = f"{random.randint(1300,1500)}-{random.choice([0,2])}{random.randint(400,999)}.{yr}.0{random.randint(1000,4000)}.x"
        return f"{x}{y}"
    elif hs == "BioMed":
        x = random.choice(["1471","gb","bcr"])
        if x == "1471":
            y = f"-2180-{random.randint(10,15)}-{random.randint(10,200)}"
        elif x == "gb":
            y = f"-{yr}-r{random.randint(100,200)}"
        else: # x == "bcr"
            y = random.randint(3000,4000)
        return f"10.1186/{x}{y}"
    elif hs == "Cancer" or hs == "SciDirect":
        if hs == "Cancer":
            x = random.choice(["S0169","j.lungcan","j.ccr","j.ccell"])
        else: # hs = "SciDirect"
            x = random.choice(["j.jaut","j.cyto"])
        if x == "S0169":
            y = f"-5002(0{random.randint(1,5)})00{random.randint(200,300)}-{random.randint(1,5)}"
        else:
            y = f".{yr}.{random.choice([0,1])}{random.randint(0,9)}.0{random.randint(10,99)}"
        return f"10.1016/{x}{y}"
    elif hs == "Science":
        return f"10.1126/science.{random.randint(1100000,1500000)}"
    elif hs == "Nature":
        x = random.choice(["","nature","onc"])
        if x == "":
            y = random.randint(30000,50000)
        elif x == "nature":
            y = random.randint(10000,15000)
        else: # x == "onc"
            y = f".{yr}.{random.randint(10,999)}"
        return f"10.1038/{x}{y}"
    elif hs == "RUPress":
        return f"10.1083/jcb.{yr}0{random.randint(1000,5000)}"
    elif hs == "Spandidos":
        return f"10.3892/ijo.{yr}.{random.randint(1000,5000)}"
    elif hs == "PNAS":
        return f"10.1073/pnas.{random.randint(1000000000,1500000000)}"

In [31]:
# Pub Rate
sample_pubrate = []
for i in range(0,214):
    pr = bik_df["Pub Rate"][i][1:-1].split(",")
    for j in pr:
        if j not in ["nan"," nan",""]:
            sample_pubrate.append(float(j))

def generate_pubrate():
    return random.choice(sample_pubrate)

In [32]:
# Other Journals

def generate_otherjournals():
    return weighted_random(bik_df,"Other Journals")

## Call all functions to generate all features

In [34]:
authors = []
title = []
year = []
month = []
month_abc = []
zero = []
one = []
two = []
three = []
findings = []
correctiondate = []
retraction = []
correction = []
noaction = []
homesite = []
doi = []
url = []
citation = []
labsize = []
pubrate = []
otherjournals = []
affiliations = []
duration = []
degree = []
degreearea = []

for i in range(0,500):
    
    authors.append(generate_authors())
    
    generated_text = open(f'../fake_text/fake_text/{i}.txt', 'r').read().replace("\\n","\n\n").replace("$","\$")[1:-2]
    t = " ".join(generated_text.split(" ")[:10])
    title.append(t)
    
    year.append(generate_year())
    month.append(generate_month())
    month_abc.append(months_abc[int(month[i])])
    zero.append(generate_0())
    one.append(generate_1())
    two.append(generate_2())
    three.append(generate_3())
    findings.append(generate_findings())
    correctiondate.append(generate_correctiondate(month[i],year[i]))
    retraction.append(generate_retraction())
    correction.append(generate_correction())
    noaction.append(generate_noaction())
    homesite.append(generate_homesite())
    doi.append(generate_doi(homesite[i],year[i]))
    url.append("http://doi.org/" + doi[i])
    citation.append(f"{homesite[i]}. {year[i]}. {random.randint(10,99)}({random.randint(10,99)}), {random.randint(100,999)}.")
    labsize.append(len(authors[i].split(",")))
    pubrate.append(generate_pubrate())
    otherjournals.append(generate_otherjournals())
    affiliations.append(generate_aff())
    duration.append(generate_duration())
    degree.append(generate_degree())
    degreearea.append(generate_degreearea())

# Make accessible to LaTeX.ipynb
%store authors
%store title
%store year
%store month
%store month_abc
%store affiliations

Stored 'authors' (list)
Stored 'title' (list)
Stored 'year' (list)
Stored 'month' (list)
Stored 'month_abc' (list)
Stored 'affiliations' (list)


# Update dataframe

In [35]:
%store -r h_probs
%store -r m_probs

In [36]:
bik_df["(Grover) Falsified Media"] = h_probs
bik_df["Falsified Media (Manually labeled)"] = [0 for i in range(0,214)]
# bik_df

Unnamed: 0,Authors,Title,Citation,DOI,Year,Month,0,1,2,3,FINDINGS,Reported,Correction Date,Retraction,Correction,No Action,SUM \nCompleted,URL,Home Site,Lab Size,Pub Rate,Other Journals,First Author Affiliation,First Author Career Duration,First Author Degree,First Author Degree Area,world_rank_x,country_x,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations_x,broad_impact,patents,score,year_x,world_rank_y,country_y,teaching,international,research,citations_y,income,total_score,num_students,student_staff_ratio,international_students,female_male_ratio,year_y,city_ascii,state_id,state_name,county_fips,county_name,lat,lng,population,density,source,military,incorporated,timezone,ranking,zips,county,labor_force,employed,unemployed,rate,(Grover) Falsified Media,Falsified Media (Manually labeled)
0,"Inka Regine Weingaertner, Sarah Koutnik, Herma...",Chronic Morphine Treatment Attenuates Cell Gro...,"PLOS One, January 2013 | Volume 8 | Issue 1 | ...",10.1371/journal.pone.0053510,2013,1.0,1.0,,1.0,,"duplicated blot, shifted microscopy",1,10/9/15,,1.0,,1,http://doi.org/10.1371/journal.pone.0053510,PLOS,9,"[0.5, nan, nan]","[['PLoS ONE ', ""Naunyn-Schmiedeberg's Archives...",Ludwig-Maximilians-University of Munich,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.632741,0
1,"Jessica M. Esparza, Eileen OÃÂToole, Linya L...",Katanin Localization Requires Triplet Microtub...,"PLOS ONE, January 2013 | Volume 8 | Issue 1 | ...",10.1371/journal.pone.0053940,2013,1.0,,1.0,,,duplicated colony strip,1,1/15/16,,1.0,,1,http://doi.org/10.1371/journal.pone.0053940,PLOS,21,"[0.7647058823529411, nan, nan, 0.23, nan, nan,...","[['PLoS ONE ', 'Molecular Biology of the Cell ...",Washington University in St. Louis,13.0,,Department of Genetics,52.0,USA,35.0,101.0,96.0,71.0,41.0,21.0,29.0,17.0,169.0,55.03,2015.0,,,,,,,,,,,,,,St. Louis,MO,Missouri,29510.0,St. Louis,38.6358,-90.2451,2024074.0,1879.0,polygon,False,True,America/Chicago,1.0,63110 63111 63112 63113 63115 63116 63118 6312...,st.louis,101110,93824,7286,7.2,0.398791,0
2,"Sreedevi Avasarala, Fangfang Zhang, Guangliang...",Curcumin Modulates the Inflammatory Response a...,"PLOS ONE, February 2013 | Volume 8 | Issue 2 |...",10.1371/journal.pone.0057285,2013,2.0,,,1.0,,reuse of one lane in different blot experiment,1,8/4/15,,1.0,,1,http://doi.org/10.1371/journal.pone.0057285,PLOS,18,"[nan, 0.09090909090909091, nan, nan, nan, nan]","[[], ['Current Developments in Nutrition ', 'T...",University of Illinois at Chicago,11.0,,Department of Dental Medicine,203.0,USA,88.0,367.0,541.0,218.0,126.0,196.0,287.0,179.0,129.0,47.39,2015.0,201-225,United States of America,43.0,53.4,34.1,55.1,40.7,-,24313,9.2,17%,53:47:00,2015.0,Chicago,IL,Illinois,17031.0,Cook,41.8373,-87.6862,8604203.0,4574.0,polygon,False,True,America/Chicago,1.0,60018 60649 60641 60640 60643 60642 60645 6064...,cook,8102,7718,384,4.7,0.735152,0
3,"Rounak Nassirpour, Pramod P. Mehta, Sangita M....",miR-221 Promotes Tumorigenesis in Human Triple...,"PLOS ONE, April 2013 | Volume 8 | Issue 4 | e6...",10.1371/journal.pone.0062170,2013,4.0,1.0,,,1.0,Duplicated / mirrored bands in blot,1,4/10/17,1.0,,,1,http://doi.org/10.1371/journal.pone.0062170,PLOS,12,"[nan, nan, nan, nan]","[[], [], [], []]",,10.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.704388,0
4,"Jianuo Liu, Peng Xu, Cory Collins, Han Liu, J...",HIV-1 Tat Protein Increases Microglial Outward...,"PLOS ONE, May 2013 | Volume 8 | Issue 5 | e649...",10.1371/journal.pone.0064904,2013,5.0,,1.0,,,multiple duplicated microscopy images,1,9/19/14,,1.0,,1,http://doi.org/10.1371/journal.pone.0064904,PLOS,21,"[nan, nan, nan, nan, nan, nan, nan]","[[], [], [], [], [], [], []]",University of Nebraska Medical Center,19.0,,Department of Pharmacology and Experimental Ne...,399.0,USA,136.0,367.0,567.0,218.0,463.0,399.0,368.0,344.0,253.0,45.16,2015.0,301-350,United States of America,36.4,34.8,16.5,51.5,34.5,-,2857,2.6,8%,67:33:00,2015.0,,,,,,,,,,,,,,,,,,,,,0.616020,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,"Nathalie Delgehyr, Uta Wieland, He_lÃÂne Ran...","Drosophila Mgr, a Prefoldin subunit cooperatin...","PNAS | April 10, 2012 | vol. 109 | no. 15 | 57...",10.1073/pnas.1108537109,2012,,,1.0,,,,1,11/14/16,,1.0,,1,http://doi.org/10.1073/pnas.1108537109,PNAS,48,"[nan, 0.38095238095238093, nan, nan, nan, nan,...","[[], ['British Journal of Dermatology ', 'Jour...","Ecole Normale Superieure, Paris",20.0,,DÃ©partement de Biologie,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.807401,0
210,"Rajesh K. Grover, Julong Cheng, Yingjie Peng, ...",The costimulatory immunogen LPS induces the B-...,"PNAS _ April 17, 2012 _ vol. 109 _ no. 16, 603...",10.1073/pnas.1202214109,2012,,,1.0,,,,1,10/17/16,,1.0,,1,http://doi.org/10.1073/pnas.1202214109,PNAS,40,"[nan, nan, nan, 0.35365853658536583, nan, nan,...","[[], [], [], [], [], [], ['Publications', ""Rea...",Delhi State Cancer Institute,35.0,Dr,Clinical Oncology,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.795469,0
211,"Kevin Y. Lee, Yuji Yamamoto, Jeremie Boucher, ...",Shox2 is a molecular determinant of depot-spec...,"PNAS | July 9, 2013 | vol. 110 | no. 28 | 1140...",10.1073/pnas.1310331110,2013,,,1.0,,,,1,4/11/16,,1.0,,1,http://doi.org/10.1073/pnas.1310331110,PNAS,24,"[0.26153846153846155, 0.7142857142857143, nan,...","[[], ['Proceedings of the National Academy of ...",Harvard University,17.0,PhD,,1.0,USA,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,100.00,2015.0,2,United States of America,92.9,67.6,98.6,98.9,44,93.3,20152,8.9,25%,,2015.0,Cambridge,MA,Massachusetts,25017.0,Middlesex,42.3759,-71.1185,118927.0,7182.0,polygon,False,True,America/New_York,2.0,02142 02140 02141 02139 02138 02238 02477,middlesex,92221,86211,6010,6.5,0.775420,0
212,"Shuyu Ren, Bryce G. Johnson, Yujiro Kida, Coli...",LRP-6 is a coreceptor for multiple fibrogenic ...,"PNAS | January22,2013 | vol.110 | no.4 | 1441",10.1073/pnas.1211179110,2013,,,,1.0,,,1,11/14/16,,1.0,,1,http://doi.org/10.1073/pnas.1211179110,PNAS,44,"[0.5714285714285714, nan, 0.16666666666666666,...","[[], [], ['Biochemical and Biophysical Researc...",University of Washington Seattle,6.0,,,31.0,USA,23.0,48.0,103.0,41.0,9.0,10.0,10.0,5.0,40.0,60.61,2015.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.572420,0


In [37]:
# Add new rows to dataframe

for i in range(0,500):
    features_1 = [authors[i],title[i],citation[i],doi[i],year[i],month[i],zero[i],one[i],two[i],three[i],findings[i],1,correctiondate[i],retraction[i],correction[i],noaction[i],1,url[i],homesite[i],labsize[i],pubrate[i],otherjournals[i],affiliations[i],duration[i],degree[i],degreearea[i]]
    features_2 = bik_df.iloc[bik_df.index[bik_df["First Author Affiliation"] == affiliations[i]][0]][26:-2].tolist()
    features_3 = [m_probs[i],1]
    bik_df.loc[len(bik_df.index)] = features_1 + features_2 + features_3
bik_df    
    
    

Unnamed: 0,Authors,Title,Citation,DOI,Year,Month,0,1,2,3,FINDINGS,Reported,Correction Date,Retraction,Correction,No Action,SUM \nCompleted,URL,Home Site,Lab Size,Pub Rate,Other Journals,First Author Affiliation,First Author Career Duration,First Author Degree,First Author Degree Area,world_rank_x,country_x,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations_x,broad_impact,patents,score,year_x,world_rank_y,country_y,teaching,international,research,citations_y,income,total_score,num_students,student_staff_ratio,international_students,female_male_ratio,year_y,city_ascii,state_id,state_name,county_fips,county_name,lat,lng,population,density,source,military,incorporated,timezone,ranking,zips,county,labor_force,employed,unemployed,rate,(Grover) Falsified Media,Falsified Media (Manually labeled)
0,"Inka Regine Weingaertner, Sarah Koutnik, Herma...",Chronic Morphine Treatment Attenuates Cell Gro...,"PLOS One, January 2013 | Volume 8 | Issue 1 | ...",10.1371/journal.pone.0053510,2013,1.0,1.0,,1.0,,"duplicated blot, shifted microscopy",1,10/9/15,,1.0,,1,http://doi.org/10.1371/journal.pone.0053510,PLOS,9,"[0.5, nan, nan]","[['PLoS ONE ', ""Naunyn-Schmiedeberg's Archives...",Ludwig-Maximilians-University of Munich,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.632741,0
1,"Jessica M. Esparza, Eileen OÃÂToole, Linya L...",Katanin Localization Requires Triplet Microtub...,"PLOS ONE, January 2013 | Volume 8 | Issue 1 | ...",10.1371/journal.pone.0053940,2013,1.0,,1.0,,,duplicated colony strip,1,1/15/16,,1.0,,1,http://doi.org/10.1371/journal.pone.0053940,PLOS,21,"[0.7647058823529411, nan, nan, 0.23, nan, nan,...","[['PLoS ONE ', 'Molecular Biology of the Cell ...",Washington University in St. Louis,13.0,,Department of Genetics,52.0,USA,35.0,101.0,96.0,71.0,41.0,21.0,29.0,17.0,169.0,55.03,2015.0,,,,,,,,,,,,,,St. Louis,MO,Missouri,29510.0,St. Louis,38.6358,-90.2451,2024074.0,1879.0,polygon,False,True,America/Chicago,1.0,63110 63111 63112 63113 63115 63116 63118 6312...,st.louis,101110,93824,7286,7.2,0.398791,0
2,"Sreedevi Avasarala, Fangfang Zhang, Guangliang...",Curcumin Modulates the Inflammatory Response a...,"PLOS ONE, February 2013 | Volume 8 | Issue 2 |...",10.1371/journal.pone.0057285,2013,2.0,,,1.0,,reuse of one lane in different blot experiment,1,8/4/15,,1.0,,1,http://doi.org/10.1371/journal.pone.0057285,PLOS,18,"[nan, 0.09090909090909091, nan, nan, nan, nan]","[[], ['Current Developments in Nutrition ', 'T...",University of Illinois at Chicago,11.0,,Department of Dental Medicine,203.0,USA,88.0,367.0,541.0,218.0,126.0,196.0,287.0,179.0,129.0,47.39,2015.0,201-225,United States of America,43.0,53.4,34.1,55.1,40.7,-,24313,9.2,17%,53:47:00,2015.0,Chicago,IL,Illinois,17031.0,Cook,41.8373,-87.6862,8604203.0,4574.0,polygon,False,True,America/Chicago,1.0,60018 60649 60641 60640 60643 60642 60645 6064...,cook,8102,7718,384,4.7,0.735152,0
3,"Rounak Nassirpour, Pramod P. Mehta, Sangita M....",miR-221 Promotes Tumorigenesis in Human Triple...,"PLOS ONE, April 2013 | Volume 8 | Issue 4 | e6...",10.1371/journal.pone.0062170,2013,4.0,1.0,,,1.0,Duplicated / mirrored bands in blot,1,4/10/17,1.0,,,1,http://doi.org/10.1371/journal.pone.0062170,PLOS,12,"[nan, nan, nan, nan]","[[], [], [], []]",,10.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.704388,0
4,"Jianuo Liu, Peng Xu, Cory Collins, Han Liu, J...",HIV-1 Tat Protein Increases Microglial Outward...,"PLOS ONE, May 2013 | Volume 8 | Issue 5 | e649...",10.1371/journal.pone.0064904,2013,5.0,,1.0,,,multiple duplicated microscopy images,1,9/19/14,,1.0,,1,http://doi.org/10.1371/journal.pone.0064904,PLOS,21,"[nan, nan, nan, nan, nan, nan, nan]","[[], [], [], [], [], [], []]",University of Nebraska Medical Center,19.0,,Department of Pharmacology and Experimental Ne...,399.0,USA,136.0,367.0,567.0,218.0,463.0,399.0,368.0,344.0,253.0,45.16,2015.0,301-350,United States of America,36.4,34.8,16.5,51.5,34.5,-,2857,2.6,8%,67:33:00,2015.0,,,,,,,,,,,,,,,,,,,,,0.616020,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709,"Jessica Gregory, Latoya Bender, Jason Cochran,...",It’s been nearly 60 years since cosmetic proce...,"SciDirect. 2013. 70(15), 407.",10.1016/j.jaut.2013.01.082,2013,2.0,,,,,"duplicated, shifted blot panels",1,2.0/1/2017,,1.0,1.0,1,http://doi.org/10.1016/j.jaut.2013.01.082,SciDirect,4,0.75,"[[], [], [], [], [], [], []]",Korea Institute of Science and Technology Info...,24.0,[PhD],Discipline of Microbiology and Immunology,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,1
710,"Michael Martin, Felicia Potter",Dr. Paul Baglio on the subject\n\nA national s...,"ASM. 2006. 45(83), 200.",10.1128/IAI.01886-12,2006,1.0,,,,1.0,Duplicated microscopy image; might be error,1,5.0/1/2007,,1.0,,1,http://doi.org/10.1128/IAI.01886-12,ASM,2,1.166667,"[[], [], [], [], []]",University of Cambridge,8.0,[PhD],Department of Biomedical Engineering,4.0,United Kingdom,1.0,2.0,10.0,5.0,11.0,6.0,12.0,13.0,48.0,96.81,2015.0,5,United Kingdom,89.7,87.8,95.6,95.2,51.1,92,18812,11.8,34%,46:54:00,2015.0,,,,,,,,,,,,,,,,,,,,,0.000000,1
711,"Yvonne Guerrero, Kevin Johnson, Patricia Mille...",Modulation with CAR-TECT\n\nReferences:\n\nSca...,"Nature. 2012. 64(77), 133.",10.1038/nature12971,2012,4.0,,,,1.0,Duplicated blot strips,1,4.0/1/2014,1.0,1.0,1.0,1,http://doi.org/10.1038/nature12971,Nature,10,0.297297,"[[], [], [], [], []]",Leiden University,6.0,[PhD],Division of Comparative Physiology,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,1
712,"Danielle Simon, Michael Garrett, Deanna Bell",Dental disease patients who had unframed a 20-...,"Hindawi. 2013. 64(18), 138.",10.1155/2013/971073,2013,3.0,,,,1.0,Multiple cases of overlapping microscopy panels,1,4.0/1/2014,,1.0,,1,http://doi.org/10.1155/2013/971073,Hindawi,3,0.652174,"[[], [], [], [], []]",University of Delaware,,[PhD],Facultad de Ciencias BioquÃ­micas y FarmaceÃºt...,281.0,USA,109.0,318.0,337.0,135.0,272.0,304.0,234.0,292.0,120.0,46.21,2015.0,180,United States of America,29.0,40.6,36.3,73.1,99.3,47,19262,15.9,10%,56:44:00,2015.0,Newark,NJ,New Jersey,34013.0,Essex,40.7245,-74.1725,282011.0,4509.0,polygon,False,True,America/New_York,2.0,07103 07102 07105 07104 07107 07106 07108 0711...,essex,414660,374075,40585,9.8,0.000000,1


# Export TSV

In [39]:
bik_df.to_csv('Bik_v2_updated.tsv', sep="\t")