#### Pulling the data from GitHub

The dataset includes train, test and dev sets, which we pull from the [GitHub repository](https://github.com/shreyashub/BioFLAIR/tree/master/data/ner/bc5cdr).

In [None]:
## download data from GitHub

import urllib.request
from pathlib import Path

def download_file(url, output_file):
    """Fetch ``url`` and store it at ``output_file``.

    Any missing parent directories of ``output_file`` are created first;
    an already existing directory is not an error. Returns ``None``.
    """
    target_dir = Path(output_file).parent
    target_dir.mkdir(parents=True, exist_ok=True)
    urllib.request.urlretrieve(url, output_file)

# Fetch the train, test and dev splits, one (source URL, local path) pair each.
for url, output_file in (
    ('https://raw.githubusercontent.com/shreyashub/BioFLAIR/master/data/ner/bc5cdr/train.txt', '/content/data/train.txt'),
    ('https://raw.githubusercontent.com/shreyashub/BioFLAIR/master/data/ner/bc5cdr/test.txt', '/content/data/test.txt'),
    ('https://raw.githubusercontent.com/shreyashub/BioFLAIR/master/data/ner/bc5cdr/dev.txt', '/content/data/dev.txt'),
):
    download_file(url, output_file)

In [None]:
# ## get data from GitHub (disabled alternative; note these are `blob` page URLs, not raw file URLs)
# !wget https://github.com/shreyashub/BioFLAIR/blob/master/data/ner/BC5CDR-disease/train.tsv 
# !wget https://github.com/shreyashub/BioFLAIR/blob/master/data/ner/BC5CDR-disease/test.tsv
# !wget https://github.com/shreyashub/BioFLAIR/blob/master/data/ner/BC5CDR-disease/devel.tsv
# !wget https://github.com/shreyashub/BioFLAIR/blob/master/data/ner/BC5CDR-disease/train_dev.tsv



Since the data is in the CoNLL-style BIO format (you can read more about the tagging format in the Wikipedia article on inside–outside–beginning tagging), we need to load it into a pandas dataframe with the following function. The important columns in the dataframe are the word token (for Mandarin this would be a single character), the BIO label, and a sentence_id to differentiate samples/sentences (note: the function below does not currently add the sentence_id column).





In [None]:
## read tsv file 
import numpy as np
import pandas as pd


def read_conll(filename):
    """Load a tab-separated CoNLL-style file into a token-level dataframe.

    Parameters
    ----------
    filename : str, path object or file-like object
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per token with the columns ``words``, ``pos``, ``chunk`` and
        ``labels``. ``-DOCSTART-`` header rows and rows whose ``words`` field
        is empty (blank lines) are dropped. The index is not reset, so it
        still reflects the row positions in the file.
    """
    df = pd.read_csv(
        filename,
        sep='\t',
        header=None,
        names=['words', 'pos', 'chunk', 'labels'],
        keep_default_na=False,   # keep tokens such as "NA" or "null" as text, not NaN
        quoting=3,               # csv.QUOTE_NONE: quote characters are ordinary text
        skip_blank_lines=False,  # blank lines are kept as rows and filtered out below
    )
    # Match the marker WITHOUT a trailing space: in a tab-separated file the
    # first field of a header row is exactly '-DOCSTART-', which the previous
    # '-DOCSTART- ' prefix never matched. Space-separated header lines
    # ('-DOCSTART- -X- -X- O') still start with this prefix, so they are
    # removed as before.
    is_docstart = df['words'].astype(str).str.startswith('-DOCSTART-')
    is_blank = df['words'] == ''
    return df[~is_docstart & ~is_blank]

In [None]:
## read each split's text data into its own dataframe
train_df, test_df, dev_df = map(
    read_conll,
    ('/content/data/train.txt', '/content/data/test.txt', '/content/data/dev.txt'),
)
train_df.head(10)

In [None]:
## preview the first three rows of the test split
test_df.head(3)

In [None]:
## preview the first three rows of the dev split
dev_df.head(3)

In [None]:
## combine the train, test and dev splits into a single dataframe
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each split keeps its own row numbers, so index labels repeat across splits.
df = pd.concat([train_df, test_df, dev_df], ignore_index=True)
df.head(3)

In [None]:
## preview the first four rows of the combined dataframe
df.head(4)

In [None]:
## shape (rows, columns) of the full combined dataset
df.shape

In [None]:
## count how often each value occurs in the 'chunk' column
df['chunk'].value_counts()

In [None]:
## count how often each tag occurs in the 'labels' column
df['labels'].value_counts()

## All part-of-speech (POS) tags in the dataset

* NOUN    
* PUNCT    
* ADP     
* VERB   
* ADJ      
* DET      
* PROPN    
* NUM      
* CCONJ    
* ADV       
* SYM       
* PART      
* PRON      
* X          
* INTJ        

In [None]:
## count how often each tag occurs in the 'pos' column
df['pos'].value_counts()

In [None]:
## pull the word, pos and label columns out of the dataframe as plain lists
AllWords, Allpos, AllLabels = (
    list(df[column].values) for column in ('words', 'pos', 'labels')
)


In [None]:
## build one {"word", "pos"} record and one {"word", "label"} record per token
wordsPos = [
    {"word": token, "pos": tag}
    for token, tag in zip(AllWords, Allpos)
]
wordsLabels = [
    {"word": token, "label": tag}
    for token, tag in zip(AllWords, AllLabels)
]

In [None]:
## peek at the first three word/label records
wordsLabels[:3]

In [None]:
## turn the word/pos records into a dataframe and print its first three rows
pos = pd.DataFrame(wordsPos) 
print(pos.head(3)) 

In [None]:
## turn the word/label records into a dataframe and print its first three rows
labels = pd.DataFrame(wordsLabels) 
print(labels.head(3)) 

In [None]:
## write the word/label and word/pos tables to CSV files (index column omitted)
labels.to_csv("Entity_labels.csv" , index=False)
pos.to_csv("pos.csv" , index=False)

In [None]:
## list the column names of the combined dataframe
df.columns

In [None]:
## join every token into one string, separated by single spaces
raw_text =  ' '.join(AllWords)

In [None]:
## save the space-joined tokens to a text file
# Explicit UTF-8 so the output does not depend on the platform's default
# encoding, which may be unable to encode non-ASCII tokens.
with open('BC5CDR_DATA.txt', 'w', encoding='utf-8') as f:
    f.write(raw_text)