In [1]:
# Libraries:
import pandas as pd
import numpy as np
import string
import re
import collections

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from nltk.corpus import stopwords
from ftfy import fix_encoding
from collections import Counter
from collections import OrderedDict


In [2]:
# Import data and initial variables
# The text files use a literal '||' between fields. `sep` is interpreted as a
# regular expression by the python engine, so the pipes are escaped; the pattern
# is a raw string so that '\|' is not parsed as an (invalid) string escape.
train_var  = pd.read_csv('training_variants')
train_text = pd.read_csv('training_text', sep=r'\|\|', engine='python')
test_var   = pd.read_csv('test_variants')
test_text  = pd.read_csv('test_text', sep=r'\|\|', engine='python')

# Drop the rows whose text is NA and renumber the index 0..n-1 after dropping,
# so that the variants and text frames stay aligned row by row.
fixed_train_var  = train_var.drop([1109, 1277, 1407, 1639, 2755]).reset_index(drop=True)
fixed_train_text = train_text.drop([1109, 1277, 1407, 1639, 2755]).reset_index(drop=True)

fixed_test_var  = test_var.drop(1623).reset_index(drop=True)
fixed_test_text = test_text.drop(1623).reset_index(drop=True)

In [3]:
# Cleaning the data
def clean(text):
    """Split a raw document into a list of cleaned, lower-cased sentences.

    Document level: the text is lower-cased, 'fig. ' / 'figure ' are rewritten
    to 'fig.', the text is tokenised with a regular expression, and commas,
    English stopwords and any token containing 'supplementary' or 'shown' are
    removed.

    Sentence level: the token stream is split on ' . ' and each sentence with
    at least 5 words is stripped of a leading year, a trailing 'X et al'
    reference, figure/table mentions and 'download ...' figure-note prefixes.
    A sentence is kept only if the result is longer than 30 characters, has no
    word of 20 or more characters and is not dominated by very short words.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The sentences that passed every filter, in document order.
    """
    # Document-level processing:
    stop = set(stopwords.words('english'))

    sample = text.lower()                           # Lower case
    sample = sample.replace('fig. ', 'fig.')        # Glue figure references to their number so they
    sample = sample.replace('figure ', 'fig.')      # can be dropped as a single token at sentence level

    # Tokenise: words (optionally joined by one non-space character) and the
    # punctuation marks ! ? , - . :  (',-.' is a character range covering , - .)
    tokens = re.findall(r"\w+\S\w+|\w+|[!?,-.:]", sample)
    tokens = [w for w in tokens
              if w != ','                           # Remove ','
              and w not in stop                     # Remove stopwords
              and 'supplementary' not in w          # Remove 'supplementary'
              and 'shown' not in w]                 # Remove 'shown'
    sample = " ".join(tokens)                       # Reassemble the document; '.' tokens become ' . '

    # Sentence-level processing:
    # These do not depend on the sentence, so they are built once per call.
    years     = {str(y) for y in range(1950, 2022)}         # '1950' .. '2021'
    fig_tab   = ('fig', 'figs', 'figures', 'table', 'tables')
    threshold = (8, 10, 6)                                  # Max count of 1-, 2-, 3-letter words; can be tweaked
    new_corpus = []                                         # Sentences satisfying all conditions

    for sentence in sample.split(' . '):
        # Remove the contents of (non-nested) parentheses, then split into words
        words = re.sub(r"\([^()]*\)", "", sentence).split()

        # Filter out short sentences of fewer than 5 words
        if len(words) < 5:
            continue

        # Clean a leading year
        if words[0] in years:
            del words[0]

        # Clean references (X-et-al)
        if words[-1] == 'al':
            del words[-3:]

        # Clean fig. / tables
        words = [w for w in words if 'fig.' not in w]       # Initial filter
        for idx in range(-4, 0):                            # Back filter over the last four words
            if len(words) >= -idx + 1 and words[idx] in fig_tab:
                del words[idx:]

        # Remove repeated 'download figureopen new tabdownload powerpoint' from the figures' notes
        while words and words[0] == 'download':
            del words[0:5]

        # Threshold for the amino-acid case (a collection of short words)
        lengths = [len(w) for w in words]
        amino = any(lengths.count(size) > limit
                    for size, limit in zip((1, 2, 3), threshold))

        # Omit results that are too short or contain lengthy words / nucleotide sequences.
        # The length check comes first so max() is never called on an empty list.
        sentence_trim = " ".join(words)
        if len(sentence_trim) > 30 and len(max(words, key=len)) < 20 and not amino:
            new_corpus.append(sentence_trim)

    return new_corpus

In [6]:
"""""
# Run this code for getting the new data schema

row        = fixed_train_text.shape[0]
clean_text = []

for i in range(0,row):
    processed_input = clean(fixed_train_text.iloc[i][0])
    output_str      = " . ".join(processed_input)
    clean_text.append(output_str)

result = {'clean_text': clean_text}
df     = pd.DataFrame(result) 
df

"""""

'""\n# Run this code for getting the new data schema\n\nrow = fixed_train_text.shape[0]\nclean_text = []\nfor i in range(0,row):\n    input_str = " . ".join(clean(fixed_trin_text.iloc[i][0]))\n    clean_text.append(input_str)\n\nresult = {\'clean_text\': corpus}\ndf = pd.DataFrame(result) \ndf\n'

In [59]:
# Simple expansion for variation text and fix encoding
def expansion(df):
    """Add a 'var_text' column and repair the encoding of both text columns.

    For every row, the text in column position 3 is split on ' . ' and the
    sentences that contain the lower-cased value of column position 1 are
    joined back with ' . ' to form 'var_text'. `fix_encoding` is then applied
    to every row of 'clean_text' and 'var_text'.

    Parameters
    ----------
    df : pandas.DataFrame
        Column position 1 holds the variation name and position 3 the cleaned
        text (presumably Gene, Variation, Class, clean_text - confirm against
        the caller).

    Returns
    -------
    pandas.DataFrame
        The same object as `df`; it is modified in place, not copied.
    """
    clone = df   # NOTE: no copy is made, the caller's frame is updated in place

    # Read the columns by position once instead of chained row-wise indexing
    # (`iloc[i][3]`), which relies on deprecated positional Series lookup.
    targets = clone.iloc[:, 1].tolist()
    texts   = clone.iloc[:, 3].tolist()

    # Expansion of variation column
    var_text = []
    for text, target in zip(texts, targets):
        needle = target.lower()
        matches = [sentence for sentence in text.split(' . ') if needle in sentence]
        var_text.append(" . ".join(matches))

    # Fix encoding on every row. Whole columns are assigned so the result does
    # not depend on the frame having a 0..n-1 index.
    clone['var_text']   = [fix_encoding(value) for value in var_text]
    clone['clean_text'] = [fix_encoding(value) for value in texts]

    return clone

In [24]:
"""""
# Run this code for getting the new data schema

df            = df
combine_train = pd.concat([fixed_train_var.drop(['ID'],axis=1),df],axis=1)
new_schema    = expansion(combine_train)
new_schema.to_csv('New_train_data.csv') 
"""""

'""\n# Run this code for getting the new data schema\n\ncombine_train    = pd.concat([fixed_train_var.drop([\'ID\'],axis=1),df],axis=1)\nexpansion(combine_train)\n\n'

In [58]:
# Returns a table report all duplicated cases
def report_dup(data):
    """Group documents that share the same first cleaned sentence.

    The frame is sorted by its 'ID,Text' column so that identical documents
    become adjacent; consecutive documents whose first sentence returned by
    `clean` is equal are collected into one group.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first column is named 'ID,Text' and holds the document
        text (presumably the frame read from the text file with the '||'
        separator - confirm).

    Returns
    -------
    pandas.DataFrame
        One row per group with columns 'Text' (text of the group's first
        document), 'List_of_index' (index labels of all documents in the
        group) and 'Duplicates' (group size), sorted by 'Duplicates' in
        descending order.
    """
    sorted_data = data.sort_values(by = ['ID,Text'])
    texts  = sorted_data.iloc[:, 0].tolist()
    labels = sorted_data.index.tolist()

    final_text = []
    duplicate  = []
    group_key  = None   # first cleaned sentence of the group currently being built

    # Single pass: clean() runs once per document, and every row - including
    # the last one - ends up in exactly one group.
    for text, label in zip(texts, labels):
        sentences = clean(text)
        # A document with no surviving sentence has no key and is never merged
        key = sentences[0] if sentences else None

        if duplicate and key is not None and key == group_key:
            duplicate[-1].append(label)
        else:
            final_text.append(text)
            duplicate.append([label])
            group_key = key

    final_table = pd.DataFrame(list(zip(final_text, duplicate)), columns = ['Text', 'List_of_index'])
    count_table = final_table.assign(Duplicates = final_table.List_of_index.apply(len))    # Count number of duplicates
    extreme = count_table.sort_values(by = ['Duplicates'], ascending = False)              # Sort the resulting table to observe extreme cases
    return extreme

In [57]:
# We want to avoid non-unique variation names when checking the distribution:
# the labels below are excluded from the lookup table built in this cell.
stop_var = ['truncating mutations','deletion','promoter mutations','amplification','promoter hypermethylation','overexpression','epigenetic silencing','dna binding domain deletions','dna binding domain insertions','dna binding domain missense mutations','copy number loss','hypermethylation','wildtype','fusions']

# Extract info from variation data by column position
# (assumes the columns are ID, Gene, Variation, Class - TODO confirm)
var_gene  = fixed_train_var.iloc[:,1]
var_df    = fixed_train_var.iloc[:,2]
var_class = fixed_train_var.iloc[:,3]


# One (variation, class, gene) tuple per row of the variants table
combine = list(zip(var_df,var_class,var_gene))
combine_2 = map(lambda x: (x[0].lower(),x[1],x[2]),combine)   # Lowercase the variation names (lazy iterator)
combine_3 = filter(lambda x: x[0] not in stop_var,combine_2)  # Filter out variations that collide with stop_var (lazy iterator)
# Despite its name, var_dict is a list: the unique (variation, class, gene)
# tuples in first-seen order. Building it consumes combine_2 and combine_3.
var_dict = list(OrderedDict.fromkeys(combine_3))

# Dictionary for class -> color when plotting
class_color = dict(zip([1,2,3,4,5,6,7,8,9], ['r','g','b','y','m','c','k','grey','cyan']))

In [55]:
def dis_report(data,idx):
    """Print and plot which known variations each sentence of one document mentions.

    The row at position `idx` of `data` is cleaned into sentences with
    `clean`; every sentence is searched for the variation names in the
    module-level `var_dict`. When the same name appears for several genes,
    the entries whose gene differs from the most common gene among the
    document's rows are discarded. The classes found per sentence are drawn
    as a scatter plot and a per-class summary is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Column position 0 holds the document text and position 1 a list of
        row labels into `fixed_train_var` (the layout returned by
        `report_dup`).
    idx : int
        Positional row of `data` to report on.

    Returns
    -------
    None
        Output is printed and plotted.
    """
    bold, plain = '\033[1m', '\033[0m'

    print(bold + 'Case ' + str(idx) + '\n' + plain)

    # Define initial variables.
    # Cells are read with a single positional lookup; chained `iloc[idx][0]`
    # depends on deprecated positional indexing of a label-indexed Series.
    text       = data.iloc[idx, 0]
    group_rows = data.iloc[idx, 1]
    document   = clean(text)
    gene_col   = fixed_train_var.iloc[:,1]
    major_gene = Counter(gene_col[row] for row in group_rows).most_common(1)[0][0]

    all_res = []   # classes found in each sentence, one list per sentence
    report  = []   # (variation, class, gene) tuples found anywhere in the document

    # Brief description on the target document:
    print(bold + 'Brief description on the document:' + plain)
    print(data.iloc[idx])
    print('')
    print('')

    print(bold + 'Report on detection of multiple entities in a sentence and deletion of same variant with different class:' + plain)
    # Looping through each sentence
    for i, sentence in enumerate(document):

        # Finding variations appearing in a sentence (substring match)
        candidate = [j for j, entry in enumerate(var_dict) if entry[0] in sentence]

        name   = [var_dict[j][0] for j in candidate]
        classV = [var_dict[j][1] for j in candidate]
        typeG  = [var_dict[j][2] for j in candidate]

        # Report on multiple names appearing in a sentence
        if len(candidate) > 1:
            classV_str = [str(c) for c in classV]
            print('Multiple names are mentioned at sentence ' + str(i) + ' like ' + ', '.join(name) + ' from class ' + ', '.join(classV_str) + ' respectively.\n')

        # The same name listed more than once means the variation exists for
        # several genes: keep only the entries belonging to the majority gene.
        overlap = {item for item, count in Counter(name).items() if count > 1}
        if overlap:
            delete = set()
            for k, variant in enumerate(name):
                if variant in overlap and typeG[k] != major_gene:
                    delete.add(candidate[k])
                    print('Delete ' + variant +' from ' + typeG[k] + ' from class ' + str(classV[k]) + ' at sentence ' + str(i))
                    print('')
            candidate = [j for j in candidate if j not in delete]

        # Finalize the report of variations for this sentence
        exist = []
        for j in candidate:
            report.append((var_dict[j][0],var_dict[j][1],var_dict[j][2]))
            exist.append(var_dict[j][1])
        all_res.append(exist)


    # Plotting the distribution using different colors for each class
    for sentence_no, found_classes in enumerate(all_res):
        for found in found_classes:
            plt.scatter(sentence_no, found, c=class_color.get(found))
    print('')
    print('')
    print(bold + 'Plot for extreme Case ' + str(idx) + plain)
    plt.xlabel("x_th sentence in the document")
    plt.ylabel("Classes")
    plt.show()

    # Report on the mentioned variations in the text
    print(bold + 'Report on variants in the document ' + str(idx) + plain)
    report  = list(OrderedDict.fromkeys(report))
    classes = list(OrderedDict.fromkeys(x[1] for x in report))
    # A dedicated loop name keeps the `idx` parameter intact
    for cls in classes:
        labelled = [x[0] +' ('+ x[2] +')' for x in report if x[1] == cls]
        print('Class ' + str(cls) + ' : ' +  ', '.join(labelled))

In [56]:
"""""

# Run this code for reporting the distribution of variations for case ?th

extreme = report_dup(fixed_train_text)
index = ? #Index for the most ?th extreme case (max 1323 / 1322 for input)
dis_report(extreme,index)

"""""

'""\n\n# Run this code for reporting the distribution of variations for case ?th\n\nextreme = report_dup(fixed_train_text)\nindex = ? #Index for the most ?th extreme case (max 1323 / 1322 for input)\ndis_report(extreme,index)\n\n'