# Filter data based on keywords

## Introduction
The aim of this notebook is to read a set of keywords and a set of scraped data and filter out all non-Covid-19 related entries/rows.

## Import libraries and set up defaults

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#%xmode Verbose
# Set global default figure size
plt.rc('figure', figsize=(20, 12)) # Wide figures that fill the whole width of the notebook
# Remember the pandas row limit that was active before this cell first ran, so it can
# be restored later. Guarded so that re-running this cell does not overwrite the saved
# value with the limit of 8 that is set just below.
if 'PREVIOUS_MAX_ROWS' not in globals():
    PREVIOUS_MAX_ROWS = pd.options.display.max_rows
# Show a maximum of 8 rows when printing dataframes
pd.options.display.max_rows = 8
# Print floating point numbers with 4 digits of precision and without scientific
# notation for small values
np.set_printoptions(precision=4, suppress=True)

## Two sets of keywords

### Key words from [Zhang Lab](https://zhanglab.ccmb.med.umich.edu/COVID-19/) 

#### Read in the keywords

In [2]:
# Folder holding the raw keyword tables (plain string, trailing slash included)
path = "/data/01_raw/Covid-19_keywords/"
# Load the tab-separated protein-name table. The first line is the header and only
# the four naming columns are kept; 'Function', Source and a url are dropped.
key_words_df = pd.read_csv(
    path + "SARS-COV-2_Protein_Names.tsv",
    usecols=['ORF', 'Gene', 'Gene2', 'Full_Name'],
    header=0,
    sep='\t',
)
# Display the loaded table
key_words_df

Unnamed: 0,ORF,Gene,Gene2,Full_Name
0,ORF1AB,nsp1,,Host translation inhibitor nsp1
1,ORF1AB,nsp2,,Non-structural protein 2
2,ORF1AB,nps3,,Papain-like proteinase
3,ORF1AB,nps4,,Non-structural protein 4
...,...,...,...,...
20,ORF7A,,,Protein 7a
21,ORF8,,,Protein 8
22,N,,,Nucleoprotein
23,ORF10,,,3` UTR


#### Create a unique Python list of keywords

### Key words from [Zhang Lab](https://zhanglab.ccmb.med.umich.edu/COVID-19/) 

In [3]:
# Walk the keyword columns in a fixed order, skip missing values (np.nan) and keep
# only the first occurrence of every keyword. dict.fromkeys() preserves insertion
# order, so the list is unique across ALL columns (not just within each column)
# while keeping the column-by-column ordering.
zhang_lab_keywords = list(dict.fromkeys(
    keyword
    for column in ['ORF', 'Gene', 'Gene2', 'Full_Name']
    for keyword in key_words_df[column].dropna()
))
# `first_term` is rebound by the keyword cells further down; the Zhang Lab list
# therefore also stays available under its own name. A copy is used so that the
# two names never share one list object.
first_term = list(zhang_lab_keywords)
print(first_term)

['ORF1AB', 'S', 'ORF3A', 'E', 'M', 'ORF6', 'ORF7A', 'ORF8', 'N', 'ORF10', 'nsp1', 'nsp2', 'nps3', 'nps4', 'nsp5', 'nsp6', 'nsp7', 'nsp8', 'nsp9', 'nsp10', 'RDRP', 'Hel', 'Exon', 'NendoU', "2'-O-MT", 'Spike', '3CL-PRO', 'Spike trimeric complex (S1, S2, S`)', 'Host translation inhibitor nsp1', 'Non-structural protein 2', 'Papain-like proteinase', 'Non-structural protein 4', '3C-like proteinase', 'Non-structural protein 6', 'Non-structural protein 7', 'Non-structural protein 8', 'Non-structural protein 9', 'Non-structural protein 10', 'RNA-Directed RNA Polymerase', 'Helicase', 'Proofreading exoribonuclease (Guanine-N7 methyltransferase)', 'Uridylate-specific endoribonuclease', "2'-O-methyltransferase", 'Spike surface glycoprotein (monomer)', 'Protein 3a', 'Envelope small membrane proteins', 'Membrane protein', 'Protein 6', 'Protein 7a', 'Protein 8', 'Nucleoprotein', '3` UTR']


### Key words for filtering scraped data from [Mendeley Database](https://data.mendeley.com/)

In [4]:
# Mendeley keyword lists, kept under their own names because `first_term` and
# `second_term` are rebound again by the Figshare cell that follows.
mendeley_first_term = ["sars-cov-2", "sars-cov2", "covid19", "covid-19", "sars", "coronavirus", "ncov2019", "ncov-2019"]
mendeley_second_term = ["molecular dynamics", "trajectories", "dynamics simulation", "trajectory", "molecular docking"]
# Names read by the filtering cells (copies, so the lists above are never shared)
first_term = list(mendeley_first_term)
second_term = list(mendeley_second_term)

### Key words for filtering scraped data from [Figshare](https://figshare.com/)

In [5]:
# Search from "description"
figshare_description_terms = ["sars-cov-2"]
# AND from "keyword"
figshare_keyword_terms = ["molecular dynamics"]
# `first_term` / `second_term` are the names the filtering loops iterate over.
# They receive copies, so the Figshare lists stay intact under their own names
# even if the shared names are rebound or modified later.
first_term = list(figshare_description_terms)
second_term = list(figshare_keyword_terms)

## Dummy data for testing the filtering

In [6]:
# Hand-written four-row table used to exercise the filtering: two text columns
# holding keywords, a URL and free text.
data = {
    'col_1': [
        "Helicase",
        "sars-cov-2",
        "https://figshare.com/articles/nsp10/12162405",
        "sars-cov-2",
    ],
    'col_2': [
        'a',
        'This is a Protein 8 pdf abstract',
        'molecular dynamics',
        'molecular dynamics',
    ],
}
# Each dict key becomes a column, each list entry a row
df = pd.DataFrame(data)
df

Unnamed: 0,col_1,col_2
0,Helicase,a
1,sars-cov-2,This is a Protein 8 pdf abstract
2,https://figshare.com/articles/nsp10/12162405,molecular dynamics
3,sars-cov-2,molecular dynamics


## Initialise a boolean pd.Series
The purpose of the pd.Series objects named `found1` and `found2` is to hold a boolean index marking the rows with matches (i.e. rows where one of the keywords is found in the particular entry).

In [7]:
search_column = "col_1"
# One False per row of the searched column
falses = np.zeros(len(df[search_column]), dtype=bool) # https://stackoverflow.com/a/21174962
# The masks are built on df's own index: the `|` and `&` operations applied to them
# later align on index labels, so a default 0..n-1 index would misalign with any
# frame whose index is not a plain range. Each mask also gets its own copy of
# `falses`, so the two Series never share one underlying array.
found1 = pd.Series(data = falses.copy(),
                   index = df.index,
                   dtype = bool)
found2 = pd.Series(data = falses.copy(),
                   index = df.index,
                   dtype = bool)

## Find all indexes with a match

### Find matches in the first column

In [8]:
first_column_to_search_in = "col_1"
# Start from an all-False mask on df's own index. This makes the cell give the same
# result every time it is run (it no longer accumulates onto an earlier `found1`)
# and keeps the row-wise OR below aligned with df's rows.
found1 = pd.Series(False, index=df.index, dtype=bool)
for word in first_term:
    # Find out if the current search term can be found in the column.
    # regex=False: the search terms are literal text. Several of the protein names,
    # e.g. 'Spike trimeric complex (S1, S2, S`)', contain parentheses that would
    # otherwise be read as regular-expression syntax and never match literally.
    # na=False: a missing value counts as "no match" instead of putting NaN in the mask.
    # NOTE(review): matching is still case-sensitive - confirm that this is intended.
    cur_match = df[first_column_to_search_in].str.contains(word, regex=False, na=False) # https://stackoverflow.com/a/15333283
    # Join the found matches to one Series
    found1 = found1 | cur_match


In [9]:
# Show the mask: True for every row whose first search column contained at least
# one entry of `first_term`
found1

0    False
1     True
2    False
3     True
dtype: bool

### Find matches in the second column

In [10]:
second_column_to_search_in = "col_2"
# Start from an all-False mask on df's own index. This makes the cell give the same
# result every time it is run (it no longer accumulates onto an earlier `found2`)
# and keeps the row-wise OR below aligned with df's rows.
found2 = pd.Series(False, index=df.index, dtype=bool)
for word in second_term:
    # Find out if the current search term can be found in the column.
    # regex=False: the search terms are literal phrases, not regular expressions.
    # na=False: a missing value counts as "no match" instead of putting NaN in the mask.
    # NOTE(review): matching is still case-sensitive - confirm that this is intended.
    cur_match = df[second_column_to_search_in].str.contains(word, regex=False, na=False) # https://stackoverflow.com/a/15333283
    # Join the found matches to one Series
    found2 = found2 | cur_match

In [11]:
# Show the mask: True for every row whose second search column contained at least
# one entry of `second_term`
found2

0    False
1    False
2     True
3     True
dtype: bool

### Join the matches from both columns

In [12]:
# Element-wise AND of the two masks: a row stays True only when it matched in the
# first search column and also in the second search column
joined = found1 & found2
joined

0    False
1    False
2    False
3     True
dtype: bool

## Select only rows with matches in both columns

In [13]:
# Boolean-mask selection: keep only the rows of df that are flagged True in `joined`
df[joined]

Unnamed: 0,col_1,col_2
3,sars-cov-2,molecular dynamics
