# Key Word Extraction

## BioBERT

The list of surface cooccurrences generated is great. But let's look at the top 25 most frequent word pairs.

In [208]:
import csv

sorted_pairs = []
with open('data/genetics_surface_cooccurrences.csv', 'r', newline='') as csvfile:
    csv_reader = csv.reader(csvfile)
    # csv_reader read first row
    for row in csv_reader:
        word1, word2, frequency = row
        sorted_pairs.append([word1, word2, frequency])

In [209]:
print(sorted_pairs[:25])

[['Word1', 'Word2', 'Frequency'], ['turku', 'turku', '17.314512438893168'], ['lille', 'lille', '17.075681452588835'], ['aacute', 'aacute', '17.01017840402054'], ['louisville', 'louisville', '16.999207286199535'], ['gerais', 'minas', '16.993550442336325'], ['aires', 'buenos', '16.98815209769054'], ['buenos', 'aires', '16.977011540085343'], ['cape', 'town', '16.90829879000133'], ['town', 'cape', '16.90829879000133'], ['compostela', 'santiago', '16.88464752193668'], ['aviv', 'tel', '16.861132430899236'], ['tel', 'aviv', '16.851003425126496'], ['aacute', 'eacute', '16.848427334023928'], ['eacute', 'aacute', '16.83614900424549'], ['freiburg', 'freiburg', '16.834750714281245'], ['maastricht', 'maastricht', '16.833279747579173'], ['minas', 'gerais', '16.78324995598144'], ['nashville', 'vanderbilt', '16.762759919764168'], ['bern', 'bern', '16.741142450265244'], ['hiroshima', 'hiroshima', '16.720671786825555'], ['humboldt', 'berlin', '16.720671786825555'], ['preto', 'ribeirão', '16.707251271063

As you can see, most of these cooccurrences aren't related to biology. The next step will involve filtering out word pairs that aren't deemed to be related to biology. This will be achieved by using **BioBERT**, a pre-trained BERT model.

To do this, I will carry out the following steps.
- Load my surface cooccurrences data into a Pandas Dataframe.
- Load a BERT model.
- Use the BERT model to generate a list of key words. The BERT model will look at words and measure their similarity to the word 'biology'. If they are under a certain similarity threshold, they will be deleted from the dataframe.

In [210]:
# Loading surface cooccurrences into a pandas dataframe.
import pandas as pd

df = pd.read_csv("data/genetics_surface_cooccurrences.csv", encoding='UTF-8')
df.head()

Unnamed: 0,Word1,Word2,Frequency
0,turku,turku,17.314512
1,lille,lille,17.075681
2,aacute,aacute,17.010178
3,louisville,louisville,16.999207
4,gerais,minas,16.99355


In [211]:
from transformers import AutoModel, AutoTokenizer

model_name = "dmis-lab/biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [212]:
# Getting a list of unique words in our word pairs. 
unique_words = list(set(df["Word1"].unique().tolist() 
                        + df["Word2"].unique().tolist()))

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [347]:
# From this list of keywords an average embedding can be found to which each word can be compared.
keywords = ["genetics","evolution","ecology","organism","species","population","cells","DNA", "RNA","protein", 
         "enzymes", "metabolism", "reproduction", "adaptation", "biodiversity", "photosynthesis","respiration", 
         "homeostasis","osmosis","diffusion","mitosis","meiosis","chromosomes","mutation","gene","heredity",
         "natural", "selection","phylogeny","taxonomy","anatomy","physiology","biochemistry","molecular",
         "microbiology","virology","immunology","neuroscience","developmental","plant","zoology", "antibody",
         "chemistry", "virus", "bacteria", "organ", "mitochondria", "chloroplast"]

stop_words = set(stopwords.words('english'))
filter_words = stop_words.union(
    "a", "an", "the", "and", "or", "but", "so", "as", "if", "then",
    "in", "on", "at", "with", "without", "for", "by", "about", "of", "from", "to",
    "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
    "do", "does", "did", "doing", "will", "would", "can", "could", "shall", "should", "may", "might", "must",
    "this", "that", "these", "those", "there", "where", "when", "how", "why", "which", "who", "whom", "whose",
    "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them", "my", "your", "his", "its", "our", "their",
    "mine", "yours", "hers", "ours", "theirs",
    "all", "any", "some", "many", "several", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth",
    "more", "less", "most", "least", "own", "other", "another", "each", "every", "much", "such", "few", "both", "either", "neither",
    "thing", "place", "time", "person", "people", "man", "woman", "child", "year", "day", "month", "week", "hour", "minute", "second")

academic_words = [
    "analysis", "approach", "area", "assessment", "assume", "authority", "available", "benefit", "concept", "consistent", "constitutional", "context", "contract", "create",
    "data", "definition", "derived", "distribution", "economic", "environment", "established",
    "achieve", "acquisition", "administration", "affect", "appropriate", "aspects", "assistance", "categories", "chapter", "commission", "community", "complex", "computer", "conclusion", "conduct", "consequences", "construction", "consumer", "credit",
    "cultural",
    "estimate", "evidence", "export",
    "factors", "financial", "formula", "function", "identified", "income", "indicate", "individual", "interpretation", "involved", "issues",
    "labour", "legal", "legislation", "major", "method", "occur", "percent",
    "design", "distinction", "elements", "equation", "evaluation", "features", "final",
    "focus",
    "impact",
    "injury", "institute", "investment", "items",
    "journal", "maintenance", "normal", "obtained", "participation", "perceived", "positive",
    "period", "policy", "principle", "procedure", "process", "required", "research", "response", "role", "section", "sector", "significant", "similar", "source", "specific", "structure", "theory", "variables",
    "potential", "previous", "primary", "purchase", "range", "region", "regulations", "relevant", "resident", "resources", "restricted", "security", "sought", "select",
    "site", "strategies", "survey", "text", "traditional", "transfer"
]

keyword_inputs = [tokenizer(word, return_tensors="pt") for word in keywords]
filter_word_inputs = [tokenizer(word, return_tensors="pt") for word in filter_words]

# Tokenizing the set of unique words.
unique_word_inputs = tokenizer(unique_words, return_tensors="pt", padding=True, truncation=True)

In [348]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Get embeddings.
with torch.no_grad():
    keyword_outputs = [model(**input) for input in keyword_inputs]
    keyword_embeddings = [output.last_hidden_state.mean(dim=1).numpy() for output in keyword_outputs]
    
    filter_word_outputs = [model(**input) for input in filter_word_inputs]
    filter_word_embeddings = [output.last_hidden_state.mean(dim=1).numpy() for output in filter_word_outputs]
    
    unique_word_outputs = model(**unique_word_inputs)
    unique_word_embeddings = unique_word_outputs.last_hidden_state..mean(dim=1).numpy()

In [349]:
# Calculate cosine similarities between the keywords embedding and the list of unique words.
keyword_embeddings = np.vstack(keyword_embeddings)
mean_keyword_embedding = np.mean(keyword_embeddings, axis=0)
mean_keyword_embedding = mean_keyword_embedding.reshape(1, -1)

keyword_similarities = cosine_similarity(mean_keyword_embedding, unique_word_embeddings)[0]
keyword_similarity_dict = {word: similarity for word, similarity in zip(unique_words, similarities)}

In [355]:
def get_words_to_filter(filter_word_embeddings=filter_word_embeddings,
               threshold=0.875, inverse=False):
    words_to_filter = set()
    for i, filter_word_embedding in enumerate(filter_word_embeddings):
        filter_word_embedding = filter_word_embedding.reshape(1, -1)
        similarities = cosine_similarity(filter_word_embedding, unique_word_embeddings)[0]
        # Go through the dataframe and remove words over the similarity threshold.
        for i in range(len(similarities)):
            if similarities[i] >= threshold:
                words_to_filter.add(unique_words[i])
    return words_to_filter

In [361]:
filter_set = get_words_to_filter(threshold=0.862)
print(filter_set)

{'environmental', 'practical', 'grass', 'seven', 'guard', 'campus', 'survive', 'simultaneous', 'throughout', 'bath', 'consideration', 'sexual', 'tours', 'promotes', 'developed', 'treat', 'cook', 'behind', 'breeding', 'military', 'tertiary', 'addition', 'scholar', 'characterization', 'dominant', 'dei', 'persistent', 'juvenile', 'reduces', 'ecological', 'address', 'ornamental', 'southwestern', 'evaluate', 'demographic', 'authority', 'still', 'safe', 'lake', 'promote', 'della', 'less', 'newly', 'linear', 'nine', 'apply', 'multi', 'undertake', 'various', 'pharmaceutical', 'require', 'ethnic', 'obtain', 'muscular', 'sts', 'globe', 'relatively', 'trigger', 'pivotal', 'numerous', 'induce', 'con', 'fork', 'broadly', 'princess', 'harbour', 'get', 'alleviate', 'geographic', 'achieve', 'healthcare', 'represent', 'using', 'minimal', 'short', 'exists', 'couple', 'regulate', 'operate', 'medical', 'civil', 'impact', 'whose', 'promising', 'based', 'indigenous', 'create', 'interact', 'prince', 'accompa

In [362]:
def filter_dataframe(df, column_name, filter_set, inverse=False):
    if inverse == True:
        df = df[df[column_name].isin(filter_set)]
    else:
        df = df[~df[column_name].isin(filter_set)]
    return df

In [363]:
df = filter_dataframe(df, 'Word1', filter_set, inverse=False)
df = filter_dataframe(df, 'Word2', filter_set, inverse=False)

In [364]:
df.to_csv("data/genetics_filtered_surface_cooccurrences.csv", index=False)

## *(Extra experimental stuff below)*

In [262]:
word_similarity_df = filter_dataframe(word_similarity_df, 'Word', filter_set)

In [263]:
word_similarity_df.head()

Unnamed: 0,Word,Similarity
4071,biology,0.906111
1579,nature,0.905792
5038,fisheries,0.904539
1368,insects,0.903834
562,timing,0.903503


The similarities list corresponds to the index of a word in the ```unique_word_embeddings``` list, and therefore also the ```unique_words``` list. 

In [264]:
# word_similarity_df = pd.DataFrame({"Word": unique_words, 
#                                    "Similarity": similarities})

word_similarity_df = word_similarity_df.sort_values(by="Similarity", ascending=False)
word_similarity_df.to_csv("data/sorted_word_similarities.csv", index=False)

In [265]:
word_similarity_df.head()

Unnamed: 0,Word,Similarity
4071,biology,0.906111
1579,nature,0.905792
5038,fisheries,0.904539
1368,insects,0.903834
562,timing,0.903503


In [266]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

k = 15
kmeans = KMeans(n_clusters=k, random_state=101, n_init=10)
kmeans.fit(unique_word_embeddings)
word_cluster_labels = kmeans.labels_

In [267]:
word_clusters_df = pd.DataFrame({"Word": unique_words, "Cluster":word_cluster_labels, "Embedding": unique_word_embeddings.tolist()})
word_clusters_df = word_clusters_df.sort_values(by="Cluster", ascending=True)
word_clusters_df.to_csv("data/word_clusters.csv", index=False)

In [268]:
word_clusters_df = word_clusters_df[["Word", "Cluster"]]
word_clusters_df.head()

Unnamed: 0,Word,Cluster
1892,biologia,0
1229,auxin,0
5374,randomized,0
1242,assay,0
451,oregon,0


In [272]:
desired_cluster = 5
cluster_rows = word_clusters_df.query(f"Cluster == {desired_cluster}")
print(cluster_rows)

            Word  Cluster
2470         tag        5
4076   insurance        5
655    fertility        5
631         fate        5
4896       order        5
...          ...      ...
448         stem        5
4858  attractive        5
4319     network        5
446    machinery        5
4782       panel        5

[619 rows x 2 columns]


In [310]:
df.head()

Unnamed: 0,Word1,Word2,Frequency
2,aacute,aacute,17.010178
4,gerais,minas,16.99355
5,aires,buenos,16.988152
6,buenos,aires,16.977012
9,compostela,santiago,16.884648
