**Keyword extraction by Yake - From Abstract**

In [None]:
# Install the third-party packages this section needs (IPython shell escapes).
!pip install yake
!pip install langdetect
!pip install fuzzywuzzy
import yake
import nltk
# NLTK resources used by extract_keywords below: the stop-word lists, the
# tokenizer model behind word_tokenize, and the tagger model behind nltk.pos_tag.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
from langdetect import detect   # language detection for the YAKE `lan` argument
from fuzzywuzzy import process  # fuzzy de-duplication of extracted keywords

In [None]:
def extract_keywords(text):
    """Extract a comma-separated keyword string from ``text`` with YAKE.

    Pipeline: drop English/French/Arabic stop-words, keep only tokens tagged
    as nouns, adjectives or adverbs, detect the language of what is left,
    ask YAKE for up to 100 uni/bi-grams, fuzzy-deduplicate them and join the
    survivors with ", ".

    Args:
        text: Raw text to analyse (e.g. an abstract). ``None``, an empty
            string or whitespace-only text yields ``""``.

    Returns:
        str: Keywords joined by ", ", or ``""`` when there is nothing to
        extract (empty input, or every token removed by the filters).
    """
    # Nothing to analyse. Without this guard langdetect raises on empty text,
    # e.g. when the abstract could not be read and "" was passed in.
    if not text or not text.strip():
        return ""

    # remove stop-words
    stopwords = set(nltk.corpus.stopwords.words('english')) | set(nltk.corpus.stopwords.words('french')) | set(nltk.corpus.stopwords.words('arabic'))
    text = ' '.join([word for word in word_tokenize(text) if word.lower() not in stopwords])

    # keep only nouns, adverbs and adjectives
    tokens = word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    tags = ['NN', 'NNP', 'NNS', 'NNPS', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    text = ' '.join([t[0] for t in tagged if t[1] in tags])

    # Every token was filtered out: there is nothing left to detect or rank.
    if not text:
        return ""

    # yake extraction
    # Imported here (not at the top of the function) so the empty-input fast
    # path above does not depend on it.
    from langdetect.lang_detect_exception import LangDetectException
    try:
        lang = detect(text)
    except LangDetectException:
        # Text without usable features (e.g. only symbols): fall back to
        # English, which is also YAKE's own default language.
        lang = 'en'
    kw_extractor = yake.KeywordExtractor(lan=lang, n=2, top=100) # limit to best 100 uni/bi grams
    results = kw_extractor.extract_keywords(text)
    keywords = [word for word, score in results]

    # remove duplicates
    keywords = process.dedupe(keywords, threshold=25) # similarity percentage 25%

    # format to string
    keywords = ', '.join(keywords)

    return keywords

**Abstract / Keyword extraction by Cermine - Articles**

In [None]:
# install a headless Java 8 JDK (quiet mode, output discarded) and print the version found on PATH
!apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
!java -version
# download the CERMINE 1.13 jar (bundled with its dependencies) into the working directory
!wget https://maven.ceon.pl/artifactory/kdd-releases/pl/edu/icm/cermine/cermine-impl/1.13/cermine-impl-1.13-jar-with-dependencies.jar
import os
# point JAVA_HOME at the OpenJDK 8 location — presumably where the apt-get step above installs it (Debian/Ubuntu layout; confirm)
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
from bs4 import BeautifulSoup
# NOTE(review): reduce is not used in the cells visible here — confirm it is needed elsewhere
from functools import reduce

In [None]:
def _first_tag_text(f, tag):
    """Return the text of the first ``<tag>`` element in file ``f``, or ``""``.

    Best-effort: a missing or unreadable file, a parse failure, or the
    absence of the tag all yield ``""`` instead of raising.
    """
    try:
        with open(f, 'r', encoding='utf-8') as file:
            content = file.read()
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # one is installed; consider pinning it for reproducible output.
        soup = BeautifulSoup(content)
        node = soup.find(tag)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and
        # SystemExit must still be able to stop a long batch run.
        return ""
    return node.text if node is not None else ""


# get abstract from cermine xml output
def get_file_abstract(f):
    """Return the text of the first ``<abstract>`` element of ``f``.

    Args:
        f: Path to a CERMINE output file, read as UTF-8.

    Returns:
        str: The abstract text, or ``""`` if the file cannot be read or
        parsed, or contains no ``<abstract>`` element.
    """
    return _first_tag_text(f, "abstract")


# get keywords from cermine xml output
def get_file_keywords(f):
    """Return the text of the first ``<keywords>`` element of ``f``.

    Args:
        f: Path to a CERMINE output file, read as UTF-8.

    Returns:
        str: The keywords text, or ``""`` if the file cannot be read or
        parsed, or contains no ``<keywords>`` element.
    """
    return _first_tag_text(f, "keywords")

In [None]:
# Run CERMINE's ContentExtractor (from the jar in the working directory) on the folder passed via -path.
# NOTE(review): 'directory_path' is passed literally and looks like a placeholder — replace it with the real
# folder; the .cermxml files read by extract_cermine are presumably produced by this step (confirm).
!java -cp cermine-impl-1.13-jar-with-dependencies.jar pl.edu.icm.cermine.ContentExtractor -path 'directory_path'

In [None]:
def extract_cermine(directory_path, file_name_without_extension):
    """Return ``[abstract, keywords]`` read from a document's ``.cermxml`` file.

    Args:
        directory_path: Folder holding the ``.cermxml`` files; a trailing
            path separator is optional.
        file_name_without_extension: Base name of the document, without
            the ``.cermxml`` suffix.

    Returns:
        list: ``[abstract, keywords]`` as returned by ``get_file_abstract``
        and ``get_file_keywords`` for that file.
    """
    # os.path.join inserts the separator only when it is missing, so both
    # "papers" and "papers/" resolve to the same file.
    cermxml_path = os.path.join(directory_path, file_name_without_extension + '.cermxml')
    return [get_file_abstract(cermxml_path), get_file_keywords(cermxml_path)]