## Installing the required dependencies

In [None]:
!pip install -U pip setuptools wheel

!pip install -U spacy

!python -m spacy download en_core_web_sm

!pip install git+https://github.com/boudinfl/pke.git
!pip install matplotlib

[0mCollecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.9/13.9 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting git+https://github.com/boudinfl/pke.git
  Cloning https://github.com/boudinfl/pke.git to /tmp/pip-req-build-a4807ab7
  Running command git clone --filter=blob:none --quiet https://github.com/boudinfl/pke.git /tmp/pip-req-build-a4807ab7
  Resolved https://github.com/boudinfl/pke.git to commit 69337af9f9e72a25af6d7991eaa9869f1322dd72
  Preparing metadata (setup.py) ... [?25l[?25hdone
[0m

# Importing the required libraries

In [None]:
import numpy as np
import pandas as pd
import pke
import spacy
spacy.load('en_core_web_sm')
import string
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Initializing the dataset**

In [None]:
df1 = pd.read_csv("kdd_text.csv")
df2 = pd.read_csv("kdd_keys.csv", sep = "delimiter")

# Merge the two tables

df1["Key"] = df2
df = df1

#Display the dataset
df.head()

  return func(*args, **kwargs)


Unnamed: 0,Text,Key
0,Variable latent semantic indexing No contact i...,"linear algebra,lsi,matrix approximation,""spars..."
1,Entity discovery and assignment for opinion mi...,"entity discovery,sentiment analysis"
2,Stable feature selection via dense feature gro...,"classification,feature selection,high-dimensio..."
3,Consensus group stable feature selection Stabi...,"ensemble,feature selection,high-dimensional da..."
4,COA: finding novel patents through text analys...,"document ranking,information retrieval,patent ..."


# Implementation of the algorithms:

# **Unsupervised**

## **Statistical Methods**

### **TF-IDF algorithm**

In [None]:
# 1. create a TfIdf extractor.
extractor = pke.unsupervised.TfIdf()
keywords_tfidf = []
for index, row in df.iterrows():
  
  # 2. load the content of the document.
  extractor.load_document(input = row["Text"],
                          language='en',
                          normalization=None)

  # 3. select {1-3}-grams not containing punctuation marks as candidates.
  extractor.candidate_selection()

  # 4. weight the candidates using a `tf` x `idf`
  df = pke.load_document_frequency_file(input_file = "df-semeval2010.tsv.gz")
  extractor.candidate_weighting(df=df)

  # 5. get the 3-highest scored candidates as keyphrases
  keyphrases = extractor.get_n_best(n=3)
  keywords_tfidf.append(keyphrases)


In [None]:
with open("key_kdd_tfidf.txt", "w") as external_file:
  for keyphrase in keywords_tfidf:   
    print(keyphrase, file=external_file)
  external_file.close()

### **KP-Miner algorithm**

In [None]:
# 1. create a KPMiner extractor.
extractor = pke.unsupervised.KPMiner()
keywords_kpminer = []
for index, row in df.iterrows():
# 2. load the content of the document.
  extractor.load_document(input=row["Text"],
                        language='en',
                        normalization=None)


# 3. select {1-5}-grams that do not contain punctuation marks or
#    stopwords as keyphrase candidates. Set the least allowable seen
#    frequency to 5 and the number of words after which candidates are
#    filtered out to 200.

  extractor.candidate_selection(lasf=1, cutoff=400)

# 4. weight the candidates using KPMiner weighting function.
  df = pke.load_document_frequency_file(input_file='df-semeval2010.tsv.gz')
  alpha = 2.3
  sigma = 3.0
  extractor.candidate_weighting(df=df, alpha=alpha, sigma=sigma)

# 5. get the 3-highest scored candidates as keyphrases
  keyphrases = extractor.get_n_best(n=3)
  keywords_kpminer.append(keyphrases)


In [None]:
with open("key_kdd_KPMiner.txt", "w") as external_file:
  for keyphrase in keywords_kpminer:   
    print(keyphrase, file=external_file)
  external_file.close()

## **Graph-Based Methods**

### **Text Rank algorithm**

In [None]:
# define the set of valid Part-of-Speeches
pos = {'NOUN', 'PROPN', 'ADJ'}

# 1. create a TextRank extractor.
extractor = pke.unsupervised.TextRank()

keywords_textrank = []
for index, row in df.iterrows():

  # 2. load the content of the document.
  extractor.load_document(input=row["Text"],
                          language='en',
                          normalization=None)

  # 3. build the graph representation of the document and rank the words.
  #    Keyphrase candidates are composed from the 33-percent
  #    highest-ranked words.
  extractor.candidate_weighting(window=2,
                                pos=pos,
                                top_percent=0.33)

  # 4. get the 3-highest scored candidates as keyphrases
  keyphrases = extractor.get_n_best(n=3)
  keywords_textrank.append(keyphrases)


In [None]:
with open("key_kdd_textrank.txt", "w") as external_file:
  for keyphrase in keywords_textrank:   
    print(keyphrase, file=external_file)
  external_file.close()

### **Single Rank algorithm**

In [None]:
# define the set of valid Part-of-Speeches
pos = {'NOUN', 'PROPN', 'ADJ'}

# 1. create a SingleRank extractor.
extractor = pke.unsupervised.SingleRank()

keywords_singlerank = []
for index, row in df.iterrows():
  # 2. load the content of the document.
  extractor.load_document(input = row["Text"],
                          language='en',
                          normalization=None)

  # 3. select the longest sequences of nouns and adjectives as candidates.
  extractor.candidate_selection(pos=pos)

  # 4. weight the candidates using the sum of their word's scores that are
  #    computed using random walk. In the graph, nodes are words of
  #    certain part-of-speech (nouns and adjectives) that are connected if
  #    they occur in a window of 10 words.
  extractor.candidate_weighting(window=10,
                                pos=pos)

  # 5. get the 3-highest scored candidates as keyphrases
  keyphrases = extractor.get_n_best(n=3)
  keywords_singlerank.append(keyphrases)


In [None]:
with open("key_kdd_singlerank.txt", "w") as external_file:
  for keyphrase in keywords_singlerank:   
    print(keyphrase, file=external_file)
  external_file.close()

### **Topic Rank algorithm**

In [None]:
# 1. create a TopicRank extractor.
extractor = pke.unsupervised.TopicRank()

keywords_topicrank = []

for index, row in df.iterrows():
  # 2. load the content of the document.
  extractor.load_document(input = row["Text"])

  # 3. select the longest sequences of nouns and adjectives, that do
  #    not contain punctuation marks or stopwords as candidates.
  pos = {'NOUN', 'PROPN', 'ADJ'}
  stoplist = list(string.punctuation)
  stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
  stoplist += stopwords.words('english')
  extractor.candidate_selection(pos=pos)

  # 4. build topics by grouping candidates with HAC (average linkage,
  #    threshold of 1/4 of shared stems). Weight the topics using random
  #    walk, and select the first occuring candidate from each topic.
  extractor.candidate_weighting(threshold=0.74, method='average')

  # 5. get the 3-highest scored candidates as keyphrases
  keyphrases = extractor.get_n_best(n=3)
  keywords_topicrank.append(keyphrases)


In [None]:
with open("key_kdd_topicrank.txt", "w") as external_file:
  for keyphrase in keywords_topicrank:   
    print(keyphrase, file=external_file)
  external_file.close()

### **Position Rank algorithm**

In [None]:
# define the valid Part-of-Speeches to occur in the graph
pos = {'NOUN', 'PROPN', 'ADJ'}

# define the grammar for selecting the keyphrase candidates
grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"
extractor = pke.unsupervised.PositionRank()
keywords_positionrank = []
for index, row in df.iterrows():
# 1. create a PositionRank extractor.

# 2. load the content of the document.
  extractor.load_document(input=row["Text"],
                        language='en',
                        normalization=None)

# 3. select the noun phrases up to 3 words as keyphrase candidates.
  extractor.candidate_selection(grammar=grammar,
                              maximum_word_number=3)

# 4. weight the candidates using the sum of their word's scores that are
#    computed using random walk biaised with the position of the words
#    in the document. In the graph, nodes are words (nouns and
#    adjectives only) that are connected if they occur in a window of
#    10 words.
  extractor.candidate_weighting(window=10,
                              pos=pos)

# 5. get the 3-highest scored candidates as keyphrases
  keyphrases = extractor.get_n_best(n=3)
  keywords_positionrank.append(keyphrases)


In [None]:
with open("key_kdd_positionrank.txt", "w") as external_file:
  for keyphrase in keywords_positionrank:   
    print(keyphrase, file=external_file)
  external_file.close()

# **Supervised**

## **Binary**

### **KEA algorithm**

In [None]:
# define a list of stopwords
stoplist = stopwords.words('english')

# 1. create a Kea extractor.
extractor = pke.supervised.Kea()
keywords_kea = []
for index, row in df.iterrows():
# 2. load the content of the document.
  extractor.load_document(input=row["Text"],
                        language='en',
                        normalization=None)

# 3. select 1-3 grams that do not start or end with a stopword as
#    candidates. Candidates that contain punctuation marks as words
#    are discarded.
  extractor.candidate_selection()

# 4. classify candidates as keyphrase or not keyphrase.
  df = pke.load_document_frequency_file(input_file='df-semeval2010.tsv.gz')
  model_file = 'Kea-semeval2010.py3.pickle'
  extractor.candidate_weighting(model_file=model_file,
                              df=df)

# 5. get the 3-highest scored candidates as keyphrases
  keyphrases = extractor.get_n_best(n=3)
  keywords_kea.append(keyphrases)


In [None]:
with open("key_kdd_kea.txt", "w") as external_file:
  for keyphrase in keywords_kea:   
    print(keyphrase, file=external_file)
  external_file.close()

# **YAKE! algorithm**

In [None]:
# 1. create a YAKE extractor.
extractor = pke.unsupervised.YAKE()

keywords_yake = []
for index, row in df.iterrows():
# 2. load the content of the document.

  extractor.load_document(input=row["Text"],language='en')


# 3. select {1-3}-grams not containing punctuation marks and not
#    beginning/ending with a stopword as candidates.
  stoplist = stopwords.words('english')
  extractor.candidate_selection(n=3)

# 4. weight the candidates using YAKE weighting scheme, a window (in
#    words) for computing left/right contexts can be specified.
  window = 2
  use_stems = False # use stems instead of words for weighting
  extractor.candidate_weighting(window=2)

# 5. get the 3-highest scored candidates as keyphrases.
#    redundant keyphrases are removed from the output using levenshtein
#    distance and a threshold.
  threshold = 0.8
  keyphrases = extractor.get_n_best(n=3, threshold=threshold)
  keywords_yake.append(keyphrases)



In [None]:
with open("key_kdd_yake.txt", "w") as external_file:
  for keyphrase in keywords_yake:   
    print(keyphrase, file=external_file)
  external_file.close()

# Data Preprocessing

In [None]:
import re

In [None]:
list=["kdd_KPMiner.txt","kdd_positionrank.txt","kdd_yake.txt","kdd_kea.txt","kdd_tfidf.txt","kdd_topicrank.txt","kdd_textrank.txt","kdd_singlerank.txt"]
for file in list:
  with open(f'{file}', 'r') as infile,open(f'key_{file}', 'w') as outfile:
      data = infile.read()
      data = re.sub(r"[0-9]", "", data)
      data = data.replace("(", "").replace("),", "").replace(".","").replace("'","").replace("  "," ").replace("[","").replace("]","").replace(", )","")
      outfile.write(data)

In [None]:
import pandas as pd
df1 = pd.read_csv("kdd_text.csv")
df2 = pd.read_csv("kdd_keys.csv", sep = "delimiter")
df3=pd.read_csv("key_kdd_tfidf.txt",sep = "delimiter", header=None)
df4=pd.read_csv("key_kdd_textrank.txt",sep = "delimiter", header=None)
df5=pd.read_csv("key_kdd_topicrank.txt",sep = "delimiter", header=None)
df6=pd.read_csv("key_kdd_singlerank.txt",sep = "delimiter", header=None)
df7=pd.read_csv("key_kdd_positionrank.txt",sep = "delimiter", header=None)
df8=pd.read_csv("key_kdd_kea.txt",sep = "delimiter", header=None)
df9=pd.read_csv("key_kdd_KPMiner.txt",sep = "delimiter", header=None)
df10=pd.read_csv("key_kdd_yake.txt",sep = "delimiter", header=None)
# Merge all the tables

df1["Key"] = df2
df1["key_tfidf"]=df3
df1["key_textrank"]=df4
df1["key_topicrank"]=df5
df1["key_singlerank"]=df6
df1["key_positionrank"]=df7
df1["key_kea"]=df8
df1["key_KPMiner"]=df9
df1["key_yake"]=df10
df = df1

#Display the dataset
df.head()

  return func(*args, **kwargs)


Unnamed: 0,Text,Key,key_tfidf,key_textrank,key_topicrank,key_singlerank,key_positionrank,key_kea,key_KPMiner,key_yake
0,Variable latent semantic indexing No contact i...,"linear algebra,lsi,matrix approximation,""spars...","variable, variable latent, variable latent sem...",latent,"contact information, variable latent semantic ...","variable latent semantic indexing, contact inf...",contact information,"variable, variable latent, variable latent sem...","variable, semantic, indexing, information, pro...","variable latent semantic, latent semantic inde..."
1,Entity discovery and assignment for opinion mi...,"entity discovery,sentiment analysis","opinion mining, mining, assignment","recent years, important topic, entity discover...","opinion mining applications opinion mining, as...","opinion mining applications opinion mining, op...","opinion mining services, entity discovery, app...","opinion mining, mining, assignment, applicatio...","mining, opinion mining, assignment, applicatio...","entity discovery, opinion mining applications,..."
2,Stable feature selection via dense feature gro...,"classification,feature selection,high-dimensio...","feature, feature selection, selection","feature groups, feature selection","classification accuracy, past, selection algor...","dense feature groups many, stable feature sele...","stable feature selection, dense feature groups...","feature, feature selection, selection, stable,...","feature, selection, feature selection, stable,...","improving classification accuracy, stable feat..."
3,Consensus group stable feature selection Stabi...,"ensemble,feature selection,high-dimensional da...","feature, feature selection, selection","group stable, sample, selection","feature selection, high-dimensional, under-add...",consensus group stable feature selection stabi...,"feature selection, consensus group, under-addr...","feature, feature selection, selection, consens...","feature, selection, feature selection, consens...","feature selection stability, consensus group s..."
4,COA: finding novel patents through text analys...,"document ranking,information retrieval,patent ...","coa, finding, finding novel",novel patents,"text analysis, novel patents, recent years, coa","text analysis, novel patents, recent years, coa","novel patents, text analysis, recent years, coa","coa, finding, finding novel, finding novel pat...","finding novel, finding novel patents, novel pa...","finding novel patents, coa, finding novel, rec..."


# Evaluation F1 Score 

In [None]:
from sklearn.metrics import f1_score

df = pd.read_csv("kdd_text.csv")
df2 = pd.read_csv("kdd_keys.csv", sep = "delimiter")



df["Key"] = df2
df["key_tfidf"]=[f1_score(k, df3.values.tolist(), average='macro')  for k in df2.values.tolist()]
df["key_textrank"]=[f1_score(k, df4.values.tolist(), average='macro')  for k in df2.values.tolist()]
df["key_topicrank"]=[f1_score(k, df5.values.tolist(), average='macro')  for k in df2.values.tolist()]
df["key_singlerank"]=[f1_score(k, df6.values.tolist(), average='macro')  for k in df2.values.tolist()]
df["key_positionrank"]=[f1_score(k, df7.values.tolist(), average='macro')  for k in df2.values.tolist()]
df["key_kea"]=[f1_score(k, df8.values.tolist(), average='macro')  for k in df2.values.tolist()]
df["key_KPMiner"]=[f1_score(k, df9.values.tolist(), average='macro')  for k in df2.values.tolist()]
df["key_yake"]=[f1_score(k, df10.values.tolist(), average='macro')  for k in df2.values.tolist()]
df_fin = df

#Display the dataset
df_fin.head()

  return func(*args, **kwargs)


Unnamed: 0,Text,Key,key_tfidf,key_textrank,key_topicrank,key_singlerank,key_positionrank,key_kea,key_KPMiner,key_yake
0,Variable latent semantic indexing No contact i...,"linear algebra,lsi,matrix approximation,""spars...",0.082172,0.353898,0.304137,0.281995,0.361046,0.307925,0.373468,0.235144
1,Entity discovery and assignment for opinion mi...,"entity discovery,sentiment analysis",0.442042,0.208452,0.149001,0.207749,0.437782,0.427431,0.16171,0.372706
2,Stable feature selection via dense feature gro...,"classification,feature selection,high-dimensio...",0.152473,0.329591,0.154166,0.444249,0.152825,0.346602,0.166194,0.266192
3,Consensus group stable feature selection Stabi...,"ensemble,feature selection,high-dimensional da...",0.122999,0.42957,0.015162,0.415546,0.140861,0.139717,0.02188,0.349189
4,COA: finding novel patents through text analys...,"document ranking,information retrieval,patent ...",0.295585,0.031678,0.213485,0.403532,0.371025,0.072886,0.356589,0.41184


In [2]:
d = {'key_tfidf': [df["key_tfidf"].mean()], 'key_textrank': [df["key_textrank"].mean()], 'key_topicrank': [df["key_topicrank"].mean()], 'key_singlerank': [df["key_singlerank"].mean()], 'key_positionrank': [df["key_positionrank"].mean()], 'key_kea': [df["key_kea"].mean()], 'key_KPMiner': [df["key_KPMiner"].mean()], 'key_yake': [df["key_yake"].mean()]}
df_score = pd.DataFrame(data=d)
df_score.head()

Unnamed: 0,key_tfidf,key_textrank,key_topicrank,key_singlerank,key_positionrank,key_kea,key_KPMiner,key_yake
0,0.160129,0.09219,0.08576,0.102147,0.132144,0.19872,0.048817,0.18357
