In [1]:
import os
import sys
import psycopg2

sys.path.insert(0, os.path.abspath(
    '../'))  # add the current module so that we can import the utils file
from tools.utils import get_top_keywords_for_query, create_df_for_query

[nltk_data] Downloading package stopwords to /home/lvs215/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/lvs215/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/lvs215/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
conn = psycopg2.connect(user="lvs215",
                        password="",
                        host="127.0.0.1",
                        port="12777",
                        database="aip")
start_year = 2011  # inclusive
end_year = 2020  # inclusive

In [3]:
# Get for each year, the top 100 keywords.
num_keywords = 10
# corpus_query = """
# SELECT * 
# FROM publications 
# WHERE year between ? and ?
# and (lower(title) like '%workflow%' or lower(abstract) like '%workflow%') 
# and (lower(title) like '%schedul%' or lower(abstract) like '%schedul%')
# """
corpus_query = """
SELECT * 
FROM publications 
WHERE year between %s and %s;
"""
corpus_df = create_df_for_query(conn, corpus_query, [start_year, end_year])

In [4]:
queries = [
    # Workflow communities - 1
    (
        """
        SELECT *
        FROM publications
        WHERE year = %s
        AND (title ILIKE %s OR abstract ILIKE %s)
        AND (title ILIKE %s OR abstract ILIKE %s)
        """,
        "workflow-community",
        "collaboration-network-workflow-scheduling_aip.gexf",
        ["%workflow%", "%workflow%", '%schedul%', '%schedul%']
    ),

    # Workflow formalisms - 2
    (
        """
        SELECT *
        FROM publications
        WHERE year = %s
        AND (lower(title) ILIKE %s OR lower(abstract) ILIKE %s)
        AND (
            (lower(title) ILIKE %s OR lower(abstract) ILIKE %s)
            OR (lower(title) ILIKE %s OR lower(abstract) ILIKE %s)
        )
        """,
        "workflow-formalism-community",
        "collaboration-network-workflow-formalisms_aip.gexf",
        ["%workflow%", "%workflow%", "%formalism%", "%formalism%", "%language%",
         "%language%"]
    ),

    # Workflow allocation - 3
    (
        """
        SELECT *
        FROM publications
        WHERE year = %s
        AND (lower(title) ILIKE %s OR lower(abstract) ILIKE %s)
        AND (
            (lower(title) ILIKE %s OR lower(abstract) ILIKE %s)
            OR (lower(title) ILIKE %s OR lower(abstract) ILIKE %s)
            OR (lower(title) ILIKE %s OR lower(abstract) ILIKE %s)
        )
        """,
        "workflow-allocation-community",
        "collaboration-network-workflow-allocation_aip.gexf",
        ["%workflow%", "%workflow%", "%allocat%", "%allocat%", "%schedul%",
         "%schedul%", "%plan%", "%plan%"]
    ),

    # Resource provisioning communities - 4
    (
        """
        SELECT *
        FROM publications
        WHERE year = %s
        AND (lower(title) ILIKE %s OR lower(abstract) ILIKE %s)
        AND (
            lower(title) ILIKE %s OR lower(abstract) ILIKE %s
            OR lower(title) ILIKE %s OR lower(abstract) ILIKE %s
        )
        """,
        "resource-provisioning-community",
        "collaboration-network-resource-provisioning_aip.gexf",
        ["%workflow%", "%workflow%", "%provision%", "%provision%", "%autoscal%",
         "%autoscal%"]
    ),

    # applications and services - 5
    (
        """
        SELECT *
        FROM publications
        WHERE year = %s
        AND (lower(title) ILIKE %s OR lower(abstract) ILIKE %s)
        AND (lower(title) ILIKE %s OR lower(abstract) ILIKE %s)
        """,
        "applications-and-services-community",
        "collaboration-network-applications-and-services-community_aip.gexf",
        ["%cloud%", "%cloud%", "%service%", "%service%"]
    ),
]

In [5]:
def get_emerging_top_keywords_for_keywords_list(keywords_per_year):
    # This cell outputs keywords that are amoung the top-{num_keywords} in the past (end_year - last_years, end_year] years
    # that are not found in the block [start_year, end_year - last_years]

    keywords_in_the_last_years = set()
    last_years = 5
    for year in range(end_year, end_year - last_years, -1):
        for keyword in keywords_per_year[year]:
            keywords_in_the_last_years.add(keyword)

    keywords_in_the_remaining_years = set()
    for year in range(end_year - last_years, start_year - 1, -1):
        for keyword in keywords_per_year[year]:
            keywords_in_the_remaining_years.add(keyword)

    emerging_keywords = keywords_in_the_last_years - keywords_in_the_remaining_years
    return ["\\enquote{{{}}}".format(i) for i in sorted(emerging_keywords)]

In [6]:
def get_rising_top_keywords_for_keywords_list(keywords_per_year):
    # This cell outputs keywords that have strictly risen in rank throughout the years in the top-{num_keywords}.

    keyword_last_rank = dict()
    seen_keywords = set()
    emerging_keywords = set()

    # Fill the dictionary with the initial year
    for index in range(len(keywords_per_year[start_year])):
        keyword = keywords_per_year[start_year][index]
        keyword_last_rank[keyword] = index
        seen_keywords.add(keyword)
        emerging_keywords.add(keyword)

    # for rank, word in keyword_last_rank.items():
    #     print(rank, word)
    # Now, we check each year's keywords. There can be four cases:
    # 1. The keyword was also in last year's set. In this case the index (its rank) has to be higher or equal.
    # 2. The keyword is not in keyword_last_rank _AND_ was never seen before, this means it can be an emerging field and should be added to the keyword_last_rank dictionary.
    # 3. The keyword is not in keyword_last_rank _AND_ was seen before. This means it dropped throughout the years and should be removed.
    # 4. A keyword was seen once and then never again, drop it if it was not encountered in the {end_year}.
    for year in range(start_year + 1, end_year + 1):
        this_years_rank = dict()
        new_emerging_topics = set()  # Keep a clean set to automatically weed out keywords matching case 4
        for index in range(len(keywords_per_year[year])):
            keyword = keywords_per_year[year][index]

            #         # For insight
            #         if keyword == "part":
            #             print(year, keyword, index)

            # Case 1
            if keyword in keyword_last_rank and keyword in emerging_keywords:
                if index <= keyword_last_rank[
                    keyword]:  # rank is lower, all is well
                    this_years_rank[keyword] = index
                    new_emerging_topics.add(keyword)
            elif keyword not in seen_keywords:  # Case 2
                this_years_rank[
                    keyword] = index  # We can reuse keyword_last_rank since keywords are unique per year
                new_emerging_topics.add(keyword)
            else:  # Case 3 - this else statement can be deleted, but for completion's sake we left it in
                if keyword in emerging_keywords:
                    emerging_keywords.remove(keyword)

            seen_keywords.add(
                keyword)  # Always add the keyword to the seen keywords set.
            keyword_last_rank = this_years_rank  # Set the last year's rank dict to this year, to be ready for the next potential loop iteration.
            emerging_keywords = new_emerging_topics  # Since we swap at the end, keywords emerged in the last year are automatically kept.

    ret = ["\\enquote{{{}}}".format(i) for i in emerging_keywords]

    for word in emerging_keywords:
        for year in range(start_year, end_year + 1):
            if word in keywords_per_year[year]:
                print(word, year)

    return ret

In [14]:
for query, community, _, query_args in queries:
    keywords_per_year = dict()
    print(community)
    for year in range(start_year, end_year + 1):
        args = [str(year), *query_args]
        keywords = get_top_keywords_for_query(conn, corpus_df, query,
                                              num_keywords,
                                              article_query_params=args)
        keywords_per_year[year] = keywords
    print(*get_emerging_top_keywords_for_keywords_list(keywords_per_year))
    print(*get_rising_top_keywords_for_keywords_list(keywords_per_year))

workflow-community
\enquote{based} \enquote{deadline} \enquote{makespan} \enquote{model} \enquote{multi} \enquote{objective}
model 2020
workflow 2011
workflow 2012
workflow 2013
workflow 2014
workflow 2015
workflow 2016
workflow 2017
workflow 2018
workflow 2019
workflow 2020
multi 2020
objective 2020
based 2020
\enquote{model} \enquote{workflow} \enquote{multi} \enquote{objective} \enquote{based}
workflow-formalism-community
\enquote{algorithm} \enquote{analysis} \enquote{cloud} \enquote{computational} \enquote{container} \enquote{declaration} \enquote{declarative} \enquote{engine} \enquote{framework} \enquote{front} \enquote{hypervolume} \enquote{look} \enquote{many} \enquote{objective} \enquote{optimization} \enquote{optimizer} \enquote{pareto} \enquote{problem} \enquote{red} \enquote{reusable} \enquote{scheduling} \enquote{science} \enquote{scripting} \enquote{specification} \enquote{sshfs} \enquote{structure} \enquote{support}
problem 2020
optimizer 2020
hypervolume 2020
pareto 202