In [None]:
import numpy as np
import pandas as pd
import collections
import colorcet as cc

import matplotlib.pyplot as plt
import seaborn as sns

import litstudy

# Plot defaults: figure size through matplotlib, "paper" context through seaborn
plt.rcParams.update({'figure.figsize': (10, 6)})
sns.set_theme(context='paper')

# Show up to 100 characters per DataFrame cell before pandas truncates the text
pd.set_option('display.max_colwidth', 100)

# load data

In [None]:
# Input CSV that is loaded into the `papers` DataFrame used by all exploratory
# cells (the trailing comment keeps an alternative file name).
path = "papers_sorted_VF.csv" #"papers.csv"
papers = pd.read_csv(path)

In [None]:
# Some info about the dataset: number of rows and the available columns
print(f"#Relevant papers={len(papers)}")
print(f"paper attributes:{list(papers.columns)}")

# Make sure the data types are correct
papers['title'] = pd.Series(papers['title'], dtype="string")
papers['year'] = pd.Series(papers['year'], dtype="int")

# Exploratory analysis
## publication year

### check & fill Nans

In [None]:
# Number of papers per publication year. value_counts() orders its result by
# frequency, so the index is sorted to show the years chronologically.
papers['year'].value_counts().sort_index().plot(kind='barh',
                                                figsize=(5, 3),
                                                title="Number publication per year")

## Document Type (Entry type)

In [None]:
# Nullable string dtype, so missing publication types are proper NA values
papers['pubType'] = pd.Series(papers['pubType'], dtype="string")

if papers['pubType'].isnull().all():
    print('No values to plot for Entry_type')
else:
    # Assign the filled column back. Calling fillna(inplace=True) on the
    # attribute accessor works on an intermediate object and is not guaranteed
    # to update `papers` (pandas warns about this chained in-place pattern and
    # ignores it under Copy-on-Write).
    papers['pubType'] = papers['pubType'].fillna('NA')
    papers['pubType'].value_counts().plot(kind='barh',
                                          figsize=(5, 3),
                                          title="pubType")

In [None]:
papers.pubType.value_counts()

## Authors

In [None]:
if papers['author'].isnull().all():
    print('No values to plot for Authors')
else:
    # First author of every paper.
    # The split uses the ' and ' separator (with surrounding spaces): splitting
    # on the bare substring 'and' also cuts inside names such as "Alexander" or
    # "Fernandez". Rows without an author are labelled 'NA', like the other
    # columns of this notebook, and surrounding whitespace is removed.
    # NOTE(review): assumes authors are separated by " and " - confirm against the CSV.
    papers['first_author'] = (papers['author']
                              .fillna('NA')
                              .astype(str)
                              .str.split(' and ')
                              .str[0]
                              .str.strip())

    papers['first_author'].value_counts().plot(kind='bar',
                                               figsize=(25, 5),
                                               title="First Authors")

## Address (country and cities)

In [None]:
# Nullable string dtype, so missing addresses are proper NA values
papers['address'] = pd.Series(papers['address'], dtype="string")

if papers['address'].isnull().all():
    print('No values to plot for Address')
else:
    # Assign the result back: a chained fillna(inplace=True) on the attribute
    # accessor is not guaranteed to modify `papers`.
    papers['address'] = papers['address'].fillna('NA')

    papers['address'].value_counts().plot(kind='barh',
                                          figsize=(5, 10),
                                          title="Address")

## Institutions

In [None]:
# Nullable string dtype, so missing institutions are proper NA values
papers['institution'] = pd.Series(papers['institution'], dtype="string")

if papers['institution'].isnull().all():
    print('No values to plot for Institution')
else:
    # Assign the result back: a chained fillna(inplace=True) on the attribute
    # accessor is not guaranteed to modify `papers`.
    papers['institution'] = papers['institution'].fillna('NA')

    papers['institution'].value_counts().plot(kind='barh',
                                              figsize=(5, 3),
                                              title="Institution")

## Editors & Publishers

In [None]:
# Nullable string dtype for the three venue-related columns; `journal` and
# `publisher` are plotted in the following cells.
papers['journal'] = pd.Series(papers['journal'], dtype="string")
papers['editor'] = pd.Series(papers['editor'], dtype="string")
papers['publisher'] = pd.Series(papers['publisher'], dtype="string")

if papers['editor'].isnull().all():
    print('No values to plot for Editor')
else:
    # Assign the result back: a chained fillna(inplace=True) on the attribute
    # accessor is not guaranteed to modify `papers`.
    papers['editor'] = papers['editor'].fillna('NA')
    papers['editor'].value_counts().plot(kind='barh',
                                         figsize=(5, 3),
                                         title="Editor")

In [None]:
# x top relevant publishers/journals - see why we have so many nans

if papers['journal'].isnull().all():
    print('No values to plot for Journal')
else:
    # Assign the result back: a chained fillna(inplace=True) on the attribute
    # accessor is not guaranteed to modify `papers`.
    papers['journal'] = papers['journal'].fillna('NA')
    papers['journal'].value_counts().plot(kind='barh',
                                          figsize=(5, 15),
                                          title="Journal")

In [None]:
if papers['publisher'].isnull().all():
    print('No values to plot for Publisher')
else:
    # Assign the result back: a chained fillna(inplace=True) on the attribute
    # accessor is not guaranteed to modify `papers`.
    papers['publisher'] = papers['publisher'].fillna('NA')
    papers['publisher'].value_counts().plot(kind='barh',
                                            figsize=(5, 8),
                                            title="Publisher")

## Use Cases

In [None]:
# Nullable string dtype, so missing use cases are proper NA values
papers['use_case'] = pd.Series(papers['use_case'], dtype="string")

if papers['use_case'].isnull().all():
    print('No values to plot for Use cases')
else:
    # Assign the result back: a chained fillna(inplace=True) on the attribute
    # accessor is not guaranteed to modify `papers`, and the categorisation
    # cell further down expects string values in this column.
    papers['use_case'] = papers['use_case'].fillna('NA')

    papers['use_case'].value_counts().plot(kind='barh',
                                           figsize=(5, 3),
                                           title="Use cases")

## Technology

In [None]:
# Nullable string dtype, so missing technologies are proper NA values
papers['technology'] = pd.Series(papers['technology'], dtype="string")

if papers['technology'].isnull().all():
    print('No values to plot for Technology')
else:
    # Assign the result back: a chained fillna(inplace=True) on the attribute
    # accessor is not guaranteed to modify `papers`, and the categorisation
    # cell further down calls .lower() on every value of this column.
    papers['technology'] = papers['technology'].fillna('NA')

    papers['technology'].value_counts().plot(kind='barh',
                                             figsize=(5, 20),
                                             title="Technology")

In [None]:
# categorize the technologies in a more abstract group
def append_technology_usecase(uc_element, tech_cat):
    """Record one (use case, technique) row per use case named in ``uc_element``.

    ``uc_element`` may list several use cases separated by ``;``. Each
    non-empty, whitespace-trimmed fragment adds one entry to the module-level
    lists ``use_case``, ``techniques``, ``paper_ids`` and ``pub_years``.

    The paper id and publication year are read from the module-level names
    ``pid`` and ``year``, which the calling loop sets for the current paper.

    Parameters
    ----------
    uc_element : str
        Use-case text of one paper, e.g. ``"Anomaly detection; Traffic volume prediction"``.
    tech_cat : str
        Technique category stored next to every extracted use case.
    """
    # Normalise the one known spelling variant before splitting
    uc_element = uc_element.replace('(Sub)trajectory Classification',
                                    '(Sub)trajectory classification')

    # str.split returns the whole string when no ';' is present, so one loop
    # covers single and multiple use cases. strip() removes any surrounding
    # whitespace; empty fragments (e.g. after a trailing ';') are skipped
    # instead of raising an IndexError.
    for fragment in uc_element.split(";"):
        fragment = fragment.strip()
        if not fragment:
            continue

        use_case.append(fragment)
        techniques.append(tech_cat)
        paper_ids.append(pid)
        pub_years.append(year)

# Row-wise accumulators filled by append_technology_usecase()
techniques = []
paper_ids = []
pub_years = []
use_case = []

# Markers attached to some rules below. A rule is suppressed only when ALL of
# its markers occur in the technology text - this is exactly what the previous
# chain of `"x" not in technology` terms joined with `or` evaluated to.
# NOTE(review): if "any marker excludes the category" was intended, replace
# all() with any() in the loop below. That changes the resulting counts, so
# confirm against the figures of the paper first.
_HYBRID_MARKERS = ("cnn-lstm", "crnn", "attention", "encoder", "decoder")
_GRAPH_MARKERS = ("convolutional", "recurrent")

# (category, keywords, markers), evaluated in this order for every paper.
# A paper can match several rules and then contributes one row per category.
_TECH_RULES = [
    ("RNN", ("lstm", "rnn", "gru", "recurrent"), _HYBRID_MARKERS),
    ("CNN", ("cnn", "convolution"), _HYBRID_MARKERS),
    ("FNN", (" nn", "fully connected", "fully-connected", "fnn", "dense", "mlp"), _HYBRID_MARKERS),
    ("GNN", ("gnn", "gcn", "graph"), _GRAPH_MARKERS),
    ("AE", ("autoencoder", "encoder", "decoder"), ()),
    ("Attention", ("attention", "former", "bert"), ()),
    ("GAN", ("generative", "gan", "adversarial"), ()),
    ("DRL", ("reinforcement", "drl", " rl"), ()),
    ("CRNN", ("cnn-lstm", "crnn", "hybrid", "clstm"), ()),
]

# `pid` and `year` must keep these names: append_technology_usecase() reads
# them as module-level variables.
for pid, year, uc, technology in zip(papers['key'].values,
                                     papers['year'].values,
                                     papers['use_case'],
                                     papers['technology'].values):

    # Missing values are handled like the 'NA' placeholder used for these columns
    technology = 'na' if pd.isna(technology) else technology.lower()
    if pd.isna(uc):
        uc = 'NA'

    # Entries marked as (classical) ML are left out of the DL overview.
    # "mlp" is removed before the check: it contains "ml", so every MLP entry
    # used to be skipped here and the "mlp" keyword of the FNN rule could
    # never match.
    if "ml" in technology.replace("mlp", ""):
        continue

    matched_any = False
    for tech_category, keywords, markers in _TECH_RULES:
        has_keyword = any(keyword in technology for keyword in keywords)
        suppressed = bool(markers) and all(marker in technology for marker in markers)
        if has_keyword and not suppressed:
            append_technology_usecase(uc, tech_category)
            matched_any = True

    # Fallback category: explicitly labelled "other" or no rule matched
    if ("other" in technology) or not matched_any:
        append_technology_usecase(uc, 'Other DL')

# All four lists must have the same length (one entry per extracted row)
print(len(techniques),len(paper_ids), len(pub_years), len(use_case))

use_case_tech_df = pd.DataFrame({"paper_ids": paper_ids,
                                 "pub_years": pub_years,
                                 "use_case": use_case,
                                 "techniques": techniques})

In [None]:
list(use_case_tech_df.use_case.unique())

In [None]:
## Align the use-case names with the wording used in the paper:
## every key below is replaced by its value, all other names stay unchanged.
use_case_renames = {
    "Estimated Time of Arrival": "Arrival time prediction",
    "Next location / final destination prediction": "Next location / destination prediction",
}

konsistent_ucs = [use_case_renames.get(uc, uc) for uc in use_case_tech_df.use_case]

use_case_tech_df["use_case"] = konsistent_ucs

### Visualize techniques per use-cases over the years

In [None]:
#create fixed color pallette
palette = sns.color_palette(cc.glasbey, n_colors=25)
display(palette)

#### Make sure the colors match the paper theme

In [None]:
# Position of each use case's colour inside the fixed palette
task_color_index = {'Trajectory prediction/imputation': 4,
                    '(Sub)trajectory classification': 7,
                    'Traffic volume prediction': 3,
                    'Next location / destination prediction': 0,
                    'Anomaly detection': 6,
                    'Synthetic data generation': 5,
                    'Location classification': 2,
                    'Arrival time prediction': 1}

# Convert the palette once and look the hex colours up by position
palette_hex = palette.as_hex()
task_palette = {task: palette_hex[idx] for task, idx in task_color_index.items()}

In [None]:
# Technique categories take the palette colours 8..17, in this order
tech_names = ['RNN', 'CNN', 'GAN', 'CRNN', 'Attention',
              'GNN', 'AE', 'Other DL', 'DRL', 'FNN']
tech_palette = dict(zip(tech_names, palette.as_hex()[8:18]))

In [None]:
# Keep only rows published after 2017, ordered by publication year
published_after_2017 = use_case_tech_df["pub_years"] > 2017
use_case_tech_df = use_case_tech_df.loc[published_after_2017].sort_values("pub_years")

### plot a trend 

In [None]:
# Side-by-side trend lines: techniques (left) and use cases (right) per year
fig, axs = plt.subplots(1, 2, figsize=(15, 7), layout='constrained')

# The loop variable is called `topic_palette`, not `palette`: reusing the name
# `palette` overwrites the module-level glasbey palette with a dict, after
# which the cells that call `palette.as_hex()` fail when they are re-run.
for ax, topic, topic_palette, lgnd_pos_x in zip(axs,
                                                ["techniques", "use_case"],
                                                [tech_palette, task_palette],
                                                [0.32, 0.95]
                                               ):

    # Number of rows per (year, category) for the current topic
    count_df = pd.DataFrame(use_case_tech_df.value_counts(subset = ["pub_years", topic])).reset_index()
    count_df.columns = ["pub_years", topic, "counts"]

    ax.set_title("{} over the past 5 years".format(topic), fontsize=24)
    sns.lineplot(data=count_df,
                 x="pub_years",
                 y="counts",
                 hue=topic,
                 palette=topic_palette,
                 style=topic,
                 markers=True,
                 linewidth = 2.5,
                 markersize=10,
                 ax=ax)

    # NOTE(review): `size` sets the tick length; if larger tick labels were
    # intended, `labelsize` is the matching parameter - confirm.
    ax.tick_params(axis='x', size= 32)
    ax.set_ylim(0, 50)
    ax.set(xlabel=None)

    # Replace the per-axes legend by a figure-level legend below the plots
    ax.get_legend().remove()
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, bbox_to_anchor=(lgnd_pos_x,-0.05), fontsize=20)

plt.savefig("trend_suplots.pdf", bbox_inches='tight')

In [None]:
# Number of rows per (year, technique category)
tech_count_df = (use_case_tech_df
                 .value_counts(subset=["pub_years", "techniques"])
                 .reset_index()
                 .set_axis(["pub_years", "techniques", "counts"], axis=1))

# One line per technique; colour and line style both encode the technique
sns.lineplot(x="pub_years",
             y="counts",
             hue="techniques",
             style="techniques",
             data=tech_count_df,
             palette=tech_palette,
             markers=True,
             markersize=10,
             linewidth=2.5)

# Axis cosmetics: fixed ticks, no x label, descriptive y label
plt.yticks(np.arange(0, 50, 5), fontsize=16)
plt.ylabel(" ")
plt.ylabel("Number of NN designs per publication", fontsize=20)
plt.xticks(list(range(2018, 2024)), fontsize=16, rotation=75)
plt.xlabel(" ")
plt.legend(bbox_to_anchor=(1.0, 1.0), fontsize=16)

plt.savefig("dl_tech_trend.pdf", bbox_inches='tight')

In [None]:
# Number of rows per (year, use case)
tech_count_df = pd.DataFrame(use_case_tech_df.value_counts(subset = ["pub_years", "use_case"])).reset_index()
tech_count_df.columns = ["pub_years", "use_case", "counts"]

sns.lineplot(data=tech_count_df,
             x="pub_years",
             y="counts",
             hue="use_case",
             palette=task_palette,
             style="use_case",
             markers=True,
             linewidth = 2.5,
             markersize=10)

plt.yticks(np.arange(0,50, 5, dtype=None), fontsize=16)
plt.xticks([2018, 2019, 2020, 2021, 2022, 2023], fontsize=16, rotation = 75)
plt.xlabel(" ")
plt.ylabel("Number of use cases per publication", fontsize=20)
handles, labels = plt.gca().get_legend_handles_labels()

# Order the legend entries alphabetically. After sorting, legend_df.index
# holds the original position of every entry in alphabetical label order.
legend_df = pd.DataFrame()
legend_df["labels"] = labels
legend_df["handles"] = handles

legend_df = legend_df.sort_values(by=['labels'], ascending=True)

# Pass the sorted entries explicitly; without them plt.legend() rebuilds the
# legend in plotting order and the sorting above has no effect.
plt.legend([handles[i] for i in legend_df.index],
           [labels[i] for i in legend_df.index],
           bbox_to_anchor=(1.0, 1.0), fontsize=14)

plt.savefig("usecases_trend.pdf", bbox_inches='tight')

In [None]:
# One subplot per publication year: technique counts, split by use case
fig, axs = plt.subplots(3, 2, figsize=(15, 15), layout='constrained')

x_ticks_ordered = ['AE', 'Attention', 'CNN', 'DRL', 'FNN', 'GAN', 'GNN', 'CRNN', 'Other DL', 'RNN']

# label -> handle, collected over all subplots. Taking the handles from a
# single year would miss any use case that does not occur in that year.
legend_entries = {}

for ax, year in zip(axs.flat,use_case_tech_df.pub_years.unique()):
    ax.set_title("publication year= {}".format(int(year)), fontsize=20)

    sns.countplot(data=use_case_tech_df[use_case_tech_df.pub_years==year],
                  x = "techniques",
                  hue="use_case",
                  ax=ax,
                  palette=task_palette,
                  order=x_ticks_ordered,
                  width=1)
    ax.get_legend().remove()

    # White separators between neighbouring technique groups
    for x in ax.get_xticks():
        ax.axvline(x+.5,color='white')

    ax.tick_params(axis='x', rotation=90, labelsize= 16)
    ax.tick_params(axis='y', labelsize= 16)
    ax.set_ylim(0, 10)
    ax.set(xlabel=None); ax.set(ylabel=None)

    for handle, label in zip(*ax.get_legend_handles_labels()):
        legend_entries.setdefault(label, handle)

# Shared legend in alphabetical order. The order is applied exactly once; the
# previous code permuted handles and labels twice with an index taken from
# another cell's legend_df, which scrambles the order and can raise an
# IndexError when the 2018 subplot has fewer use cases.
labels = sorted(legend_entries)
handles = [legend_entries[label] for label in labels]

fig.legend(handles, labels,
           loc='lower center', bbox_to_anchor=(0.5,-0.20), fontsize=18)

plt.tight_layout()

fig.savefig("grid_plot.pdf", bbox_inches='tight')

### Visualization of DL techniques over the past years for trajectory data

This plot only shows the techniques regardless of the specific use case

In [None]:
# Grouped bars: number of rows per technique for every publication year
plt.figure()
plt.title("DL-Technology for trajectory data over the past 8 years", fontsize=20)

# Integer years give clean tick labels on the x axis
use_case_tech_df["pub_years"] = use_case_tech_df["pub_years"].astype(int)
sns.countplot(x="pub_years", hue="techniques", data=use_case_tech_df, palette=tech_palette)

# Axis cosmetics: fixed y ticks, rotated x ticks, blank axis labels
plt.yticks(np.arange(0, 50, 5), fontsize=16)
plt.ylabel(" ")
plt.xticks(fontsize=16, rotation=75)
plt.xlabel(" ")
plt.legend(bbox_to_anchor=(1.0, 1.0), fontsize=16)

In [None]:
# Total number of rows per technique category over all remaining years
uc_counts = (use_case_tech_df
             .value_counts(subset=["techniques"])
             .reset_index()
             .set_axis(["techniques", "counts"], axis=1))
uc_counts

In [None]:
# Number of papers per publication year, sorted chronologically
pub_count_df = pd.DataFrame(papers.value_counts(subset = ["year"])).reset_index()
pub_count_df.columns = ["pub_years", "counts"]
pub_count_df = pub_count_df.sort_values("pub_years")

# Counts in the same (chronological) order as pub_count_df. Every year occurs
# exactly once in pub_count_df, so the column can be read directly instead of
# filtering and summing the frame once per year.
pub_perYear = pub_count_df["counts"].tolist()

display(pub_perYear)
pub_count_df

### Stacked Plot (for paper)

In [None]:
# Number of rows per (year, technique); input of the stacked technique plot
tech_count_df = (use_case_tech_df
                 .value_counts(subset=["pub_years", "techniques"])
                 .reset_index()
                 .set_axis(["pub_years", "techniques", "counts"], axis=1))

In [None]:
# Year x technique count matrix (missing combinations become 0).
# pivot() places every count in the row of its own year. The previous manual
# loop copied the counts positionally for techniques present in every year,
# although the row order of tech_count_df (sorted by count) does not have to
# match the year order of the target frame.
df = (tech_count_df
      .pivot(index="pub_years", columns="techniques", values="counts")
      .reindex(columns=tech_count_df.techniques.unique())  # keep the column order of the counts table
      .fillna(0)
      .astype(int)
      .sort_index()
      .rename_axis(columns=None))
display(df.reset_index())

# Normalise every row, so each bar shows the technique shares of one year
df = df.div(df.sum(axis=1), axis=0)
display(df)

# DataFrame.plot creates its own figure, so no separate plt.figure() call is
# needed (it only produced an additional empty figure).
ax = df.plot(kind='bar', stacked=True, color=tech_palette)
ax.set_xlabel(" ")
ax.set_ylabel(" ")

plt.xticks(fontsize=12, rotation = 90)
plt.yticks(fontsize=12)

# Write the number of publications of each year above its bar. The count is
# looked up by year: pub_count_df covers all years of `papers`, while this
# plot only contains the years that remain after filtering, so a positional
# lookup can label the bars with the counts of other years.
year_totals = pub_count_df.set_index("pub_years")["counts"]
for bar_pos, pub_year in enumerate(df.index):
    if pub_year in year_totals.index:
        ax.text(bar_pos, 1, year_totals[pub_year],
                ha="center", va="bottom", color="k", fontsize=12)

# Legend in alphabetical order
handles, labels = plt.gca().get_legend_handles_labels()
legend_df = pd.DataFrame()
legend_df["labels"] = labels
legend_df["handles"] = handles

legend_df = legend_df.sort_values(by=['labels'], ascending=True)

plt.legend([handles[i] for i in legend_df.index], [labels[i] for i in legend_df.index],
          bbox_to_anchor=(1.0, 1.0), fontsize=16)
plt.savefig("stack_plot_Techniques.pdf", bbox_inches='tight')
plt.show()

In [None]:
df.RNN.mean()

In [None]:
# Number of rows per (year, use case); input of the stacked use-case plot
uc_count_df = (use_case_tech_df
               .value_counts(subset=["pub_years", "use_case"])
               .reset_index()
               .set_axis(["pub_years", "use_case", "counts"], axis=1))

In [None]:
# Year x use-case count matrix (missing combinations become 0).
# pivot() places every count in the row of its own year. The previous manual
# loop copied the counts positionally for use cases present in every year,
# although the row order of uc_count_df (sorted by count) does not have to
# match the year order of the target frame.
df = (uc_count_df
      .pivot(index="pub_years", columns="use_case", values="counts")
      .reindex(columns=uc_count_df.use_case.unique())  # keep the column order of the counts table
      .fillna(0)
      .astype(int)
      .sort_index()
      .rename_axis(columns=None))
display(df.reset_index())

# Normalise every row, so each bar shows the use-case shares of one year
df = df.div(df.sum(axis=1), axis=0)
display(df)

# DataFrame.plot creates its own figure, so no separate plt.figure() call is
# needed (it only produced an additional empty figure).
ax = df.plot(kind='bar', stacked=True, color=task_palette)

# Write the number of publications of each year above its bar. The count is
# looked up by year: pub_count_df covers all years of `papers`, while this
# plot only contains the years that remain after filtering, so a positional
# lookup can label the bars with the counts of other years.
year_totals = pub_count_df.set_index("pub_years")["counts"]
for bar_pos, pub_year in enumerate(df.index):
    if pub_year in year_totals.index:
        ax.text(bar_pos, 1, year_totals[pub_year],
                ha="center", va="bottom", color="k", fontsize=12)

plt.xlabel(" ")
plt.ylabel(" ")

plt.xticks(fontsize=12, rotation = 90)
plt.yticks(fontsize=12)

# Legend in alphabetical order
handles, labels = plt.gca().get_legend_handles_labels()
legend_df = pd.DataFrame()
legend_df["labels"] = labels
legend_df["handles"] = handles

legend_df = legend_df.sort_values(by=['labels'], ascending=True)

plt.legend([handles[i] for i in legend_df.index], [labels[i] for i in legend_df.index],
          bbox_to_anchor=(1.0, 1.0), fontsize=16)

plt.savefig("stack_plot_UseCases.pdf", bbox_inches='tight')
plt.show()

# Semantic Analysis

In [None]:
from litstudy.types import Document, Author, DocumentSet, DocumentIdentifier, Affiliation

class AITAuthor(Author):
    """Author of an :class:`AITDocument`; only the name is known."""

    def __init__(self, name):
        self._name = name

    @property
    def name(self) -> str:
        return self._name


class AITDocument(Document):
    """litstudy document backed by one row (a dict) of the papers CSV.

    The row is kept in ``self.entry``; all properties read from it on access.
    """

    def __init__(self, entry):
        # NOTE(review): the "key" column is passed on as the DOI of the
        # identifier - confirm that this column really holds DOIs.
        doi = entry["key"] or None
        title = entry["title"]

        super().__init__(DocumentIdentifier(title, doi=doi))
        self.entry = entry

    @property
    def title(self) -> str:
        return self.entry.get("title")

    @property
    def authors(self):
        """Authors of the paper, or ``None`` when the row names no author.

        The previous implementation referenced the undefined name
        ``IEEEAuthor`` and the columns "Authors" / "Author Affiliations",
        so any access raised an error.
        """
        # NOTE(review): assumes an "author" column with names separated by
        # " and ", like the `author` column used in the exploratory part of
        # this notebook - confirm for the CSV that is loaded here.
        raw_authors = self.entry.get("author") or ""
        names = [name.strip() for name in raw_authors.split(" and ")]
        authors = [AITAuthor(name) for name in names if name]
        return authors or None

    @property
    def publication_year(self):
        """Publication year as int, or ``None`` if it is missing or not numeric."""
        try:
            return int(self.entry["year"])
        except Exception:
            return None

    @property
    def abstract(self):
        """Abstract text, or ``None`` when the field is missing or empty."""
        return self.entry.get("abstract") or None



In [None]:
import csv
from litstudy.common import robust_open

# NOTE(review): this cell reads './papers.csv', while the exploratory part of
# the notebook loads "papers_sorted_VF.csv" - confirm that the different input
# file is intended.
pp = './papers.csv'

# Wrap every CSV row (a dict keyed by the column names) in an AITDocument
with robust_open(pp) as f:
    lines = csv.DictReader(f)
    docs = [AITDocument(line) for line in lines]

docs = DocumentSet(docs)

# Corpus that the word-distribution and topic-model cells below work on
corpus = litstudy.build_corpus(docs, ngram_threshold=0.8)

In [None]:
litstudy.plot_cocitation_network(docs, max_edges=500)

In [None]:
litstudy.compute_word_distribution(corpus).filter(like='_', axis=0).sort_index()

In [None]:
plt.figure(figsize=(20, 3))
litstudy.plot_word_distribution(corpus, limit=50, title="Top words", vertical=True, label_rotation=45);

In [None]:
num_topics = 25
topic_model = litstudy.train_nmf_model(corpus, num_topics, max_iter=250)

In [None]:
for i in range(num_topics):
    print(f'Topic {i+1}:', topic_model.best_tokens_for_topic(i))

In [None]:
plt.figure(figsize=(15, 10))
litstudy.plot_topic_clouds(topic_model, ncols=5);
plt.savefig("25_topics.eps", bbox_inches='tight')

In [None]:
plt.figure(figsize=(20, 20))
litstudy.plot_embedding(corpus, topic_model);

In [None]:
# Topic that the model associates most strongly with the token 'anomaly';
# `topic_id` is reused by the year-histogram cell further down.
topic_id = topic_model.best_topic_for_token('anomaly')
# Print the titles of up to ten documents returned for that topic
for doc_id in topic_model.best_documents_for_topic(topic_id, limit=10):
    print(docs[int(doc_id)].title)

In [None]:
docs

In [None]:
# Flag every document whose weight for `topic_id` (set in the cell that looks
# up the 'anomaly' topic) exceeds the threshold. The flag is named `dl_topic`
# although it marks the anomaly topic.
threshold = 0.2
dl_topic = topic_model.doc2topic[:, topic_id] > threshold

# Attach the flag to the documents as the property 'dl_topic'
docs = docs.add_property('dl_topic', dl_topic)


# Legend label -> expression over the document properties; also used by the
# cell below that computes the yearly percentages
groups = {
    'anomaly related': 'dl_topic',
    'other': 'not dl_topic',
}

litstudy.plot_year_histogram(docs, groups=groups, stacked=True);

In [None]:
table = litstudy.compute_year_histogram(docs, groups=groups)
table.div(table.sum(axis=1), axis=0) * 100