This notebook cleans the paper abstracts and summarizes them into the dataframes required for the analysis.


In [7]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [1]:
# Importing modules
import pandas as pd
from collections import Counter

import re

import gensim
from gensim.utils import simple_preprocess

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])



### Load Data

In [8]:
# Read the raw export and keep only the columns used downstream
# (title, abstract and publication year).
papers = pd.read_csv('/data/questa_data.csv')

# Drop the last 6 rows (they belong to PubYear = 2022).
# The explicit .copy() turns the selection into an independent frame, so the
# columns added to `papers` later are not written into a slice of the raw
# frame (which pandas reports as SettingWithCopyWarning).
papers = papers[['Title', 'Abstract', 'PubYear']].iloc[:-6, :].copy()


## Text Parsing

In [7]:

# Character-level clean-up of the abstracts. The patterns are raw strings so that
# the regex escapes (\[ and \]) are not parsed as (invalid) Python string escapes,
# and they are compiled once instead of once per abstract.

# Punctuation that is simply deleted. '/' and '-' are deliberately absent, so
# Kendall notation such as M/M/s survives this step.
_PUNCTUATION = re.compile(r"[,;:.!?{}()\[\]']")
# Separators that become a blank so the words on either side stay distinct.
_SEPARATORS = re.compile(r'[&+_-]')
# Anything that is still not a letter, digit, blank or '/' is dropped.
_NOT_ALLOWED = re.compile(r'[^A-Za-z0-9 /]')


def _clean_abstract(raw):
    """Return one abstract lower-cased and stripped of punctuation.

    `raw` is passed through str() first, so a missing abstract (NaN) becomes
    the literal text 'nan'.
    """
    text = str(raw).lower()
    text = _PUNCTUATION.sub('', text)
    text = _SEPARATORS.sub(' ', text)
    return _NOT_ALLOWED.sub('', text)


papers['paper_text_processed'] = papers['Abstract'].map(_clean_abstract)

## Re-word some texts and change Kendall's notations

In [8]:
# Collapse every spelling variant of a theme keyword into ONE made-up token
# (e.g. 'single server' -> 'singleserver') so that tokenisation and
# lemmatisation cannot split or alter it afterwards.


def _longest_first(variants):
    """Join literal phrases into a regex alternation, longest phrase first.

    `re` tries alternatives from left to right and stops at the first one that
    matches. With the singular listed before the plural, 'laplace transforms'
    was rewritten as 'laptransform' + 's' (and likewise 'largedeviations',
    'queuenetworks', 'priorityqueueing'). Sorting by length makes the longer
    variant win.
    """
    return '|'.join(sorted(variants, key=len, reverse=True))


# Spelling variants per keyword in the themes 'models', 'methods' and 'concepts'.
# The hyphenated spellings are kept for completeness; the cleaning step has
# already replaced '-' by a blank, so only the spaced forms can still occur.
# 'single server' words
ss_words = _longest_first(['single-server', 'single server'])
# 'multi server' words
ms_words = _longest_first(['multi-server', 'multi server', 'multiple-server',
                           'multiple server', 'multiple servers'])
# 'queueing network'
qn_words = _longest_first(['queueing network', 'queueing networks', 'network of queues'])
# 'priority queue'
pq_words = _longest_first(['priority queues', 'priority queue', 'priority queueing'])
# 'laplace transform'
lt_words = _longest_first(['laplace transform', 'laplace transforms',
                           'laplace-stieltjes transforms', 'laplace-stieltjes transform',
                           'laplace stieltjes transform', 'laplace stieltjes transforms'])
# 'large deviation'
ld_words = _longest_first(['large deviation', 'large deviations',
                           'large-deviation', 'large-deviations'])
# 'fluid limit'
fl_words = _longest_first(['fluid limit', 'fluid limits', 'fluid-limit'])
# 'tail asymptotic'
ta_words = _longest_first(['tail asymptotic', 'tail asymptotics'])
# 'product form'
pf_words = _longest_first(['product form', 'product forms', 'product-form'])
# 'bandwidth' is renamed to the made-up token 'widthband' because lemmatization
# removed the word 'bandwidth'.
bw_words = _longest_first(['bandwidth', 'bandwidths'])

# Kendall notation (A/B/c or A/B/c/K). Raw strings keep \w and \/ as regex escapes.
# Single server: the server field is exactly '1'. The (?!\w) guard stops 'm/m/10'
# from being read as 'm/m/1' followed by a stray '0'.
single_s_types = '|'.join([r'\w*\/\w*\/1\/\w*', r'\w*\/\w*\/1(?!\w)'])
# Every remaining a/b/c(/d) pattern is treated as a multi-server queue.
multi_s_types = '|'.join([r'\w*\/\w*\/\w*\/\w*', r'\w*\/\w*\/\w*'])

# (pattern, replacement) pairs, applied strictly in this order. The single-server
# Kendall pattern must run before the generic multi-server one.
_relabel_rules = [
    ('markovian', 'markov'),
    # lemmatization sometimes maps 'polling' to 'poll'; force it everywhere
    ('polling', 'poll'),
    # concatenate the two words
    ('regular variation', 'regvariation'),
    (ss_words, 'singleserver'),
    (ms_words, 'multiserver'),
    (qn_words, 'queuenetwork'),
    (pq_words, 'priorityqueue'),
    (lt_words, 'laptransform'),
    (ld_words, 'largedeviation'),
    (fl_words, 'limitfluid'),
    (ta_words, 'tailasymptotic'),
    (pf_words, 'productform'),
    (bw_words, 'widthband'),
    (single_s_types, 'singleserver'),
    (multi_s_types, 'multiserver'),
]

for _pattern, _replacement in _relabel_rules:
    _compiled = re.compile(_pattern)
    # Default arguments bind the current rule to the lambda.
    papers['paper_text_processed'] = papers['paper_text_processed'].map(
        lambda text, rx=_compiled, rep=_replacement: rx.sub(rep, str(text))
    )

## Tokenize  and remove stop words 
 

#### Tokenize (convert each sentence into words) and remove stopwords

In [10]:
# NLTK's English stop words plus corpus-specific filler: words that occur in almost
# every abstract ('queue', 'queueing', ...) and artefacts such as 'nan', which is
# what a missing abstract turns into after str().
stop_words = stopwords.words('english') + [
    'from', 'subject', 're', 'edu', 'use',
    'queue', 'queues', 'queueing',
    'editorial', 'abstract', 'nan',
]

def sent_to_words(sentences):
    """Lazily tokenise each text with gensim's `simple_preprocess`.

    Yields one list of tokens per input item. Items are converted with str()
    first; tokens shorter than 3 characters are dropped (min_len=3) and accents
    are stripped (deacc=True).
    """
    for raw_text in map(str, sentences):
        tokens = gensim.utils.simple_preprocess(raw_text, min_len=3, deacc=True)
        yield tokens

def remove_stopwords(texts):
    """Return, for every document in `texts`, its tokens that are not in `stop_words`.

    Each document is converted with str() and run through `simple_preprocess`
    again before filtering, so it may be a plain string or an already tokenised
    list.
    NOTE(review): for a token list this re-tokenises the list's repr with
    simple_preprocess' default length limits -- confirm that is intended.
    """
    filtered_docs = []
    for doc in texts:
        kept = []
        for word in simple_preprocess(str(doc)):
            if word not in stop_words:
                kept.append(word)
        filtered_docs.append(kept)
    return filtered_docs

# Cleaned abstracts as a plain Python list of strings.
data = papers['paper_text_processed'].values.tolist()

# Tokenise every abstract, then drop the stop words -> a list of token lists.
data_words = remove_stopwords(list(sent_to_words(data)))


##   Lemmatization

In [11]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise tokenised documents with the spaCy pipeline `nlp`.

    texts: iterable of token lists; each list is joined with blanks and parsed.
    allowed_postags: only tokens whose part-of-speech tag is listed are kept.
    Returns a list with one list of lemmas per input document.
    """
    def _lemmas_of(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas_of(sent) for sent in texts]


data_lemmatized = lemmatization(data_words)    # a list of lists

## Create a dataframe containing the lemmatized abstracts and topic flags for 'models', 'methods' and 'concepts'


In [None]:
# Theme keywords, written as the made-up tokens produced by the relabelling step.
models = ['singleserver', 'multiserver', 'queuenetwork', 'poll', 'vacations', 'priorityqueue', 'tandem']
methods = ['laptransforms', 'largedeviation', 'diffusion', 'limitfluid']
concepts = ['insensitivity', 'tailasymptotic', 'productform', 'reversibility',  'widthband', 'stability']

# Send each keyword through the same tokenise + lemmatise path as the abstracts,
# so the keyword and its occurrences in the text end up in the same form.
# Every keyword becomes one string (its lemmas joined by a blank).
lemmatized_models = [
    " ".join(lemmas) for lemmas in lemmatization(list(sent_to_words(models)))
]
lemmatized_methods = [
    " ".join(lemmas) for lemmas in lemmatization(list(sent_to_words(methods)))
]
lemmatized_concepts = [
    " ".join(lemmas) for lemmas in lemmatization(list(sent_to_words(concepts)))
]

In [14]:
# df_topics starts as a one-column frame holding each paper's publication year ...
pub_years = papers['PubYear']
df_topics = pub_years.to_frame()

# ... and gets the lemmatized abstract as a single blank-separated string per paper.
flaten_data_lemmatized = list(map(" ".join, data_lemmatized))
df_topics['lemmatized_Abstract'] = flaten_data_lemmatized

In [16]:
# Flag, for every paper, which theme keywords occur in its lemmatized abstract.
# The check is a plain substring test on the blank-joined lemma string.

# one string per paper (flattened list of lists)
lemmatized_text = [" ".join(lemmas) for lemmas in data_lemmatized]

# one boolean column in df_topics per keyword
for model in lemmatized_models:        # models
    df_topics[model] = [model in abstract for abstract in lemmatized_text]

for method in lemmatized_methods:      # methods
    df_topics[method] = [method in abstract for abstract in lemmatized_text]

for concept in lemmatized_concepts:    # concepts
    df_topics[concept] = [concept in abstract for abstract in lemmatized_text]

df_topics.shape   # 19 columns

(1750, 19)

In [39]:
# For every keyword column, collect the PubYear of each paper flagged True.
# Each result is a list of lists, in the same order as the keyword list it is built from.
lst_models_yrs = [
    list(df_topics.PubYear[df_topics[keyword]]) for keyword in lemmatized_models
]
lst_methods_yrs = [
    list(df_topics.PubYear[df_topics[keyword]]) for keyword in lemmatized_methods
]
lst_concepts_yrs = [
    list(df_topics.PubYear[df_topics[keyword]]) for keyword in lemmatized_concepts
]

In [18]:
# Persist the pre-processed abstracts together with their keyword flags.
df_topics.to_csv(path_or_buf='/results/abstracts_preprocessed.csv')

## Create dataframe for themes

In [11]:
# Merge the per-keyword year lists of each theme into one flat list of years.
flat_lst_models = sum(lst_models_yrs, [])
flat_lst_methods = sum(lst_methods_yrs, [])
flat_lst_concepts = sum(lst_concepts_yrs, [])

def count_theme_yrs(lst):
    """Count the years in `lst` and scale each count by `numpapers`.

    Returns (years, shares): the distinct years in ascending order and, for each,
    its count divided by the matching entry of the global `numpapers`, rounded
    to 2 decimals.
    NOTE(review): counts and `numpapers` are paired by position, not by year. If a
    year is absent from `lst`, later counts are divided by another year's total --
    confirm `numpapers` lines up with the years returned here.
    """
    years, hits = [], []
    for year, n_hits in sorted(Counter(lst).items()):
        years.append(year)
        hits.append(n_hits)
    shares = [round(n_hits / total, 2) for n_hits, total in zip(hits, numpapers)]
    return years, shares


# yearly share per theme
x_model, y_model = count_theme_yrs(flat_lst_models)
x_method, y_method = count_theme_yrs(flat_lst_methods)
x_concept, y_concept = count_theme_yrs(flat_lst_concepts)


In [None]:
themes_length = [len(y_model), len(y_method), len(y_concept)]

# Years that appear for only one of the 'models' / 'methods' themes, and the
# positions in x_model of those years.
diff_lst = set(x_model) ^ set(x_method)
noncommon_idx_method = [idx for idx, yr in enumerate(x_model) if yr in diff_lst]


def _align_to_years(years, xs, ys):
    """Return the `ys` values re-ordered to follow `years`, with 0 for missing years.

    `xs` holds the year belonging to each value in `ys`.
    """
    value_by_year = dict(zip(xs, ys))
    return [value_by_year.get(yr, 0) for yr in years]


# Put the methods/concepts values on the same year axis as the models.
# The earlier slice-based insert (y[:i] + [0] + y[1:]) lost or duplicated values
# unless the missing year sat at position 1, and failed when no year was missing.
new_y_method = _align_to_years(x_model, x_method, y_method)
new_y_concept = _align_to_years(x_model, x_concept, y_concept)

df_themes = pd.DataFrame({'Year': pd.Series(x_model), 'numpapers': pd.Series(numpapers),
                          'models': pd.Series(y_model),
                          'methods': pd.Series(new_y_method),
                          'concepts': pd.Series(new_y_concept)})

# save df_themes including the numpapers list
df_themes.to_csv('/results/df_themes_numpapers.csv', index=False)

## Create dataframe for keywords

In [12]:
# Human-readable keyword labels, one list per theme. The order matches the
# lists of relabelled tokens (models / methods / concepts) position by position.
lst_models = [
    'single server',
    'multiserver',
    'queueing network',
    'polling',
    'vacations',
    'priority queue',
    'tandem',
]
lst_methods = [
    'Laplace transforms',
    'large deviations',
    'diffusion',
    'fluid limit',
]
lst_concepts = [
    'insensitivity',
    'tail asymptotics',
    'product form',
    'reversibility',
    'bandwidth',
    'stability',
]

In [32]:
# Yearly share of papers for every single keyword: x_* are the years in which the
# keyword occurs, y_* the matching shares returned by count_theme_yrs.

# models
((x_ss, y_ss), (x_ms, y_ms), (x_qn, y_qn), (x_pl, y_pl),
 (x_vc, y_vc), (x_pq, y_pq), (x_tm, y_tm)) = map(count_theme_yrs, lst_models_yrs[:7])

# methods
((x_lt, y_lt), (x_ld, y_ld),
 (x_df, y_df), (x_fl, y_fl)) = map(count_theme_yrs, lst_methods_yrs[:4])

# concepts
((x_in, y_in), (x_ta, y_ta), (x_pf, y_pf),
 (x_rv, y_rv), (x_bw, y_bw), (x_st, y_st)) = map(count_theme_yrs, lst_concepts_yrs[:6])

# Reference year axis = the years of the 'single server' keyword.
# NOTE(review): a year without any single-server paper would be missing from this
# axis -- confirm the keyword really occurs in every publication year.
year = x_ss

def create_lst(l1, xlst, ylst):
    """Pad `ylst` with zeros so that it lines up with the reference list `l1`.

    l1:   reference labels (here: all years), in order.
    xlst: the labels for which `ylst` holds a value, in the same order as in `l1`.
    ylst: one value per label in `xlst`.
    Returns a copy of `ylst` with a 0 inserted at every position of `l1` whose
    label is not in `xlst`; `ylst` itself is left untouched.
    """
    known = set(xlst)
    padded = ylst.copy()
    # Walking l1 front to back keeps earlier insertions from shifting later positions.
    for position, label in enumerate(l1):
        if label not in known:
            padded.insert(position, 0)
    return padded


In [33]:
# Pad every keyword's yearly values with zeros so all lists follow the `year` axis.

# models
newy_ss, newy_ms, newy_qn, newy_pl, newy_vc, newy_pq, newy_tm = [
    create_lst(year, xs, ys)
    for xs, ys in [(x_ss, y_ss), (x_ms, y_ms), (x_qn, y_qn), (x_pl, y_pl),
                   (x_vc, y_vc), (x_pq, y_pq), (x_tm, y_tm)]
]

# methods
newy_lt, newy_ld, newy_df, newy_fl = [
    create_lst(year, xs, ys)
    for xs, ys in [(x_lt, y_lt), (x_ld, y_ld), (x_df, y_df), (x_fl, y_fl)]
]

# concepts
newy_in, newy_ta, newy_pf, newy_rv, newy_bw, newy_st = [
    create_lst(year, xs, ys)
    for xs, ys in [(x_in, y_in), (x_ta, y_ta), (x_pf, y_pf),
                   (x_rv, y_rv), (x_bw, y_bw), (x_st, y_st)]
]

# keyword label -> padded yearly values (models, then methods, then concepts)
prop_by_keywords = dict(zip(
    ['single server', 'multiserver', 'queueing network', 'polling', 'vacations',
     'priority queue', 'tandem',
     'Laplace transforms', 'large deviations', 'diffusion', 'fluid limit',
     'insensitivity', 'tail asymptotics', 'product form', 'reversibility',
     'bandwidth', 'stability'],
    [newy_ss, newy_ms, newy_qn, newy_pl, newy_vc,
     newy_pq, newy_tm,
     newy_lt, newy_ld, newy_df, newy_fl,
     newy_in, newy_ta, newy_pf, newy_rv,
     newy_bw, newy_st],
))


In [13]:
# Keyword-by-year table: one row per keyword, one column per year (as a string).
df_keywords_yr = pd.DataFrame.from_dict(prop_by_keywords).T

lst_colnames = [str(yr) for yr in year]
df_keywords_yr.columns = lst_colnames
df_keywords_yr.index.name = 'keywords'

# All 17 keyword labels in theme order: models, methods, concepts.
lst_keywords = lst_models + lst_methods + lst_concepts

# Theme code per keyword: 1 = models, 2 = methods, 3 = concepts.
lst_themes = [
    1 if keyword in lst_models else 2 if keyword in lst_methods else 3
    for keyword in lst_keywords
]
df_keywords_yr['themes'] = lst_themes

# save the dataframe
df_keywords_yr.to_csv('/results/df_keywords_yr.csv')


In [48]:

# Four consecutive 9-year publication periods (end year exclusive in range()).
# A finer split into nine 4-year windows was tried earlier and is no longer used.
period1 = list(range(1986, 1995))
period2 = list(range(1995, 2004))
period3 = list(range(2004, 2013))
period4 = list(range(2013, 2022))

# The periods and their display labels, in matching order.
periods = [period1, period2, period3, period4]
period_ranges = ['1986 - 1994',  '1995 - 2003', '2004 - 2012', '2013 - 2021']



In [53]:
# All lemmatized theme keywords as one flat list of single words
# (an entry containing blanks contributes one item per word).
lemmatized_themes = [
    word
    for theme in (lemmatized_models, lemmatized_methods, lemmatized_concepts)
    for entry in theme
    for word in entry.split()
]

# Map every lemmatized keyword to its human-readable label, theme by theme ...
dict_models = {lemma: label for lemma, label in zip(lemmatized_models, lst_models)}
dict_methods = {lemma: label for lemma, label in zip(lemmatized_methods, lst_methods)}
dict_concepts = {lemma: label for lemma, label in zip(lemmatized_concepts, lst_concepts)}

# ... and merge the three mappings (later themes win on a duplicate key).
dict_themes = dict(dict_models)
dict_themes.update(dict_methods)
dict_themes.update(dict_concepts)


In [14]:
#use Counter to count words in each period, and then filter out the counts for the themes

def count_themes(themes, txt):
    """Count how often each theme word occurs in the token sequence `txt`.

    Returns a dict {word: count} holding only the words of `txt` that are in
    `themes`, in order of first appearance in `txt`.
    """
    selected = {}
    for token, freq in Counter(txt).items():
        if token in themes:
            selected[token] = freq
    return selected

def count_period_papers(period):
    """Return the total number of papers published in the years of `period`.

    Reads the global `count_papers`, a Counter of the number of papers per year.
    """
    return sum(
        [count_papers[yr] for yr in count_papers if yr in period]
    )


##  Create dataframe for keywords in 4 periods 

In [15]:
# All 17 keyword labels in theme order: models, methods, concepts.
lst_keywords = lst_models + lst_methods + lst_concepts

# Theme code per keyword: 1 = models, 2 = methods, 3 = concepts.
lst_themes = [
    1 if keyword in lst_models else 2 if keyword in lst_methods else 3
    for keyword in lst_keywords
]

# One row per keyword; the per-period columns are added afterwards.
df_keywords = pd.DataFrame({'keywords': lst_keywords})
df_keywords


In [56]:
# For every period, add a column to df_keywords holding, per keyword, the number of
# keyword occurrences in that period divided by the number of papers in the period.
# NOTE(review): `txt_period_tokens` and `path` are not defined in the part of the
# notebook visible here -- make sure both exist before this cell runs.
for i, period in enumerate(periods):
    txt_period = txt_period_tokens(period)
    # presumably element [1] is the flat token list of the period -- verify
    theme_counts = count_themes(lemmatized_themes, txt_period[1])
    total_papers = count_period_papers(period)   # total number of papers in the period

    # keyword label -> proportion. Stays empty when the period has no theme hits or
    # no papers; the previous zip(*...) unpacking failed on an empty result and the
    # division failed on a period without papers.
    proportions = {}
    if total_papers:
        for lemma, hits in theme_counts.items():
            proportions[dict_themes[lemma]] = round(hits / total_papers, 3)

    # keywords that never occur in the period get 0
    colname = str(period_ranges[i])
    df_keywords[colname] = [proportions.get(keyword, 0) for keyword in lst_keywords]

df_keywords['themes'] = lst_themes

# save the dataframe
df_keywords.to_csv(path + '/Results/df_keywords.csv', index=False)

## Co-occurrence analysis

In [17]:
# Co-occurrence of keywords: for every (model, method), (model, concept) and
# (method, concept) pair, the PubYear of each paper in df_topics that is flagged
# for BOTH keywords.


def _years_with_both(first, second):
    """Return the PubYear Series of the papers flagged True in both columns."""
    return df_topics.PubYear[(df_topics[first] == True) & (df_topics[second] == True)]


_method_cols = ('laptransform', 'largedeviation', 'diffusion', 'limitfluid')
_concept_cols = ('insensitivity', 'tailasymptotic', 'productform',
                 'reversibility', 'widthband', 'stability')
# every model is paired with the 4 methods first, then with the 6 concepts
_partner_cols = _method_cols + _concept_cols

# model = 'singleserver'
(lst_ss_laptransform, lst_ss_lrgdev, lst_ss_diff, lst_ss_limitfluid,
 lst_ss_insensitivity, lst_ss_tailasymptotic, lst_ss_productform,
 lst_ss_reversibility, lst_ss_widthband, lst_ss_stability) = [
    _years_with_both('singleserver', col) for col in _partner_cols]

# model = 'multiserver'
(lst_ms_laptransform, lst_ms_lrgdev, lst_ms_diff, lst_ms_limitfluid,
 lst_ms_insensitivity, lst_ms_tailasymptotic, lst_ms_productform,
 lst_ms_reversibility, lst_ms_widthband, lst_ms_stability) = [
    _years_with_both('multiserver', col) for col in _partner_cols]

# model = 'queuenetwork'
(lst_qn_laptransform, lst_qn_lrgdev, lst_qn_diff, lst_qn_limitfluid,
 lst_qn_insensitivity, lst_qn_tailasymptotic, lst_qn_productform,
 lst_qn_reversibility, lst_qn_widthband, lst_qn_stability) = [
    _years_with_both('queuenetwork', col) for col in _partner_cols]

# model = 'poll'
(lst_pl_laptransform, lst_pl_lrgdev, lst_pl_diff, lst_pl_limitfluid,
 lst_pl_insensitivity, lst_pl_tailasymptotic, lst_pl_productform,
 lst_pl_reversibility, lst_pl_widthband, lst_pl_stability) = [
    _years_with_both('poll', col) for col in _partner_cols]

# model = 'vacation'
(lst_vn_laptransform, lst_vn_lrgdev, lst_vn_diff, lst_vn_limitfluid,
 lst_vn_insensitivity, lst_vn_tailasymptotic, lst_vn_productform,
 lst_vn_reversibility, lst_vn_widthband, lst_vn_stability) = [
    _years_with_both('vacation', col) for col in _partner_cols]

# model = 'priorityqueue'
(lst_pq_laptransform, lst_pq_lrgdev, lst_pq_diff, lst_pq_limitfluid,
 lst_pq_insensitivity, lst_pq_tailasymptotic, lst_pq_productform,
 lst_pq_reversibility, lst_pq_widthband, lst_pq_stability) = [
    _years_with_both('priorityqueue', col) for col in _partner_cols]

# model = 'tandem'
(lst_tm_laptransform, lst_tm_lrgdev, lst_tm_diff, lst_tm_limitfluid,
 lst_tm_insensitivity, lst_tm_tailasymptotic, lst_tm_productform,
 lst_tm_reversibility, lst_tm_widthband, lst_tm_stability) = [
    _years_with_both('tandem', col) for col in _partner_cols]

################# methods and concepts ###
# method = 'laplace transform'
lst_lt_in, lst_lt_ta, lst_lt_pf, lst_lt_re, lst_lt_bw, lst_lt_st = [
    _years_with_both('laptransform', col) for col in _concept_cols]

# method = 'large deviations'
lst_ld_in, lst_ld_ta, lst_ld_pf, lst_ld_re, lst_ld_bw, lst_ld_st = [
    _years_with_both('largedeviation', col) for col in _concept_cols]

# method = 'diffusion'
lst_df_in, lst_df_ta, lst_df_pf, lst_df_re, lst_df_bw, lst_df_st = [
    _years_with_both('diffusion', col) for col in _concept_cols]

# method = 'fluid limit'
lst_fl_in, lst_fl_ta, lst_fl_pf, lst_fl_re, lst_fl_bw, lst_fl_st = [
    _years_with_both('limitfluid', col) for col in _concept_cols]
   


In [60]:
# define a function to count the number of papers in each co-occurance list for a given period

def count_occurance(period, lst):
    """Return how many entries of `lst` (publication years) fall inside `period`.

    period: container of years.
    lst:    iterable of publication years, one per paper.
    """
    per_year = Counter(lst)
    return sum([hits for yr, hits in per_year.items() if yr in period])


In [61]:
# All co-occurrence Series in one fixed order: the 7 models (each paired with the
# 4 methods, then the 6 concepts), followed by the 4 methods (each paired with the
# 6 concepts). The per-period counts are produced in this same order.
occurance_lst = (
    # single server
    [lst_ss_laptransform, lst_ss_lrgdev, lst_ss_diff, lst_ss_limitfluid,
     lst_ss_insensitivity, lst_ss_tailasymptotic, lst_ss_productform,
     lst_ss_reversibility, lst_ss_widthband, lst_ss_stability]
    # multiserver
    + [lst_ms_laptransform, lst_ms_lrgdev, lst_ms_diff, lst_ms_limitfluid,
       lst_ms_insensitivity, lst_ms_tailasymptotic, lst_ms_productform,
       lst_ms_reversibility, lst_ms_widthband, lst_ms_stability]
    # queueing network
    + [lst_qn_laptransform, lst_qn_lrgdev, lst_qn_diff, lst_qn_limitfluid,
       lst_qn_insensitivity, lst_qn_tailasymptotic, lst_qn_productform,
       lst_qn_reversibility, lst_qn_widthband, lst_qn_stability]
    # polling
    + [lst_pl_laptransform, lst_pl_lrgdev, lst_pl_diff, lst_pl_limitfluid,
       lst_pl_insensitivity, lst_pl_tailasymptotic, lst_pl_productform,
       lst_pl_reversibility, lst_pl_widthband, lst_pl_stability]
    # vacations
    + [lst_vn_laptransform, lst_vn_lrgdev, lst_vn_diff, lst_vn_limitfluid,
       lst_vn_insensitivity, lst_vn_tailasymptotic, lst_vn_productform,
       lst_vn_reversibility, lst_vn_widthband, lst_vn_stability]
    # priority queue
    + [lst_pq_laptransform, lst_pq_lrgdev, lst_pq_diff, lst_pq_limitfluid,
       lst_pq_insensitivity, lst_pq_tailasymptotic, lst_pq_productform,
       lst_pq_reversibility, lst_pq_widthband, lst_pq_stability]
    # tandem
    + [lst_tm_laptransform, lst_tm_lrgdev, lst_tm_diff, lst_tm_limitfluid,
       lst_tm_insensitivity, lst_tm_tailasymptotic, lst_tm_productform,
       lst_tm_reversibility, lst_tm_widthband, lst_tm_stability]
    # Laplace transforms
    + [lst_lt_in, lst_lt_ta, lst_lt_pf, lst_lt_re, lst_lt_bw, lst_lt_st]
    # large deviations
    + [lst_ld_in, lst_ld_ta, lst_ld_pf, lst_ld_re, lst_ld_bw, lst_ld_st]
    # diffusion
    + [lst_df_in, lst_df_ta, lst_df_pf, lst_df_re, lst_df_bw, lst_df_st]
    # fluid limit
    + [lst_fl_in, lst_fl_ta, lst_fl_pf, lst_fl_re, lst_fl_bw, lst_fl_st]
)
len(occurance_lst)

94

In [62]:
def count_occurance_period(period):
    """Return, for every entry of the global `occurance_lst`, its paper count in `period`.

    The result has one count per co-occurrence pair, in the order of `occurance_lst`.
    """
    return [count_occurance(period, occurance) for occurance in occurance_lst]

    #txt_period = txt_period_tokens(period)
#print(lst_count_occurance_period1)

In [18]:
# Count the co-occurrences for each of the four periods, one result list per period.
(lst_count_occurance_period1,
 lst_count_occurance_period2,
 lst_count_occurance_period3,
 lst_count_occurance_period4) = [
    count_occurance_period(period)
    for period in (period1, period2, period3, period4)
]

In [64]:
# Row labels for the co-occurrence dataframe: one (first term, second term) tuple per pair.
# The labels are zipped positionally with the per-period counts further down,
# so the order generated here matters.

names_lst = [
    # 7 first terms x 10 second terms = 70 pairs
    (first_term, second_term)
    for first_term in ('single server', 'multiserver', 'queueing network', 'polling',
                       'vacations', 'priority queue', 'tandem')
    for second_term in ('Laplace transforms', 'large deviations', 'diffusion', 'fluid limit',
                        'insensitivity', 'tail asymptotics', 'product form', 'reversibility',
                        'bandwidth', 'stability')
] + [
    # 4 first terms x 6 second terms = 24 pairs
    # (this group spells 'Large deviations' with a capital L)
    (first_term, second_term)
    for first_term in ('Laplace transforms', 'Large deviations', 'diffusion', 'fluid limit')
    for second_term in ('insensitivity', 'tail asymptotics', 'product form', 'reversibility',
                        'bandwidth', 'stability')
]
print(len(names_lst))

94


In [65]:
# Assemble one row per bigram: the label followed by its count in each period.
# zip() pairs the lists positionally and stops at the shortest one.
occurance_rows = list(zip(
    names_lst,
    lst_count_occurance_period1,
    lst_count_occurance_period2,
    lst_count_occurance_period3,
    lst_count_occurance_period4,
))
df_occurance = pd.DataFrame(
    occurance_rows,
    columns=['bigram', 'period1', 'period2', 'period3', 'period4'],
)

# Write the table to a csv file, without the index column
df_occurance.to_csv('/results/update_abstract_co_occurance.csv', index=False)