**Exploring data from Pedro**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Excel workbook holding the data explored in this notebook.
filename = 'ExportCypherviaAPOC29Aug.xlsx'
# sheet_name=None loads every sheet and returns them in a dict keyed by sheet
# name (the old `sheetname` spelling is no longer accepted by pandas).
data = pd.read_excel(filename, sheet_name=None)
# Use the fully-qualified option key: abbreviated keys are pattern-matched and
# fail as soon as more than one option contains 'max_rows'.
pd.set_option('display.max_rows', 5)

Here I define a function to compute the number of occurrences of a pair in each year, normalised by the number of documents found in that year. The function receives the pair label in both orders, `a - b` and `b - a`. Sounds silly, but it makes the following computation easier.

In [2]:
# Sheets of the workbook that the rest of the analysis reads.
ndocs = data['Number Docs per Year']
ppy = data['Pairs per year']

# Pair labels pulled out once as a plain array, so that the per-pair lookups
# can compare against it element-wise.
TPVal = ppy['Term Pair'].values
# Number of rows in the pairs sheet.
nall = len(ppy.index)
def get_pair_evol(t3,t3_inv):
    """Return the yearly count of one term pair, normalised by documents per year.

    Rows of the module-level ``ppy`` table whose 'Term Pair' label (cached in
    ``TPVal``) equals ``t3`` or ``t3_inv`` are grouped by 'Year' and summed.
    Each yearly sum is divided by the 'Count docs' value that ``ndocs`` holds
    for the same year.

    Parameters
    ----------
    t3 : str
        Pair label, e.g. ``'a - b'``.
    t3_inv : str
        The same pair written in the opposite order, e.g. ``'b - a'``.

    Returns
    -------
    years : numpy.ndarray
        Sorted unique years in which the pair occurs.
    np_list : list
        Normalised value for each entry of ``years`` (empty if no row matches).
    t3 : str
        The label that was passed in, returned unchanged.

    Raises
    ------
    IndexError
        If one of the pair's years has no row in ``ndocs``.
    """
    # Positions (not index labels) of the rows holding the pair in either order.
    idp1 = np.where((TPVal == t3) | (TPVal == t3_inv))[0]
    pair_rows = ppy.iloc[idp1]

    # Taken from the positional selection so it works for any DataFrame index.
    years = np.sort(np.unique(pair_rows['Year'].values))

    # Sum only numeric columns: text columns either vanish or get concatenated
    # by groupby().sum() depending on the pandas version, which would corrupt
    # the division below.
    # NOTE(review): the first numeric column other than 'Year' is taken as the
    # pair count -- confirm against the sheet layout.
    value_cols = [c for c in pair_rows.select_dtypes(include=[np.number]).columns
                  if c != 'Year']
    # groupby sorts its keys, so the rows line up with the sorted `years`.
    sum_alldocs = pair_rows.groupby('Year')[value_cols].sum()
    pair_totals = [row[0] for row in sum_alldocs.values]

    # Document count of each relevant year.
    doc_years = ndocs['YEAR'].values
    doc_counts = ndocs['Count docs'].values
    ids_list = [np.where(doc_years == y)[0][0] for y in years]
    cc = doc_counts[ids_list]

    # 0.0 + ... forces float division under Python 2 as well.
    np_list = [total/(0.0 + count) for total, count in zip(pair_totals, cc)]

    return years, np_list,t3 #returns the years and list of normalised pairs


Here I calculate all possible combinations of pairs, including those in reverse order


In [None]:
import itertools

# Distinct terms found in the 't1.Term' column of the pairs sheet.
terms = np.unique(ppy['t1.Term'])
nterms = len(terms)

# Every unordered pair of distinct terms; the count is read off the list
# itself instead of being recomputed with float arithmetic.
comb = list(itertools.combinations(terms,2))
ncomb = len(comb)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print('All pair combinations: %d' % ncomb)

# Pair labels in both orders, index-aligned with `comb`.
t3_list = ['%s - %s' % (first, second) for first, second in comb]
t3_list_inv = ['%s - %s' % (second, first) for first, second in comb]


All pair combinations: 21945


Here the code goes through the database finding each pair combination and running the function from above. To speed up the computation, it is done in parallel using `Pool` from `multiprocessing`. Later, a median value is computed and the results are plotted.

In [None]:
import multiprocessing as mp

year_arr = np.sort(np.unique(ppy['Year'].values)) # It'll contain all years found in all data
nallyears = len(year_arr)

nel = ncomb
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(nel)

nproc = 4  # number of parallel processes
i0 = 0
i1 = nel
# Number of pairs that each process deals with; the last process takes
# whatever remains. Floor division gives the same value as the former
# int(float/float) expression without the float round-trip.
npp = (i1 - i0 + 1) // nproc

def do_pairs(ip):
    """Compute the normalised yearly values for the slice of pairs owned by process ``ip``.

    Pairs are addressed by their index into ``t3_list`` / ``t3_list_inv``.
    Process ``ip`` handles indices ``npp*ip`` up to (but excluding)
    ``npp*(ip+1)``; the last process runs up to ``ncomb`` so that no pair is
    left out.

    Parameters
    ----------
    ip : int
        Process number, from 0 to ``nproc - 1``.

    Returns
    -------
    list
        ``[np_arr, idarr]``. Both are lists with one entry per year of
        ``year_arr``: ``np_arr[iy]`` holds the normalised values found in that
        year and ``idarr[iy]`` the matching pair indices, in the same order.
    """
    x0 = npp*ip
    # range() excludes its upper bound, so the last process must stop at ncomb
    # (not ncomb-1) to include the final pair; it also absorbs rounding leftovers.
    x1 = ncomb if ip == nproc-1 else npp*(ip+1)
    np_arr = [[] for _ in range(nallyears)]
    idarr  = [[] for _ in range(nallyears)]
    # Year -> slot in year_arr, built once instead of searching for every pair.
    year_pos = {yr: iy for iy, yr in enumerate(year_arr)}
    for x in range(x0,x1):
        year, nplist, _ = get_pair_evol(t3_list[x], t3_list_inv[x])
        for yr, value in zip(year, nplist):
            iy = year_pos.get(yr)
            if iy is not None:
                np_arr[iy].append(value)
                idarr[iy].append(x)

    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print('Process %d done.' % ip)
    return [np_arr, idarr]

# One task per process; each task handles its own slice of the pair list.
pool = mp.Pool(processes=nproc)
try:
    res = [pool.apply_async(do_pairs, args=(x,)) for x in range(nproc)]
    # Fetch each result once; get() blocks until that task has finished.
    results = [p.get() for p in res]
finally:
    # Release the worker processes instead of leaving them alive in the session.
    pool.close()
    pool.join()
np_arr_p = [r[0] for r in results]
id_arr_p  = [r[1] for r in results]




21945


The function `get_pair_id` below retrieves the years and the normalised pair values for a given pair index `idnum`.

In [None]:

def get_pair_id(idnum):
    """Collect the yearly evolution of a single pair from the per-process results.

    Scans ``id_arr_p`` (pair indices per process and year) for ``idnum`` and
    picks the value stored at the same position in ``np_arr_p``.

    Parameters
    ----------
    idnum : int
        Index of the pair, as stored in ``id_arr_p``.

    Returns
    -------
    yid : list
        Entries of ``year_arr`` for the years in which the pair was found.
    pid : list
        Normalised value of the pair for each entry of ``yid``.
    """
    yid = []
    pid = []
    for ip in range(nproc):
        # The per-year search already yields nothing for a process that never
        # saw the pair, so no flattened pre-check of all ids is needed.
        for iy in range(nallyears):
            id_iy = np.where(np.asarray(id_arr_p[ip][iy]) == idnum)[0]
            if len(id_iy) == 1:
                yid.append(year_arr[iy])
                pid.append(np_arr_p[ip][iy][id_iy[0]])
    return yid, pid
        
#        plt.plot(yid,pyid[ikk][1],'k-',linewidth=.01)


Finally, things are plotted

In [None]:
from tqdm import tnrange, tqdm_notebook
plt.figure(1,figsize=(6,6))

# Retrieving each individual pair evolution -- it's a bit slow...
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print('Retrieving and plotting every single pair evolution...')

# Plot up to 1000 random pairs
nplots = 1000
comb_perm = np.random.permutation(np.arange(ncomb))[0:nplots]

# Iterate over the sampled ids themselves: when fewer than nplots pairs exist,
# indexing comb_perm with range(nplots) would run past its end.
for idnum in tqdm_notebook(comb_perm):
    yid, pyid = get_pair_id(idnum)
    plt.plot(yid,pyid,'k-',linewidth=.1)

med_pairs = np.zeros(nallyears)
perc = np.zeros([nallyears,2]) # column 0: 84th percentile, column 1: 97.5th percentile
for iy in range(nallyears):
    # Values of every pair found in this year, gathered across all processes
    # (built once per year and reused for the three statistics).
    year_vals = [v for part in np_arr_p for v in part[iy]]
    if not year_vals:
        # No pair in this year: mark as NaN so the mask below drops it.
        med_pairs[iy] = np.nan
        perc[iy,:] = np.nan
        continue
    med_pairs[iy] = np.median(year_vals)
    perc[iy,0], perc[iy,1] = np.percentile(year_vals, [84, 97.5])

# Years with a usable (positive, non-NaN) median; needed for the log-scale plot.
nzero = np.where((med_pairs >0) & (~np.isnan(med_pairs)))[0]

#smooth curves
def smooth(y, box_pts):
    """Smooth ``y`` with a running mean over a window of ``box_pts`` points.

    The window is a flat kernel whose weights sum to one. The convolution uses
    mode='same', so the result has the length of the longer input and the
    samples near both ends are computed as if ``y`` were padded with zeros.
    """
    kernel = np.ones(box_pts)
    kernel /= box_pts
    return np.convolve(y, kernel, mode='same')

# Smoothed median over the years that have a usable median value.
s_median = smooth(med_pairs[nzero],5)
years_plot = np.asarray(year_arr)[nzero]


plt.plot(years_plot, s_median,'r-',linewidth=2,label='Median')
# perc[:,1] is the 97.5th percentile, i.e. the 2-sigma band (lighter, drawn first).
plt.fill_between(years_plot, s_median,smooth(perc[nzero,1],5),
                 facecolor='royalblue',alpha=0.3,label=r'$2-\sigma$')
# perc[:,0] is the 84th percentile, i.e. the 1-sigma band (darker, drawn on top).
plt.fill_between(years_plot, s_median,smooth(perc[nzero,0],5),
                 facecolor='royalblue',alpha=0.7,label=r'$1-\sigma$')


plt.xlabel('Years',fontsize=15)
plt.ylabel('Norm.pairs',fontsize=15)
plt.yscale('log')
plt.legend(loc='lower left')