# Topic Pairs by Year

In this notebook we try to find topics that co-occur rarely in order to pursue the hypothesis of combinatorial creativity. If two significant topics co-occur in the corpus below some threshold of frequency, they are regarded as proxies for the existence of a novel idea in the documents in which they co-occur. 

We begin by filtering out very high alpha topics (not more than 0.04) and by considering only topic-document pairs in which the topic has more than a 10% concentration in the document. This removes very general topics and focuses on documents where there topics have some meaningful presence. Both thresholds may be adjusted of course.

## To Do
* Adjust numbers by total for each year (since each year has a different sample size)

In [2]:
import sqlite3
import pandas as pd

# This is a local copy of the webapp database that combines source, corpus, and model data
dbfile = '../battelle.db' 
db = sqlite3.connect(dbfile)

# These are the thresholds described above
topic_alpha_max = 0.04
topic_weight_min = 0.1

# We pull in the relevant tables
# D = corpus.doc
# T = model.topic
# DT = model.doctopic
D = pd.read_sql_query('SELECT doc_label,doc_id FROM doc',db)
T = pd.read_sql_query('SELECT topic_id,topic_alpha,topic_words FROM topic WHERE topic_alpha <= {}'.format(topic_alpha_max),db)
DT = pd.read_sql_query('SELECT doc_id,topic_id,topic_weight FROM doctopic_long WHERE topic_weight > {} ORDER BY topic_id'.format(topic_weight_min),db)

# We do the join here because it's faster
# TP = topicpair 
DT = pd.merge(DT,D,on='doc_id') # Necessary to get the years (labels)
TP = pd.merge(DT,DT,on='doc_id') # Quick way to create a graph!

# We remove redundant pairs and selfies using this one weird trick
TP = TP[TP['topic_id_x'] < TP['topic_id_y']]

# We produce a combined weight value from the two topics
# We may want to calculate this value differently
TP['combo_weight'] = TP['topic_weight_x'] * TP['topic_weight_y']
TP = TP.drop(['topic_weight_x','topic_weight_y'],1)

# We fix the TP table some, removing a redundant column and renaming the remaining one
# We also need to convert the data type from object to numeric
TP = TP.drop('doc_label_x',axis=1)
TP.rename(columns={'doc_label_y':'year'}, inplace=True)
TP['year'] = pd.to_numeric(TP['year'])

# For readability, we resort the table
TP.sort_values(['topic_id_x','topic_id_y','year'],inplace=True)

# We grab the number of documents per year to use at some point to adjust the document counts
# per topic pair per year
DC = D['doc_id'].groupby([D['doc_label']]).count() # Total docs per year
years = TP['year'].unique() # For convenience

In [4]:
# We collapse the table by combining docs per year, keeping their count
GT = TP[['doc_id']].groupby([TP['topic_id_x'],TP['topic_id_y'],TP['year']]).count()
# Unstacking gives a nice matrix
GTU = GT.unstack().fillna(0)
GTU.to_csv('candidate-topicpairs-unstacked.csv')

In [6]:
GTU.info()
GTU

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 14605 entries, (0, 1) to (198, 199)
Data columns (total 11 columns):
(doc_id, 2005)    14605 non-null float64
(doc_id, 2006)    14605 non-null float64
(doc_id, 2007)    14605 non-null float64
(doc_id, 2008)    14605 non-null float64
(doc_id, 2009)    14605 non-null float64
(doc_id, 2010)    14605 non-null float64
(doc_id, 2011)    14605 non-null float64
(doc_id, 2012)    14605 non-null float64
(doc_id, 2013)    14605 non-null float64
(doc_id, 2014)    14605 non-null float64
(doc_id, 2015)    14605 non-null float64
dtypes: float64(11)
memory usage: 1.3+ MB


Unnamed: 0_level_0,Unnamed: 1_level_0,doc_id,doc_id,doc_id,doc_id,doc_id,doc_id,doc_id,doc_id,doc_id,doc_id,doc_id
Unnamed: 0_level_1,year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
topic_id_x,topic_id_y,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
0,1,0,0,0,0,3,4,2,2,2,7,1
0,3,0,1,0,0,1,0,1,1,1,0,0
0,4,3,4,8,2,2,10,4,15,16,13,7
0,5,0,0,0,0,0,1,0,0,0,1,0
0,6,0,0,0,2,0,0,0,0,0,1,0
0,8,1,0,0,1,2,2,2,0,1,0,2
0,10,0,0,0,1,1,2,0,2,4,0,2
0,11,0,0,0,0,0,0,0,0,0,0,1
0,13,0,0,0,0,0,0,0,0,1,1,3
0,14,0,0,0,0,0,0,0,0,1,0,0


In [7]:
# We again collapse the table by combining docs per year, but summing their weights this time
GT2 = TP['combo_weight'].groupby([TP['topic_id_x'],TP['topic_id_y'],TP['year']]).sum()
GTU2 = GT2.unstack().fillna(0)
GTU2.to_csv('candidate-topicpairs-by-weight-unstacked.csv')
GTU2.info()
GTU2

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 14605 entries, (0, 1) to (198, 199)
Data columns (total 11 columns):
2005    14605 non-null float64
2006    14605 non-null float64
2007    14605 non-null float64
2008    14605 non-null float64
2009    14605 non-null float64
2010    14605 non-null float64
2011    14605 non-null float64
2012    14605 non-null float64
2013    14605 non-null float64
2014    14605 non-null float64
2015    14605 non-null float64
dtypes: float64(11)
memory usage: 1.3+ MB


Unnamed: 0_level_0,year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
topic_id_x,topic_id_y,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,0.000000,0.000000,0.000000,0.000000,0.129235,0.097714,0.070516,0.103723,0.089464,0.224787,0.026221
0,3,0.000000,0.043936,0.000000,0.000000,0.089452,0.000000,0.044078,0.043983,0.012325,0.000000,0.000000
0,4,0.112151,0.324628,0.421332,0.098070,0.095970,0.345351,0.285777,0.843228,0.853737,0.739095,0.344699
0,5,0.000000,0.000000,0.000000,0.000000,0.000000,0.058950,0.000000,0.000000,0.000000,0.019626,0.000000
0,6,0.000000,0.000000,0.000000,0.031049,0.000000,0.000000,0.000000,0.000000,0.000000,0.025222,0.000000
0,8,0.017599,0.000000,0.000000,0.023448,0.076575,0.045025,0.042117,0.000000,0.033726,0.000000,0.038463
0,10,0.000000,0.000000,0.000000,0.031336,0.034274,0.104165,0.000000,0.127990,0.175043,0.000000,0.059523
0,11,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.046752
0,13,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.054257,0.046191,0.064554
0,14,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.030609,0.000000,0.000000


In [8]:
# We again collapse the table by combining docs without regard for year, keeping their count
GT3 = TP['doc_id'].groupby([TP['topic_id_x'],TP['topic_id_y']]).count()
# This creates a co-occurence matrix of topics
GTU3 = GT3.unstack().fillna(0)
GTU3.to_csv('candidate-topicpairs-matrix.csv')
GTU3.info()
GTU3

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199 entries, 0 to 198
Columns: 198 entries, 1 to 199
dtypes: float64(198)
memory usage: 309.4 KB


topic_id_y,1,3,4,5,6,7,8,9,10,11,...,190,191,192,193,194,195,196,197,198,199
topic_id_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,21,5,84,2,3,0,11,0,12,1,...,11,4,6,4,6,0,28,79,6,0
1,0,1,31,4,2,0,13,1,13,2,...,1,3,13,0,0,0,6,26,0,0
2,0,1,3,3,0,8,45,8,0,2,...,1,4,0,1,51,6,7,6,18,0
3,0,0,1,0,6,0,40,1,5,0,...,4,18,7,2,42,0,41,55,12,5
4,0,0,0,28,5,1,26,2,1,111,...,1,12,10,3,3,3,27,60,12,2
5,0,0,0,0,0,0,5,0,0,15,...,1,2,0,2,0,0,3,8,3,0
6,0,0,0,0,0,1,4,0,3,0,...,3,0,3,0,0,0,3,30,1,0
7,0,0,0,0,0,0,98,3,0,9,...,2,2,1,0,0,2,3,1,22,13
8,0,0,0,0,0,0,0,10,60,36,...,14,9,28,12,63,8,41,36,37,14
9,0,0,0,0,0,0,0,0,0,2,...,0,1,1,1,3,0,3,3,6,0


In [6]:
# Finally, we create a topic co-occurrence matrix for each year
m = {}
for year in TP['year'].unique():
    TPY = TP[TP.year == year]
    GTY = TPY['doc_id'].groupby([TPY['topic_id_x'],TPY['topic_id_y']]).count()
    GTY.to_csv('candidate-topicpairs-{}.csv'.format(year))
    GTYU = GTY.unstack().fillna(0)
    GTYU.to_csv('candidate-topicpairs-matrix-{}.csv'.format(year))
    m[year] = GTYU

In [9]:
for year in m:
    print(year)
    print(m[year])

2005
topic_id_y  4    5    8    9    10   11   12   13   14   15  ...   190  191  \
topic_id_x                                                   ...              
0             3  NaN    1  NaN  NaN  NaN  NaN  NaN  NaN  NaN ...   NaN  NaN   
1             2  NaN    1  NaN  NaN  NaN  NaN  NaN  NaN  NaN ...   NaN  NaN   
2             1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN ...   NaN  NaN   
3           NaN  NaN    1  NaN  NaN  NaN  NaN    2  NaN    1 ...   NaN  NaN   
4           NaN    1  NaN  NaN  NaN   11  NaN  NaN    1  NaN ...   NaN  NaN   
5           NaN  NaN  NaN  NaN  NaN    1  NaN    1  NaN  NaN ...   NaN  NaN   
6           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN ...   NaN  NaN   
7           NaN  NaN    1  NaN  NaN    1  NaN  NaN  NaN  NaN ...   NaN  NaN   
8           NaN  NaN  NaN    1    1  NaN    2    4  NaN  NaN ...     1  NaN   
9           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN ...   NaN  NaN   
10          NaN  NaN  NaN  NaN  NaN  NaN  NaN  