In [1]:
import sys
sys.path.append("../src")

In [229]:
from pathlib import Path
from collections import defaultdict

from gensim.corpora import MmCorpus, Dictionary
from gensim.models.ldamodel import LdaModel

from sqlalchemy import Column, Integer, String, ForeignKey, Boolean, Float
from sqlalchemy import create_engine, func
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from sqlalchemy.orm import sessionmaker

from tqdm.notebook import tqdm
import pandas as pd

In [3]:
import db

  from pandas import Panel


In [4]:
import imp; imp.reload(db)

<module 'db' from '../src/db.py'>

In [5]:
prepared_data_dir = Path("../scratch/prepared_data/")

In [24]:
# corpus = MmCorpus('../scratch/prepared_data/corpus.mm')
# dictionary = Dictionary.load("../scratch/prepared_data/dct.mm")
corpus, dictionary, corp2paper, dct2kwd = db.read_from_prepared_data(prepared_data_dir)
mdir = Path('../scratch/tmodels/')
n_topics = 20
tmodel_loc = mdir / f'topic_model{n_topics}'

lda_model = LdaModel.load(str(tmodel_loc))
tm = db.TopicModeler(dictionary, corpus)
embedding = tm.get_inference(lda_model)
coh_per_topic = tm.get_coherence_model(lda_model).get_coherence_per_topic()

100%|██████████| 6869/6869 [00:02<00:00, 3289.82it/s]


In [25]:
db_loc = '../scratch/test.sqlite'

In [26]:
engine = create_engine(f"sqlite:///{db_loc}")
Session = sessionmaker(bind=engine)
session = Session()

In [27]:
session.query(db.Paper.id).count()

7245

In [28]:
corp2paper_dct = {c: p for c, p in corp2paper}
dct2kwd_dct = {d: k for d, k in dct2kwd}

In [141]:
corpus_inds, paper_inds = zip(*corp2paper)

In [95]:
paper2bibcode = session.query(db.Paper.id, db.Paper.bibcode).all()

In [96]:
# q = (
#     session.query(db.Paper.bibcode)
#     .filter(db.Paper.id.in_([p for _, p in corp2paper]))
# )
# bibs = [q[0] for q in q.all()]

In [382]:
q = (
    session.query(
        db.PaperKeywords.paper_id, db.PaperKeywords.keyword_id, db.PaperKeywords.count,
    )
#     .filter(db.PaperKeywords.keyword_id.in_(kwds_batch))
    .join(db.Paper)  # for the journal blacklist removal
)

In [384]:
q = q.filter(~db.Paper.bibcode.contains('arXiv'))

In [383]:
q.count()

497098

In [100]:
# bibs = [[c, paper2bibcode_dct[p]] for c, p in corp2paper]

In [140]:
bibs = [paper2bibcode_dct[p] for c, p in corp2paper]

In [294]:
embedding_df = pd.DataFrame(embedding)
embedding_df.index = paper_inds

# Get Topic Years

In [313]:
from tsfresh import extract_features
import numpy as np

In [367]:
thresh = 0.3

In [368]:
year_min = 1997
year_max = 2010

In [369]:
all_time_series = []
for topic in tqdm(embedding_df.columns):
    ids_in_topic = embedding_df.index[embedding_df.loc[:, topic] > thresh].tolist() # function to include options with argmax as well?
    years_query = (
        session.query(db.Paper.year, func.count(db.Paper.year)) # Don't really need to add journal exlusions because already limiting to the IDs which are in topic model
        .filter(db.Paper.id.in_(ids_in_topic))
        .filter(db.Paper.year <= year_max)
        .filter(db.Paper.year >= year_min)
        .group_by(db.Paper.year)
    )
    year_counts = years_query.all()
    ycd = defaultdict(int, {y:c for y, c in year_counts})
    topic_time_series = [{"topic": topic, "year": y, "count": ycd[y]} for y in range(year_min, year_max)]
    all_time_series = all_time_series + topic_time_series

ts_df_long = pd.DataFrame(all_time_series)
ts_df = ts_df_long.pivot(index='topic', columns="year", values="count")

features_df = extract_features(ts_df_long, column_id='topic', column_sort='year')
features_df['coherence_score'] = coh_per_topic

def cagr(x_row):
    x = x_row.values
    nz_inds = np.nonzero(x)[0]
    if len(nz_inds) == 0:  # If all are 0, set CAGR to 0
        return 0
    else:
        first_nonzero_index = nz_inds[0]
        x = x[first_nonzero_index:]  # Not valid if starts with 0. Becomes inf
        x = x[~np.isnan(x)]  # For normalized time series, NaNs before any occurrence of kwd
    if len(x) < 2:  # If no periods, set CAGR to 0
        return 0
    else:
        ys = x_row.index
        period = max(ys) - min(ys)
        return (x[-1] / x[0]) ** (1 / period) - 1

features_df['CAGR'] = ts_df.apply(cagr, axis=1)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))





Feature Extraction:   0%|          | 0/20 [00:00<?, ?it/s][A
Feature Extraction: 100%|██████████| 20/20 [00:00<00:00, 108.62it/s]A


In [373]:
features_df

variable,count__abs_energy,count__absolute_sum_of_changes,"count__agg_autocorrelation__f_agg_""mean""__maxlag_40","count__agg_autocorrelation__f_agg_""median""__maxlag_40","count__agg_autocorrelation__f_agg_""var""__maxlag_40","count__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","count__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","count__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","count__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""","count__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""intercept""",...,count__time_reversal_asymmetry_statistic__lag_1,count__time_reversal_asymmetry_statistic__lag_2,count__time_reversal_asymmetry_statistic__lag_3,count__value_count__value_-1,count__value_count__value_0,count__value_count__value_1,count__variance,count__variance_larger_than_standard_deviation,coherence_score,CAGR
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,9330.0,117.0,-0.080422,-0.136259,0.214763,38.0,-1.0,-1.0,0.0,,...,2385.363636,345.666667,415.714286,0.0,0.0,0.0,49.668639,1.0,-20.123412,0.044569
1,2170.0,72.0,-0.067732,-0.196309,0.150296,19.0,-1.0,-7.0,0.0,,...,96.909091,134.666667,1043.714286,0.0,0.0,0.0,22.923077,1.0,-19.9429,-0.035626
2,2567.0,50.0,-0.221817,-0.128119,0.427153,19.0,0.0,0.0,0.0,,...,379.272727,318.111111,728.714286,0.0,0.0,0.0,12.08284,1.0,-20.991539,0.064247
3,1600.0,31.0,-0.113779,0.030398,0.253568,15.0,-1.0,-1.0,0.0,,...,82.090909,331.555556,569.857143,0.0,0.0,0.0,7.100592,1.0,-20.588431,0.006195
4,1625.0,41.0,-0.079387,-0.125017,0.055949,18.0,-1.0,-6.0,0.0,,...,47.090909,58.555556,529.428571,0.0,0.0,0.0,7.360947,1.0,-20.069713,0.026893
5,13253.0,90.0,-0.405083,-0.133773,0.419302,43.0,1.0,6.0,0.0,,...,5570.909091,17066.222222,20797.857143,0.0,0.0,0.0,86.863905,1.0,-19.665369,0.073161
6,3790.0,50.0,-0.423663,-0.109488,0.505825,22.0,0.0,0.0,0.0,,...,941.454545,2160.444444,2586.571429,0.0,0.0,0.0,20.556213,1.0,-20.626991,0.067911
7,3028.0,41.0,-0.297084,-0.262194,0.248575,21.0,-1.0,-1.0,0.0,,...,372.272727,1101.222222,1816.857143,0.0,0.0,0.0,10.224852,1.0,-19.49986,0.036551
8,1515.0,29.0,-0.373034,-0.053269,0.699808,13.0,1.0,2.0,0.0,,...,312.727273,326.222222,398.428571,0.0,0.0,0.0,5.47929,1.0,-20.764688,0.05378
9,22022.0,132.0,-0.07106,-0.149695,0.298633,54.0,-1.0,-4.0,0.0,,...,3705.454545,-920.0,5467.285714,0.0,0.0,0.0,81.668639,1.0,-20.954849,0.027753
