In [1]:
# query 5

# journal connectivity
#   https://coda.io/d/WoS-db_d2zsZYQIcsP/Test-queries_suE54#_lu4Eu


In [2]:
import pymysql
import time
import json

In [3]:
# MySQL connection settings, passed as keyword arguments to the connect call:
# the server is reached through a unix socket file, and queries run against
# the database named in `db_name`.
db_name = 'test_wos_cut_full'
client_config = dict(
    unix_socket='/project2/jevans/study_dbs/mysql/.sql.sock',
    database=db_name,
)

In [4]:
# Query parameters
lookback_window = 5  # years before `year` from which cited papers are counted
year = 1978  # publication year of the citing papers
batch_sizes = [5, 10, 25, 125]  # number of top journals included in each run

In [5]:
# Read the saved output of query 1 and keep, for each 1978 row, its second
# field (the journal name used to build the batches below).
with open('query1_results_125.json','r') as f:
    loaded_data = json.load(f)

# The JSON is fully parsed at this point, so the rest can run after the file
# has been closed.
results = loaded_data['results_1978']
top_journals = [row[1] for row in results]

In [6]:
# todo aggregate
#
# For each batch size, take that many journals from the front of
# `top_journals` and count citations between them: citing papers published in
# `year`, cited papers published in the `lookback_window` years before it.
# Each run is timed (connection setup included) and its rows are written to
# query5_results_<batch_size>.json.
#
# Journal names and years are handed to the driver as query parameters rather
# than being formatted into the SQL text, so a name containing a quote can
# neither break the statement nor inject SQL.
for batch_size in batch_sizes:

    print('batch size {}'.format(batch_size))

    # run query
    start_time = time.time()

    journals = list(top_journals[:batch_size])

    # One %s placeholder per journal. Only the placeholders are formatted into
    # the SQL text; the values themselves are escaped by the driver.
    placeholders = ', '.join(['%s'] * len(journals))
    sql = '''SELECT P_citing.source as citing_journal,
                    P_cited.source as cited_journal,
                    count(*) as count
            FROM publications as P_citing
                JOIN the_references R1
                    ON P_citing.wos_id=R1.wos_id
                JOIN publications P_cited
                    ON P_cited.wos_id=R1.uid
            WHERE (P_citing.pubyear=%s) AND
                (P_citing.source IN ({0})) AND
                (P_cited.pubyear >= %s) AND
                (P_cited.pubyear < %s) AND
                (P_cited.source IN ({0}))
            GROUP BY citing_journal, cited_journal
            ORDER BY count DESC
            '''.format(placeholders)
    # Argument order must follow the order of the %s placeholders above.
    sql_args = [year] + journals + [year - lookback_window, year] + journals

    db = pymysql.connect(**client_config)
    try:
        cursor = db.cursor()
        try:
            # mogrify renders the statement with its arguments escaped, i.e.
            # the text that execute() sends to the server.
            print(cursor.mogrify(sql, sql_args), '\n','----------------------------','\n')
            cursor.execute(sql, sql_args)
            results = list(cursor)
        finally:
            # Release the cursor and connection even if the query fails.
            cursor.close()
    finally:
        db.close()

    end_time = time.time()
    elapsed = end_time - start_time
    print("elapsed (s): {}".format(elapsed))
    print('results: \n----------------------------------')
    print(results)

    # write results
    save_obj = {'elapsed': elapsed,
               'results': results}
    with open('query5_results_{}.json'.format(batch_size),'w') as f:
        json.dump(save_obj, f)
    

batch size 5
SELECT P_citing.source as citing_journal,
                    P_cited.source as cited_journal,
                    count(*) as count
            FROM publications as P_citing
                JOIN the_references R1
                    ON P_citing.wos_id=R1.wos_id
                JOIN publications P_cited
                    ON P_cited.wos_id=R1.uid
            WHERE (P_citing.pubyear=1978) AND
                (P_citing.source IN ("FEDERATION PROCEEDINGS", "BULLETIN OF THE AMERICAN PHYSICAL SOCIETY", "CLINICAL RESEARCH", "ABSTRACTS OF PAPERS OF THE AMERICAN CHEMICAL SOCIETY", "BRITISH MEDICAL JOURNAL")) AND
                (P_cited.pubyear >= 1973) AND
                (P_cited.pubyear < 1978) AND
                (P_cited.source IN ("FEDERATION PROCEEDINGS", "BULLETIN OF THE AMERICAN PHYSICAL SOCIETY", "CLINICAL RESEARCH", "ABSTRACTS OF PAPERS OF THE AMERICAN CHEMICAL SOCIETY", "BRITISH MEDICAL JOURNAL"))
            GROUP BY citing_journal, cited_journal
            ORDER BY

elapsed (s): 338.9625196456909
results: 
----------------------------------
[('NATURE', 'NATURE', 1904), ('LANCET', 'LANCET', 1377), ('BRITISH MEDICAL JOURNAL', 'BRITISH MEDICAL JOURNAL', 838), ('DOKLADY AKADEMII NAUK SSSR', 'DOKLADY AKADEMII NAUK SSSR', 798), ('BRITISH MEDICAL JOURNAL', 'LANCET', 592), ('LANCET', 'BRITISH MEDICAL JOURNAL', 365), ('LANCET', 'NATURE', 166), ('BULLETIN OF THE AMERICAN PHYSICAL SOCIETY', 'BULLETIN OF THE AMERICAN PHYSICAL SOCIETY', 121), ('FEDERATION PROCEEDINGS', 'FEDERATION PROCEEDINGS', 108), ('FEDERATION PROCEEDINGS', 'NATURE', 106), ('NATURE', 'FEDERATION PROCEEDINGS', 62), ('NATURE', 'LANCET', 56), ('DOKLADY AKADEMII NAUK SSSR', 'NATURE', 55), ('BRITISH MEDICAL JOURNAL', 'NATURE', 35), ('FEDERATION PROCEEDINGS', 'LANCET', 29), ('LANCET', 'FEDERATION PROCEEDINGS', 16), ('NATURE', 'TRANSACTIONS-AMERICAN GEOPHYSICAL UNION', 16), ('TRANSACTIONS-AMERICAN GEOPHYSICAL UNION', 'NATURE', 13), ('LANCET', 'CLINICAL RESEARCH', 12), ('FEDERATION PROCEEDINGS', 'C

In [7]:
# TODO: good suggestion from Sasha: organize journals by ISSN