In [None]:
import pyodbc
import pandas as pd
import time
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
import json
import os

%matplotlib inline

# Load the run parameters from the shared params.json file.
with open('../params.json') as f:
    params = json.load(f)

table_prefix = params['table_prefix']
diseases = params['diseases']
case_limit = params['case_limit']
control_limit = params['control_limit']
min_enrollment = params['enrollment']
user = params['user']

# When True, the full sequence table is read in chunks instead of one call.
chunk = True

# Credentials are read from a file in the user's home directory, one value per
# line, consumed below in this order: server, domain, database, uid, pwd.
creds_file = "/home/" + user + "/creds.txt"
# Use a context manager so the credentials file handle is closed right away.
with open(creds_file) as creds_handle:
    creds = lines = [line.rstrip('\n') for line in creds_handle]

# Fail with a clear message instead of an IndexError while building the
# connection string below.
if len(creds) < 5:
    raise ValueError("Expected at least 5 lines in " + creds_file +
                     " (server, domain, database, uid, pwd); found " +
                     str(len(creds)))

connection_string = ("Driver={ODBC Driver 17 for SQL Server};" +
                     "server=" + creds[0] + ";" +
                     "domain=" + creds[1] + ";" +
                     "database=" + creds[2] + ";" +
                     "uid=" + creds[3] + ";" +
                     "pwd=" + creds[4] + ";" +
                     "ssl=require;")

cn = pyodbc.connect(connection_string, autocommit=True)
cursor = cn.cursor()

# Output directory for everything this notebook writes.
# Previous location, kept for reference:
# directory = '../../data/' + str(table_prefix) + '_' + str(case_limit)
# NOTE(review): 'diseaes_replaced' looks like a typo for 'diseases_replaced' and
# there is no separator before table_prefix. It is left unchanged because other
# code may already read from this exact path - confirm before renaming.
directory = '../../data/diseaes_replaced' + str(table_prefix) + '_' + str(case_limit)

if not os.path.exists(directory):
    os.makedirs(directory)

In [None]:
# Fetch the distinct member ids present in the sequence table, in ascending order.
sql = "".join([
    "SELECT DISTINCT(MemberNum) FROM ",
    user,
    ".dbo.",
    table_prefix,
    "_sequences ORDER BY MemberNum",
])
members = pd.read_sql_query(sql, cn)
print(members.shape)

# Persist the member list next to the other outputs of this run.
# NOTE(review): the file is gzip-compressed although its name ends in '.csv';
# readers must pass compression='gzip' explicitly - confirm this is intended.
members_path = directory + '/members.csv'
members.to_csv(members_path, compression='gzip')

# Next step: load every sequence row to build the dictionary / popular terms etc.

In [None]:
from gensim.models import Word2Vec
import pickle as pkl

start_time = time.time()

# Pull every row of the sequence table, ordered by member, then Date, then TYPE.
sql = "SELECT MemberNum, fromIndex, Type, Object FROM " + user + ".dbo." + table_prefix + "_sequences ORDER BY MemberNum, Date, TYPE"

if chunk:
    # Read the result set in 10M-row chunks, printing the elapsed time as each
    # chunk arrives.
    chunk_list = []
    i = 0
    for c in pd.read_sql_query(sql , cn, chunksize=10000000):
        print(i, time.time() - start_time)
        chunk_list.append(c)
        i += 1
    print(time.time() - start_time)

    # Concatenate only on this path: chunk_list does not exist when chunk is
    # False, so an unconditional concat would raise NameError (or reuse a stale
    # list left in the kernel). ignore_index=True gives the combined frame one
    # unique 0..n-1 index regardless of how the individual chunks were indexed.
    concat_start = time.time()
    seq = pd.concat(chunk_list, ignore_index=True)
    print(time.time() - concat_start)
else:
    # Single-shot read; the result is already one DataFrame.
    seq = pd.read_sql(sql, cn)
    print(time.time() - start_time)

print(seq.shape)

In [None]:
# Keep only the rows whose fromIndex is a finite number (drops NaN and +/-inf),
# then report the remaining shape.
finite_from_index = np.isfinite(seq['fromIndex'])
seq = seq[finite_from_index]
print(seq.shape)

In [None]:
# Normalise column dtypes, then build one token per row as "<Type>_<Object>".
seq['Type'] = seq['Type'].astype('int64')
seq['fromIndex'] = seq['fromIndex'].astype(int)
seq['Object'] = seq['Object'].astype(str)
# Object was converted to str on the line above, so no second cast is needed.
seq['Type_obj'] = seq['Type'].astype(str) + '_' + seq['Object']

# Token frequencies, most frequent first, as a two-column frame.
val_counts = seq['Type_obj'].value_counts()
val_counts = val_counts.sort_values(ascending=False).reset_index()
val_counts.columns = ['Type_obj', 'count']

print(val_counts.head())

# common dict: the 500 most frequent Object values among Type == 0 rows, mapped
# to their frequency rank (0 = most frequent). Ranks are taken from the ordered
# value_counts index so they do not depend on dict iteration order.
top_objects = seq[seq['Type'] == 0]['Object'].value_counts().head(n=500)
common_dict = {obj: rank for rank, obj in enumerate(top_objects.index)}
with open(directory + '/common_dict.pkl', 'wb') as common_dict_file:
    pkl.dump(common_dict, common_dict_file)

# full dict: frequency rank -> whitespace-stripped token, plus the inverse map.
# val_counts has a fresh 0..n-1 index after reset_index(), so enumerate() yields
# the same keys that iterating over the rows would.
dictionary = dict(enumerate(val_counts['Type_obj'].str.strip()))
reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
with open(directory + '/full_dict.pkl', 'wb') as full_dict_file:
    pkl.dump(dictionary, full_dict_file)

# Histogram of the frame's numeric columns for everything after the first 1000
# rows (i.e. beyond the 1000 most frequent tokens).
val_counts[1000:].hist()

In [None]:
print(len(dictionary), len(reversed_dictionary))
seq['Type_obj'] = seq['Type_obj'].str.strip()
seq['Type_obj'] = seq['Type_obj'].replace(to_replace=['0_332.0', '0_332.1'], value='dis')
%time sentences = seq.groupby('MemberNum')['Type_obj'].apply(list).tolist()

# sentences
# size - dimensionality of feature vectors
# window - max distance between current and predicted word
# min count - ignore rare words (should be higher than 5)
print(len(sentences))
model = Word2Vec(sentences, size=100, window=5, min_count=(len(sentences)*0.1),
                 workers=16)
model.save(directory + '/w2v_' + user + '_' + table_prefix + '.model')
print(len(model.wv.vocab))

val_counts = val_counts[:len(model.wv.vocab)]

In [None]:
# Members are processed in shards of this many members; each shard is written
# to its own gzip-compressed CSV.
shard_size = 10
fixed_vocab = val_counts.shape[0]

# Map each retained token to an integer code, assigned in sorted token order.
input_codes = sorted(list(val_counts['Type_obj']))
input_token_index = dict(
    [(char, i) for i, char in enumerate(input_codes)])
print('Vocab size', fixed_vocab, 'input_heads', len(input_codes))
with open(directory + '/input_token_index.pkl', 'wb') as token_index_file:
    pkl.dump(input_token_index, token_index_file)

# Ceiling integer division: `/` yields a float on Python 3 (which range()
# rejects), and plain floor division would silently skip the last, partially
# filled shard of members.
n_members = members.shape[0]
n_shards = (n_members + shard_size - 1) // shard_size

# @TODO add common diseases and remove too close to end
# @TODO replace diseases with 'disease'
for i in range(n_shards):
    print(str(i) + '/' + str(n_shards))
    include_members = members[i*shard_size:(i+1)*shard_size]
    query_param = (include_members['MemberNum'].astype(str).str.cat(sep=", "))

    sql = ("SELECT MemberNum, fromIndex, Type, Object FROM " + user +
           ".dbo." + table_prefix + "_sequences " +
           "WHERE MemberNum in (" + query_param + ")" +
           "ORDER BY MemberNum, cast(fromIndex as int) ASC, Type, Object")
    #print(sql)
    seq = pd.read_sql(sql, cn)
    seq['Type_obj'] = seq['Type'].astype(str) + '_' + seq['Object'].astype(str)
    seq['Type_obj'] = seq['Type_obj'].replace(to_replace=['0_332.0', '0_332.1'], value='dis')
    print(seq.shape)

    # reduce size - only words in vocab
    # .copy() makes the filtered frame independent of `seq`, so adding the
    # InputCode column below is an unambiguous write (no SettingWithCopyWarning).
    # NOTE(review): tokens are not whitespace-stripped here, unlike the tokens
    # the Word2Vec model was trained on - confirm that Object values never
    # carry padding.
    seqFiltered = seq[seq['Type_obj'].isin(model.wv.vocab)].copy()
    #print(seqFiltered.shape)

    seqFiltered['InputCode'] = seqFiltered['Type_obj'].map(input_token_index)
    seqFiltered = seqFiltered[seqFiltered['InputCode'].notnull()]
    #print(seqFiltered.shape)

    # Fresh 0..n-1 index so index labels and row positions coincide below.
    seqFiltered = seqFiltered.reset_index(drop=True)
    seq_del = seqFiltered.copy(deep=True)

    for memberNum in seqFiltered['MemberNum'].unique():
        # Sorting does not affect the index min/max values used below; only the
        # set of index labels belonging to this member matters.
        member = seqFiltered.loc[(seqFiltered['MemberNum'] == memberNum)]\
                            .sort_values(by=['fromIndex', 'Type', 'Object'], ascending=True)

        firstRow = member.index.min()
        # Position 24 rows before this member's first fromIndex == 0 row.
        # If the member has no such row, min() of the empty index is NaN and
        # both comparisons below are False, so nothing is dropped.
        index_minus25 = member[(member['fromIndex'] == 0)].index.min() - 24
        lastRow = member.index.max()

        # Trim the member's rows that come before that position.
        if firstRow < index_minus25:
            seq_del.drop(seqFiltered.index[firstRow: index_minus25], inplace=True)
        # NOTE(review): the slice end is exclusive, so the member's last row is
        # kept, and `index_minus25 <= 0` compares against a shard-wide position
        # rather than this member's firstRow - confirm both are intended.
        if (index_minus25 <= 0) or (firstRow > lastRow-25):
            seq_del.drop(seqFiltered.index[firstRow: lastRow], inplace=True)

    print(seq_del.shape)
    seq_del.to_csv(directory + '/seq_' + str(i*shard_size) + "_" +
                   str((i+1) * shard_size) + '.csv.gz', compression='gzip')