In [0]:
pip install nltk

In [0]:
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
import pandas as pd
import pickle

ICD9CODES = pickle.load(open("/dbfs/FileStore/data_2/ICD9CODES.p", 'rb'))
ICD9CODES_TOP10 = pickle.load(open("/dbfs/FileStore/data_2/ICD9CODES_TOP10.p", 'rb'))
ICD9CODES_TOP50 = pickle.load(open("/dbfs/FileStore/data_2/ICD9CODES_TOP50.p", 'rb'))
ICD9CAT_TOP10 = pickle.load(open("/dbfs/FileStore/data_2/ICD9CAT_TOP10.p", 'rb'))
ICD9CAT_TOP50 = pickle.load(open("/dbfs/FileStore/data_2/ICD9CAT_TOP50.p", 'rb'))

In [0]:
from pyspark.ml.feature import StopWordsRemover
STOPWORDS_v0 = StopWordsRemover.loadDefaultStopWords("english") + ICD9CODES
STOPWORDS_v0 = [str(i) for i in STOPWORDS_v0]

In [0]:
from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer, StopWordsRemover

def create_TFIDF_v0(trainData, applyData, inputCol="text", outputCol="features", minDocFreq=3, numFeatures=20):    
    tokenizer = RegexTokenizer(pattern="[.:\s]+", inputCol=inputCol, outputCol="z_words")
    wordsData1 = tokenizer.transform(trainData)
    wordsData2 = tokenizer.transform(applyData)
    
    remover = StopWordsRemover(inputCol="z_words", outputCol="z_filtered", stopWords=STOPWORDS_v0)
    wordsDataFiltered1 = remover.transform(wordsData1)
    wordsDataFiltered2 = remover.transform(wordsData2)
    
    hashingTF = HashingTF(inputCol="z_filtered", outputCol="z_rawFeatures", numFeatures=numFeatures)
    featurizedData1 = hashingTF.transform(wordsDataFiltered1)
    featurizedData2 = hashingTF.transform(wordsDataFiltered2)
    # alternatively, CountVectorizer can also be used to get term frequency vectors

    idf = IDF(inputCol="z_rawFeatures", outputCol=outputCol, minDocFreq=minDocFreq)
    idfModel = idf.fit(featurizedData1)
    
    rescaledData = idfModel.transform(featurizedData2)
    return rescaledData.drop("z_words", "z_filtered", "z_rawFeatures", inputCol)

In [0]:
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

STOPWORDS_v1 = list(ENGLISH_STOP_WORDS) + ICD9CODES

# print "TFIDF v1 stop words"
# print STOPWORDS_v1

In [0]:
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from pyspark.mllib.util import Vectors
from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.functions import UserDefinedFunction

def preprocessor_v1(text):
    text = re.sub('\[\*\*[^\]]*\*\*\]', '', text)
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('[\W]+', ' ', text.lower()) 
    return text

def create_TFIDF_v1(df_train, df_apply, inputCol="text", outputCol="features",
                    minDocFreq=3, maxDocFreq=1.0, numFeatures=20):
    df_train['z_cleaned'] = df_train[inputCol].apply(preprocessor_v1)
    df_apply['z_cleaned'] = df_apply[inputCol].apply(preprocessor_v1)

    # Now we create the sparse matrix of tfidf values
    tfidf = TfidfVectorizer(input='content',ngram_range=(1, 1),
                            stop_words=STOPWORDS_v1, 
                            min_df=minDocFreq,
                            max_df=maxDocFreq,
                            max_features=numFeatures)
    # I select to remove stopwords and minimun doc frequency =10 to delete very unusual words
    # that only show up in less than 10 notes (out of 59k notes available) 

    tfidf.fit([c for c in df_train['z_cleaned']])
    dtm = tfidf.transform([c for c in df_apply['z_cleaned']]).tocsr()
    dtm.sort_indices()
    df_apply[outputCol] = list(dtm)
   
    del df_train['z_cleaned']
    del df_apply['z_cleaned']
    del df_apply[inputCol]
    
    return df_apply

<hd5> WORD2VEC

In [0]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')



STOPWORDS_WORD2VEC = stopwords + ICD9CODES

In [0]:
import numpy as np
import re

# Run this cell if you are using Glove type format
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = [float(val) for val in splitLine[1:]]
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

def preprocessor_word2vec(text):
    text = re.sub('\[\*\*[^\]]*\*\*\]', '', text)
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('[\W]+', ' ', text.lower()) 
    text = re.sub(" \d+", " ", text)
    #text = gensim.parsing.preprocessing.remove_stopwords(text)
    return text

def makeFeatureVec(words, model, num_features, index2word_set):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    #index2word_set = set(model.wv.index2word) #activate if using gensim

    # activate if uploaded text version
    #index2word_set=set(keys_updated)
    
    
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec

def getAvgFeatureVecs(reviews, model, index2word_set, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    # 
    # Loop through the reviews
    for review in reviews:
       #
       # Print a status message every 10000th review
       if counter%10000 == 0:
           print("Review %d of %d" % (counter, len(reviews)))
       # 
       # Call the function (defined above) that makes average feature vectors
       reviewFeatureVecs[counter] = makeFeatureVec(review, model,num_features,index2word_set)
       #
       # Increment the counter
       counter = counter + 1
    return reviewFeatureVecs

def create_WORD2VEC(df, inputCol="text", outputCol="features",
                    word2vecmodel="./data/model_word2vec.txt"):
    df['z_cleaned'] = df[inputCol].apply(preprocessor_word2vec)
    
    # Create tokens
    token_review=[]
    for i in range(df['z_cleaned'].shape[0]):
        review = df['z_cleaned'][i]
        token_review.append([i for i in review.split()])
    model_w2v = word2vecmodel
#     model_w2v = loadGloveModel(word2vecmodel)
    numFeatures = len(model_w2v.values()[0])
    print("numFeatures: ", numFeatures)
    
    keys_updated = [word for word in model_w2v.keys() if word not in STOPWORDS_WORD2VEC]
    index2word_set=set(keys_updated)

    final_w2v = getAvgFeatureVecs(token_review, model_w2v, index2word_set, num_features=numFeatures)
    df[outputCol] = list(final_w2v)
    
    del df['z_cleaned']
    del df[inputCol]
    
    return df

DOC2VEC
Note: This code only converts the data using a pre-trained doc2vec model

In [0]:
def create_DOC2VEC(df,doc2vecmodel):
    import pandas as pd
    import numpy as np
    df1=pd.read_csv(doc2vecmodel, index_col='id') 
    df1['features']=list(df1.values)
    df1=df1['features'].apply(np.asarray)
        
    result = pd.merge(pd.DataFrame({'id':df1.index, 'features':df1.values}), df, on='id')
    del result['text']
    del df1
    
    return result

In [0]:
# df = pd.DataFrame()

HELPER FUNCTION

In [0]:
import random, pickle
import pandas as pd
from pyspark.mllib.util import Vectors
from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType
import numpy as np

def separate(seed, N):    
    idx=list(range(N))
    random.seed(seed)
    random.shuffle(idx)
    idx_train= idx[0:int(N*0.50)]
    idx_val= idx[int(N*0.50):int(N*0.75)]
    idx_test= idx[int(N*0.75):N]

    return idx_train, idx_val, idx_test

def separate_2(df, hadmid_pickle):
    f = open(hadmid_pickle, 'rb')
    hadmid_train = pickle.load(f)
    hadmid_val = pickle.load(f)
    hadmid_test = pickle.load(f)
    f.close()
    
    df2 = df.copy()
    df2['_idx'] = df2.index
    df2.set_index('id', inplace=True)
    
    idx_train = list(df2.loc[hadmid_train]['_idx'])
    idx_val = list(df2.loc[hadmid_val]['_idx'])
    idx_test = list(df2.loc[hadmid_test]['_idx'])
    
    return idx_train, idx_val, idx_test

def output_to_spark(df, path, col='features', dense=False):
    if type(df) != pd.DataFrame:    
        print('hello')
        udf = UserDefinedFunction(lambda x: Vectors.stringify(x), StringType())
        s_df2 = df.withColumn(col, udf(df[col]))
        # df2.write.csv(path, header=True)
        s_df2.registerTempTable(path)
#         df3 = df2.toPandas()
#         df3.to_csv(path, index=False)
    else:
        N = df[col].iloc[0].shape[-1]
        if dense:
            def to_string(x):
                return "({0},[{1}],[{2}])".format(N, 
                                                  ",".join([str(i) for i in range(N)]),
                                                  ",".join([str(i) for i in list(x)]))
        else:            
            def to_string(x):
                return "({0},[{1}],[{2}])".format(N, 
                                      ",".join([str(i) for i in list(x.indices)]),
                                      ",".join([str(i) for i in list(x.data)]))
        try :
          df2 = df.copy()
          df2[col] = df[col].apply(to_string)
          s_df2 = spark.createDataFrame(df2)
          s_df2.registerTempTable(path)
        except :
          d = 'hi'
        
        
    return 'hi'
def batch_output_spark(df, otype, fname, labels, outputCol='features', 
                     hadmid_pickle='/dbfs/FileStore/data/TRAIN-VAL-TEST-HADMID.p'):
    labels2 = ['id'] + labels + [outputCol]
        
    if otype.lower() == "tfidfv0":
        f = open(hadmid_pickle, 'rb')
        train_id_df = spark.createDataFrame(zip(pickle.load(f)), ['id2'])
        val_id_df = spark.createDataFrame(zip(pickle.load(f)), ['id2'])
        test_id_df = spark.createDataFrame(zip(pickle.load(f)), ['id2'])
        f.close()
        
        df.cache()

        df1 = df.join(train_id_df, train_id_df.id2 == df.id, 'inner').select(labels2)
        output_to_spark(df1, "{0}_train.csv".format(fname))
        df1 = df.join(val_id_df, val_id_df.id2 == df.id, 'inner').select(labels2)
        output_to_spark(df1, "{0}_val.csv".format(fname))
        df1 = df.join(test_id_df, test_id_df.id2 == df.id, 'inner').select(labels2)
        output_to_spark(df1, "{0}_test.csv".format(fname))
        
    elif otype.lower() == "tfidfv1":
        idx_train, idx_val, idx_test = separate_2(df, hadmid_pickle)
        
        output_to_spark(df.loc[idx_train][labels2], "{0}_train".format(fname), dense=False)
        output_to_spark(df.loc[idx_val][labels2], "{0}_val".format(fname), dense=False)
        output_to_spark(df.loc[idx_test][labels2], "{0}_test".format(fname), dense=False)
    elif otype.lower() == "word2vecv0":
        idx_train, idx_val, idx_test = separate_2(df, hadmid_pickle)
        output_to_spark(df.loc[idx_train][labels2], "{0}_train".format(fname), dense=True)
        output_to_spark(df.loc[idx_val][labels2], "{0}_val".format(fname), dense=True)
        output_to_spark(df.loc[idx_test][labels2], "{0}_test".format(fname), dense=True)
    elif otype.lower() == "doc2vecv0":
        # doc2vec has the same format as word2vec
        idx_train, idx_val, idx_test = separate_2(df, hadmid_pickle)
        output_to_spark(df.loc[idx_train][labels2], "{0}_train".format(fname), dense=True)
        output_to_spark(df.loc[idx_val][labels2], "{0}_val".format(fname), dense=True)
        output_to_spark(df.loc[idx_test][labels2], "{0}_test".format(fname), dense=True)
        
def read_csv(path):
    df = spark.read.csv(path, header=True, inferSchema=True)
    
    udf = UserDefinedFunction(lambda x: Vectors.parse(x), VectorUDT())
    new_df = df.withColumn('features', udf(df.features))
    
    return new_df

Actual Data Preprocessing
prepare separation indices (train, validation, test)

In [0]:

df1_sp = spark.sql('''
select distinct 
`row_id` as id,
`0389`,
`2449`,
`25000`,
`2720`,
`2724`,
`2760`,
`2761`,
`2762`,
`2767`,
`2851`,
`2859`,
`2875`,
`3051`,
`311`,
`32723`,
`4019`,
`40390`,
`40391`,
`41071`,
`412`,
`41401`,
`4168`,
`4240`,
`4241`,
`42731`,
`42789`,
`4280`,
`486`,
`49390`,
`496`,
`5070`,
`5119`,
`5180`,
`51881`,
`53081`,
`5845`,
`5849`,
`5859`,
`5990`,
`7742`,
`78552`,
`99592`,
`9971`,
`V053`,
`V1582`,
`V290`,
`V4581`,
`V4582`,
`V5861`,
`V5867`,
`text`
--`clean_text`
from tempdb.hadm_top_50_icd_step_5
''')

df1_pd = df1_sp.toPandas()


In [0]:
# df1_sp_train.head()

In [0]:
# idx_train, idx_val, idx_test = separate_2(df1_pd, '/dbfs/FileStore/data/TRAIN_VAL_TEST_HADMID.p')
# df1_pd_train = df1_pd.loc[idx_train]

# f = open('/dbfs/FileStore/data/TRAIN_VAL_TEST_HADMID.p', 'rb')
# hadmid_train = pickle.load(f)
# hadmid_val = pickle.load(f)
# hadmid_test = pickle.load(f)
# f.close()
    
# hadmid_train_df = spark.createDataFrame(zip(hadmid_train), ['id2'])
# df1_sp_train = df1_sp.join(hadmid_train_df, hadmid_train_df.id2 == df1_sp.id, 'inner')

In [0]:
from time import time
t0 = time()

df2 = create_TFIDF_v1(df1_pd.copy(), df1_pd.copy(), minDocFreq=10, 
                      maxDocFreq=0.8, numFeatures=40000)

In [0]:
# Auxiliar functions
def equivalent_type(f):
    if f == 'datetime64[ns]': return TimestampType()
    elif f == 'int64': return LongType()
    elif f == 'int32': return IntegerType()
    elif f == 'float64': return FloatType()
    else: return StringType()

def define_structure(string, format_type):
    try: typo = equivalent_type(format_type)
    except: typo = StringType()
    return StructField(string, typo)

# Given pandas dataframe, it will return a spark's dataframe.
def pandas_to_spark(pandas_df):
    columns = list(pandas_df.columns)
    types = list(pandas_df.dtypes)
    struct_list = []
    for column, typo in zip(columns, types): 
      struct_list.append(define_structure(column, typo))
    p_schema = StructType(struct_list)
    return sqlContext.createDataFrame(pandas_df, p_schema)

In [0]:
df2_new = df2["features"].astype('string')
spf = pd.DataFrame(df2_new).reset_index()
spf.columns = ["id","features"]
s_df2_new = pandas_to_spark(spf)
s_df2_new.registerTempTable("tfidf_output")

In [0]:
%sql
create table tempdb.tfidf_output as (
select * 
from tfidf_output
)


num_affected_rows,num_inserted_rows


In [0]:
%sql
select * 
from tempdb.tfidf_output 
limit 10

id,features
0,"(0, 6326)	0.19197569127830585  (0, 6457)	0.12709624505626282  (0, 6891)	0.1343398770493549  (0, 7214)	0.17485338375166284  (0, 8101)	0.18784796008227986  (0, 8377)	0.14451317971411926  (0, 9269)	0.18513973756449514  (0, 11964)	0.18952783849733856  (0, 13369)	0.425198023609649  (0, 14422)	0.32953273303373176  (0, 14574)	0.14043445709416455  (0, 19734)	0.14030628244306934  (0, 19796)	0.15726548161603893  (0, 23913)	0.16048722624961378  (0, 26751)	0.20617991474114705  (0, 27742)	0.10144601315375221  (0, 28684)	0.2924615481119045  (0, 29172)	0.12376964972817037  (0, 31804)	0.1763568947895499  (0, 33216)	0.1639204751102144  (0, 33510)	0.1663071397755431  (0, 33568)	0.19217516686301844  (0, 34811)	0.20959046214573204  (0, 37257)	0.2092252238208384  (0, 38088)	0.1697131464116581"
1,"(0, 472)	0.03494981129755392  (0, 473)	0.03461530879783161  (0, 476)	0.09827853714763313  (0, 803)	0.0664901586198983  (0, 880)	0.037535011015165765  (0, 881)	0.04956713243635864  (0, 972)	0.06478821463102233  (0, 1075)	0.05245825317996584  (0, 1383)	0.12040765005013673  (0, 1736)	0.04387451320013108  (0, 1737)	0.06723326107008398  (0, 2222)	0.08135655430424316  (0, 2585)	0.04061937149750106  (0, 2683)	0.04549996522556103  (0, 3029)	0.0354799461429996  (0, 3084)	0.07047782446096308  (0, 3316)	0.1103787479722794  (0, 3789)	0.07262523897878886  (0, 4062)	0.07379378196008364  (0, 4157)	0.05644091634813815  (0, 4526)	0.07794759778393513  (0, 5247)	0.04184694349337137  (0, 5387)	0.05806931926989073  (0, 5659)	0.047991758941657095  (0, 5885)	0.09440377053761605  :	:  (0, 33985)	0.038309253280478364  (0, 34084)	0.2445476435838154  (0, 34159)	0.0954408698669372  (0, 34173)	0.0444224404804769  (0, 34267)	0.061915619071196394  (0, 34770)	0.05209262716294869  (0, 34773)	0.041800619750725874  (0, 34856)	0.16853130061483648  (0, 34989)	0.03585843113489304  (0, 35785)	0.058384368881350686  (0, 36728)	0.05980508046248022  (0, 36880)	0.03607022686608911  (0, 38224)	0.052071718435575544  (0, 38327)	0.03641212922059722  (0, 38391)	0.06870401477617673  (0, 38626)	0.07952319996365825  (0, 38679)	0.0879120490021288  (0, 38786)	0.07178671312830068  (0, 38921)	0.05456231837461163  (0, 39189)	0.06439675377523792  (0, 39210)	0.04110444342513519  (0, 39388)	0.10065612732192088  (0, 39534)	0.05438323076134064  (0, 39627)	0.090586408744646  (0, 39873)	0.045201420954449484"
2,"(0, 5888)	0.07120439311381299  (0, 6262)	0.17680296248242316  (0, 6615)	0.13883521031154508  (0, 6751)	0.09033065828764754  (0, 8058)	0.12970373065673788  (0, 8377)	0.09630785828428405  (0, 8454)	0.18097074698292992  (0, 8814)	0.11425235349286318  (0, 9637)	0.0819478121514124  (0, 9761)	0.08923327523230111  (0, 11393)	0.06402340757353328  (0, 11403)	0.22644240947576236  (0, 11849)	0.11339911415155743  (0, 13312)	0.14341098918247616  (0, 13942)	0.15930055993874473  (0, 14324)	0.282737131675853  (0, 16023)	0.1457064421945257  (0, 16204)	0.10910859348666811  (0, 17058)	0.0750114719352491  (0, 17182)	0.09032515594178649  (0, 17669)	0.113164682212006  (0, 18577)	0.20310909238204403  (0, 18632)	0.16638795325332414  (0, 20984)	0.0940014091329036  (0, 21132)	0.12897629375333539  :	:  (0, 25135)	0.09686774721837636  (0, 25554)	0.22414019309105174  (0, 25894)	0.15416580160803595  (0, 26128)	0.13903592323587285  (0, 26132)	0.06580744456000476  (0, 28065)	0.12116556843759611  (0, 28551)	0.10782678509613644  (0, 28590)	0.1197931992100308  (0, 28643)	0.05541150851949356  (0, 29803)	0.09811050739024965  (0, 30611)	0.10897279926523737  (0, 31305)	0.14329306458618926  (0, 32392)	0.06153859921228103  (0, 32526)	0.10906682530119317  (0, 32796)	0.16827886983400303  (0, 33606)	0.18162155341648878  (0, 33995)	0.1425776372673248  (0, 34194)	0.06548493891093127  (0, 34309)	0.15240194971192816  (0, 34991)	0.11625815235931988  (0, 36880)	0.07400401848463592  (0, 36924)	0.09479093943113229  (0, 36962)	0.10671703171291232  (0, 38937)	0.2048709670512429  (0, 38990)	0.09466573024136604"
3,"(0, 700)	0.09824024371892649  (0, 3599)	0.05299463721961074  (0, 4560)	0.10655654539030039  (0, 5393)	0.20057949061779506  (0, 6253)	0.06357004462915676  (0, 6837)	0.06169240297506521  (0, 7135)	0.07554787675316693  (0, 7705)	0.08705767478430149  (0, 7893)	0.10712248021764727  (0, 9412)	0.10616962535202372  (0, 9768)	0.07400002681917757  (0, 10067)	0.1152206645014994  (0, 10931)	0.070578674193818  (0, 10969)	0.09702862953741807  (0, 11486)	0.08256224686867204  (0, 11652)	0.20007352684051852  (0, 12671)	0.196246747958425  (0, 13368)	0.06267576508667803  (0, 13559)	0.25066845719567377  (0, 13567)	0.09144676463235274  (0, 14687)	0.0875077077188485  (0, 14876)	0.11858455234764537  (0, 15125)	0.10762111543303822  (0, 15128)	0.09904050357827529  (0, 15881)	0.09122655508197483  :	:  (0, 26715)	0.11349033839731999  (0, 26875)	0.07150589662247644  (0, 30190)	0.06728488920505346  (0, 30255)	0.06823495235705067  (0, 30433)	0.13439223462543598  (0, 30689)	0.07967130750913062  (0, 30764)	0.12520615326188464  (0, 30787)	0.09550436570310655  (0, 30822)	0.06401104458160548  (0, 31222)	0.11800050825118225  (0, 31720)	0.09220482453527166  (0, 31977)	0.057878590221928886  (0, 34089)	0.054160876844157435  (0, 34613)	0.06710311504980533  (0, 34989)	0.05240657124314658  (0, 35398)	0.21228144345048452  (0, 36817)	0.09540094846589812  (0, 37641)	0.10381261731190777  (0, 38424)	0.11171950400578336  (0, 39388)	0.07355372698420225  (0, 39603)	0.17531591115195583  (0, 39621)	0.15894037294339222  (0, 39653)	0.07751363906344866  (0, 39688)	0.1601331757733363  (0, 39873)	0.06606121385036669"
4,"(0, 1093)	0.11878239320514376  (0, 2652)	0.17771298981908956  (0, 3087)	0.18932375950324654  (0, 3950)	0.22139844523880448  (0, 5182)	0.09905311147703856  (0, 5223)	0.10082961841579079  (0, 6432)	0.15810802165353544  (0, 6577)	0.08039287358108041  (0, 6629)	0.12463271156221752  (0, 6649)	0.10969988613080185  (0, 6650)	0.09705536809656662  (0, 7088)	0.093872866556303  (0, 8067)	0.1092863456370094  (0, 9461)	0.07995236352136487  (0, 10462)	0.05001699190504169  (0, 10490)	0.13043110392264207  (0, 12374)	0.21243073266945672  (0, 12378)	0.0720045067080651  (0, 15518)	0.11376919370584482  (0, 16502)	0.09203658947394562  (0, 17145)	0.12028112620562129  (0, 17166)	0.12358223765773103  (0, 17173)	0.10121928113449435  (0, 17179)	0.07741803867676018  (0, 17224)	0.1559350517942967  :	:  (0, 25505)	0.07517077839233147  (0, 26171)	0.07366879967496763  (0, 26347)	0.12117744805421403  (0, 26436)	0.12365163702853432  (0, 26599)	0.1510408579758129  (0, 27758)	0.16020926482455952  (0, 30437)	0.13600683898267216  (0, 30543)	0.11063120316374865  (0, 33409)	0.08360128048740928  (0, 33642)	0.11504054039998092  (0, 34008)	0.10766456074617037  (0, 34267)	0.09909565389749013  (0, 34474)	0.16829851727279815  (0, 34492)	0.08574581751247812  (0, 34694)	0.04905592447924205  (0, 34989)	0.05739124851460008  (0, 34991)	0.09069250041218739  (0, 35530)	0.054553569171443  (0, 35723)	0.09450133715419415  (0, 35785)	0.18688780952143438  (0, 36283)	0.0919081776845933  (0, 37037)	0.06637534549145843  (0, 38990)	0.07384834185563119  (0, 39123)	0.10864798794174257  (0, 39295)	0.07505950508459094"
5,"(0, 6320)	0.2932662687465924  (0, 13369)	0.20318650239039135  (0, 20070)	0.7422267796998802  (0, 27738)	0.36500176878482593  (0, 32256)	0.36840977314628837  (0, 34770)	0.22990758185063304"
6,"(0, 880)	0.0250117446614994  (0, 1216)	0.029416069893615768  (0, 3311)	0.06006130073392351  (0, 3434)	0.03411343064790726  (0, 3498)	0.06222226621442785  (0, 4902)	0.0567586478016558  (0, 5095)	0.037890163775768634  (0, 5832)	0.09442190648348249  (0, 6285)	0.07456713741162836  (0, 6445)	0.09112934765560766  (0, 6470)	0.05173161890971044  (0, 6751)	0.05867676749328675  (0, 7135)	0.06889143572169282  (0, 7243)	0.140426178090629  (0, 7371)	0.03631131937463963  (0, 7662)	0.033569489084519566  (0, 7957)	0.047826965606037296  (0, 7971)	0.03278538678835658  (0, 8592)	0.08557695494071353  (0, 8693)	0.09454300634190847  (0, 8828)	0.047835284286582035  (0, 9177)	0.03420803581697662  (0, 9713)	0.04226325530399114  (0, 10637)	0.07312260718309153  (0, 10786)	0.09312756929709519  :	:  (0, 35133)	0.04102983201956402  (0, 35227)	0.19582778416399912  (0, 35241)	0.07904268645658621  (0, 35288)	0.04757883875465517  (0, 35402)	0.04354869432727393  (0, 35422)	0.0525204425722156  (0, 35483)	0.04563281983270817  (0, 36215)	0.03611424878884038  (0, 36497)	0.0441775091754702  (0, 36500)	0.049931558547314216  (0, 36787)	0.034208426204981435  (0, 36804)	0.050792125752255855  (0, 37914)	0.05413149084573124  (0, 38043)	0.05648438559536917  (0, 38144)	0.07582687717451159  (0, 38559)	0.15133815214799928  (0, 38708)	0.04896479035595231  (0, 38758)	0.05966389541511964  (0, 38938)	0.040502366277350016  (0, 39008)	0.03413199475514686  (0, 39261)	0.05301592521020197  (0, 39332)	0.08201525172505124  (0, 39547)	0.030854079208820974  (0, 39575)	0.07344254698536767  (0, 39857)	0.022017499023648527"
7,"(0, 473)	0.09674986906147916  (0, 1217)	0.14203615996271263  (0, 1460)	0.1010865433113653  (0, 1462)	0.14552339518125243  (0, 2053)	0.034027823920668256  (0, 3599)	0.03378301317528523  (0, 6253)	0.12157386678734855  (0, 6268)	0.051818196160082444  (0, 7243)	0.19633673212759786  (0, 7260)	0.1485301911519406  (0, 7314)	0.07345652787263263  (0, 7656)	0.06791397566179158  (0, 7855)	0.04980183534978358  (0, 7932)	0.1352058433207806  (0, 8126)	0.04434165473262266  (0, 8133)	0.09408555966048648  (0, 8549)	0.07169259662224006  (0, 9427)	0.061775194890671646  (0, 9768)	0.04717352570306168  (0, 9887)	0.06909612621912221  (0, 9889)	0.06304478481446893  (0, 10462)	0.029115489894866398  (0, 10805)	0.07484126882809092  (0, 11443)	0.10718458877567184  (0, 11444)	0.09567006201629102  :	:  (0, 32949)	0.12867116083985103  (0, 33016)	0.060738595871131645  (0, 33048)	0.08898186636288716  (0, 33314)	0.05948262618670136  (0, 33475)	0.08225666365003322  (0, 33606)	0.08247491521616453  (0, 33681)	0.05586972986086108  (0, 33714)	0.07019430142749362  (0, 33722)	0.0954926474298673  (0, 33724)	0.15870523442872497  (0, 33730)	0.060144602540348494  (0, 33940)	0.04859417338613391  (0, 33985)	0.0713829683541925  (0, 35184)	0.2768036838292696  (0, 36803)	0.057732781165952936  (0, 36816)	0.12602644489321504  (0, 37157)	0.17149279350807528  (0, 37221)	0.1001687658845324  (0, 37808)	0.143618898250477  (0, 37882)	0.09395595241668134  (0, 38209)	0.09795202568692113  (0, 38587)	0.09396253682601415  (0, 38723)	0.176982379937925  (0, 39603)	0.05588018002406435  (0, 39857)	0.03078374607002285"
8,"(0, 10930)	0.19122282638821098  (0, 10933)	0.178801681460338  (0, 11924)	0.22298934596165174  (0, 14069)	0.3349217325908912  (0, 14222)	0.28002979498904473  (0, 24700)	0.44505157898595676  (0, 25932)	0.19374656768745777  (0, 29619)	0.20731471064568302  (0, 32346)	0.19149351344182305  (0, 33831)	0.2306348271641671  (0, 33923)	0.18791366558101635  (0, 34383)	0.3177126836183075  (0, 34674)	0.23064425802042685  (0, 37142)	0.24064503572739393  (0, 39196)	0.27448199760145486"
9,"(0, 5959)	0.20345043928101572  (0, 8317)	0.15387455263345762  (0, 8606)	0.21979929531798928  (0, 10930)	0.12411149248773658  (0, 11924)	0.14472927243518166  (0, 13682)	0.2177195459731895  (0, 14069)	0.21737800284389472  (0, 17532)	0.290415846613915  (0, 21022)	0.25912847527794747  (0, 21462)	0.25895797250578373  (0, 22731)	0.09245640026524249  (0, 22734)	0.28367165882672685  (0, 29389)	0.2509153743940719  (0, 29619)	0.13455578833806658  (0, 30822)	0.13821932995801445  (0, 32346)	0.12428717953752791  (0, 33923)	0.12196371079022597  (0, 34055)	0.2969221753242905  (0, 34674)	0.14969762573495027  (0, 37142)	0.15618854248737776  (0, 37277)	0.23588117036710382  (0, 39196)	0.3563002495755612"


In [0]:
TFIDFV1_train = spark.sql('''select distinct c.id,     a.`4019`,    a.`2724`,    a.`25000`,    a.`4280`,    a.`41401`,    a.`53081`,    a.`51881`,    a.`42731`,    a.`5849`,    a.`5990`,    c.features from tempdb.hadm_top_50_icd_step_5 a inner join tempdb.hadm_dataset_lookup b  on a.row_id = b.row_id  and b.group_type = 'train' inner join tempdb.tfidf_output c  on c.id = b.row_id'''
)
TFIDFV1_test = spark.sql('''select distinct c.id,     a.`4019`,    a.`2724`,    a.`25000`,    a.`4280`,    a.`41401`,    a.`53081`,    a.`51881`,    a.`42731`,    a.`5849`,    a.`5990`,    c.features from tempdb.hadm_top_50_icd_step_5 a inner join tempdb.hadm_dataset_lookup b  on a.row_id = b.row_id  and b.group_type = 'test' inner join tempdb.tfidf_output c  on c.id = b.row_id''')

TFIDFV1_valid = spark.sql('''select distinct c.id,     a.`4019`,    a.`2724`,    a.`25000`,    a.`4280`,    a.`41401`,    a.`53081`,    a.`51881`,    a.`42731`,    a.`5849`,    a.`5990`,    c.features from tempdb.hadm_top_50_icd_step_5 a inner join tempdb.hadm_dataset_lookup b  on a.row_id = b.row_id  and b.group_type = 'valid' inner join tempdb.tfidf_output c  on c.id = b.row_id''')


In [0]:
TFIDFV1_valid.repartition(1).write.format('com.databricks.spark.csv').save("/dbfs/FileStore/output/DATA_TFIDFV1_HADM_TOP10_valid.csv",header = 'true')
TFIDFV1_train.repartition(1).write.format('com.databricks.spark.csv').save("/dbfs/FileStore/output/DATA_TFIDFV1_HADM_TOP10_train.csv",header = 'true')
TFIDFV1_test.repartition(1).write.format('com.databricks.spark.csv').save("/dbfs/FileStore/output/DATA_TFIDFV1_HADM_TOP10_test.csv",header = 'true')

In [0]:
%sql
select distinct c.id,     a.`4019`,    a.`2724`,    a.`25000`,    a.`4280`,    a.`41401`,    a.`53081`,    a.`51881`,    a.`42731`,    a.`5849`,    a.`5990`,    c.features from tempdb.hadm_top_50_icd_step_5 a inner join tempdb.hadm_dataset_lookup b  on a.row_id = b.row_id  and b.group_type = 'train' inner join tempdb.tfidf_output c  on c.id = b.row_id

In [0]:
'''select distinct c.id,     a.`4019`,    a.`2724`,    a.`25000`,    a.`4280`,    a.`41401`,    a.`53081`,    a.`51881`,    a.`42731`,    a.`5849`,    a.`5990`,    c.featuresfrom tempdb.hadm_top_50_icd_step_5 a inner join tempdb.hadm_dataset_lookup b  on a.row_id = b.row_id  and b.group_type = 'train' inner join tempdb.tfidf_output c  on c.id = b.row_id'''


id,4019,2724,25000,4280,41401,53081,51881,42731,5849,5990,features
26,0,0,0,0,0,0,0,0,0,0,"(0, 2220)	0.08021759686129024  (0, 5324)	0.2137304892109997  (0, 5832)	0.1253308067402864  (0, 6285)	0.06598440154526375  (0, 6470)	0.06866590363825427  (0, 7578)	0.15583481691907408  (0, 7971)	0.08703528934137462  (0, 8126)	0.16838502868493765  (0, 9175)	0.07870371221774941  (0, 9761)	0.07693835723508903  (0, 10137)	0.20232927552519023  (0, 10156)	0.09949076221465279  (0, 10580)	0.08303983089264531  (0, 10930)	0.06957066777226481  (0, 10933)	0.06505160818377273  (0, 11007)	0.22562690986102502  (0, 11441)	0.06534841006572388  (0, 11733)	0.12781881676138349  (0, 11924)	0.08112795944746642  (0, 12087)	0.05789843250959277  (0, 12585)	0.10155745557926378  (0, 12744)	0.1177344217372055  (0, 13537)	0.21374495156667092  (0, 14065)	0.06165671346681768  (0, 14973)	0.1105926161958799  :	:  (0, 23992)	0.05318834559804123  (0, 26132)	0.1134804626436751  (0, 26335)	0.06531265951823466  (0, 26614)	0.11258967627108073  (0, 26751)	0.11847226711085582  (0, 27278)	0.1012718946664369  (0, 28761)	0.22864204668247134  (0, 29094)	0.1520736456887156  (0, 29172)	0.07111881397963607  (0, 29619)	0.15085043068397208  (0, 30234)	0.07004368471622162  (0, 30262)	0.13048023845028295  (0, 30835)	0.2366576370976026  (0, 30956)	0.12040151337355914  (0, 31745)	0.05907264397185294  (0, 31806)	0.12226135991342416  (0, 32392)	0.10611912916152973  (0, 33709)	0.10968343036506044  (0, 33831)	0.08390953758196933  (0, 34026)	0.2119209743559759  (0, 34694)	0.05422000845905553  (0, 35858)	0.11346202961828943  (0, 37655)	0.10229340238638236  (0, 37914)	0.14370274167121866  (0, 39857)	0.11689960594134205"
29,0,0,0,0,0,0,0,0,0,0,"(0, 0)	0.023022960325209986  (0, 472)	0.017093370276187594  (0, 881)	0.024242458451350937  (0, 4902)	0.02082943410443407  (0, 5567)	0.02094055732816615  (0, 6394)	0.02464771095129089  (0, 7224)	0.03417552981572028  (0, 7550)	0.08521429040317441  (0, 7551)	0.18200784634041547  (0, 7661)	0.02809008784046255  (0, 7671)	0.1396840943275125  (0, 7971)	0.024063330616432857  (0, 8026)	0.06667031177064223  (0, 8271)	0.08060431815991756  (0, 8315)	0.05787672893357912  (0, 8325)	0.11811163058279188  (0, 9184)	0.22591634658246415  (0, 9649)	0.01619565239086362  (0, 9664)	0.022390601226739113  (0, 10404)	0.021705330052181365  (0, 10869)	0.043870607698867685  (0, 11265)	0.09816530654834861  (0, 11795)	0.02512098560609127  (0, 11876)	0.02220562533220624  (0, 11963)	0.05487409630629977  :	:  (0, 33263)	0.024677958027223487  (0, 33294)	0.162095240280951  (0, 33420)	0.03349710008409584  (0, 33831)	0.023199129456413156  (0, 33842)	0.04229733368411202  (0, 34584)	0.0447830398104694  (0, 34806)	0.014902382188244499  (0, 35130)	0.037511171544181855  (0, 35406)	0.034696831447957065  (0, 35614)	0.02447471931607582  (0, 36077)	0.07853298759041806  (0, 36211)	0.03234337173904062  (0, 36223)	0.10788198322608833  (0, 36366)	0.08725372465690535  (0, 36543)	0.060844567243647646  (0, 36752)	0.017612468470004517  (0, 37232)	0.05139438319029361  (0, 37336)	0.22464234206590178  (0, 37341)	0.07873408538480475  (0, 37471)	0.13095755384201077  (0, 38490)	0.406082398787214  (0, 38527)	0.04863988333665341  (0, 38635)	0.0859661309874065  (0, 38636)	0.08752314552450043  (0, 38707)	0.14885640958729104"
65,1,0,0,0,0,0,0,0,0,0,"(0, 0)	0.19149232674170846  (0, 50)	0.026597887181597942  (0, 91)	0.024098574028368396  (0, 130)	0.12689393487217282  (0, 163)	0.09978121041302428  (0, 202)	0.025270573041668656  (0, 236)	0.20467773511484025  (0, 276)	0.026281569233221783  (0, 321)	0.02555491557434213  (0, 375)	0.02636283757280437  (0, 472)	0.017771657075201012  (0, 473)	0.03520313127128711  (0, 635)	0.03288803874240687  (0, 669)	0.05008229249645992  (0, 731)	0.04168360968141573  (0, 803)	0.033809633128155235  (0, 836)	0.03561363876975265  (0, 880)	0.07634482926345354  (0, 1075)	0.05334907695731529  (0, 1118)	0.036422896322532046  (0, 1173)	0.034793093966922053  (0, 1179)	0.03448382195412404  (0, 1267)	0.0723599133572228  (0, 1324)	0.03980190809454781  (0, 1357)	0.08488012389000518  :	:  (0, 37926)	0.03188788233093987  (0, 38153)	0.03519021002157919  (0, 38213)	0.031189584628117415  (0, 38320)	0.035471598481857984  (0, 38327)	0.0370304645353922  (0, 38391)	0.03493535858882336  (0, 38498)	0.03362189413826489  (0, 38503)	0.027584077225944234  (0, 38513)	0.024921475390276872  (0, 38625)	0.029988007018253993  (0, 38676)	0.028512223726895387  (0, 38679)	0.022351233520872343  (0, 38689)	0.028167050092709964  (0, 38691)	0.028290592875435165  (0, 38756)	0.029524113202900448  (0, 38944)	0.02377447410384877  (0, 39063)	0.05577860446574361  (0, 39110)	0.03561573562515112  (0, 39193)	0.05498934105233619  (0, 39199)	0.07612752234800088  (0, 39210)	0.06270369244050507  (0, 39239)	0.022816122370934333  (0, 39326)	0.03282450335147134  (0, 39857)	0.016801329000812836  (0, 39978)	0.06081403813638656"
191,1,0,0,1,0,0,1,1,0,0,"(0, 473)	0.05226436765444418  (0, 2054)	0.08855313807074441  (0, 3030)	0.0991473986306122  (0, 6024)	0.10410407359354917  (0, 6462)	0.09743899294669411  (0, 6577)	0.07584040688398722  (0, 7014)	0.1411994244946904  (0, 8279)	0.08640492667506738  (0, 8933)	0.08059390133466789  (0, 9412)	0.05484200414794018  (0, 9889)	0.1021705510571266  (0, 9904)	0.16135781610089342  (0, 11862)	0.10939846318277653  (0, 12538)	0.09340237757549746  (0, 13175)	0.1710312120142912  (0, 13734)	0.11761180940346838  (0, 14054)	0.09585784021083837  (0, 14253)	0.21752785367212252  (0, 14776)	0.09685995315132555  (0, 16153)	0.23990935932460106  (0, 16227)	0.4393886460445331  (0, 16422)	0.09280773746831777  (0, 16926)	0.09439448803196618  (0, 17040)	0.16475572065980365  (0, 17770)	0.06891391456300697  :	:  (0, 23320)	0.11574678278061507  (0, 24297)	0.059677103984542026  (0, 25835)	0.1369439334694653  (0, 25874)	0.13274255723485423  (0, 26128)	0.051159711919025457  (0, 26212)	0.21444193797840574  (0, 26215)	0.17491661048372162  (0, 26739)	0.11464550089298901  (0, 27340)	0.05269482639957014  (0, 29923)	0.08363077738316514  (0, 30169)	0.1599339965251251  (0, 31038)	0.13226472517666696  (0, 32949)	0.06950827863534145  (0, 33265)	0.08054652811112657  (0, 33313)	0.16938285888547425  (0, 33775)	0.08959443622666186  (0, 34711)	0.11069385913713713  (0, 34773)	0.18933960447341827  (0, 34806)	0.04600556005327099  (0, 35387)	0.1688663889777995  (0, 36728)	0.09029746726080001  (0, 37785)	0.08238206337971697  (0, 38327)	0.05497731999968855  (0, 39487)	0.12160533960479848  (0, 39688)	0.08271691890091751"
222,1,1,0,0,1,0,0,0,0,0,"(0, 472)	0.06672367194682718  (0, 473)	0.06608506377618943  (0, 881)	0.0946300126164058  (0, 2502)	0.0716361786363053  (0, 2633)	0.26791756227955305  (0, 3311)	0.08603841518663173  (0, 3599)	0.06922653028569711  (0, 5899)	0.0770565751008929  (0, 6036)	0.10505151608454778  (0, 6509)	0.12109803699635625  (0, 6730)	0.2628507603555692  (0, 7601)	0.19197086732617671  (0, 8803)	0.1513938558369754  (0, 8816)	0.13223544792936448  (0, 8864)	0.18459394167838117  (0, 9027)	0.09873385883256121  (0, 9188)	0.09507140219230859  (0, 10654)	0.27773830892740903  (0, 11427)	0.2663473758531647  (0, 11849)	0.10552086340211406  (0, 12054)	0.1154726338699545  (0, 12238)	0.1064580691480664  (0, 12374)	0.12669750611048056  (0, 12713)	0.0921061088500016  (0, 13382)	0.06806171561383356  :	:  (0, 17179)	0.09234701876750186  (0, 17182)	0.08404993736696664  (0, 18380)	0.15920801763141804  (0, 18488)	0.06401237151938537  (0, 19822)	0.07191077625116012  (0, 20867)	0.0959947504525869  (0, 25554)	0.1042841775045714  (0, 28443)	0.20577688841563352  (0, 29156)	0.14201371891233394  (0, 30675)	0.08614940306063491  (0, 30787)	0.12475669636843609  (0, 31563)	0.06550345531133214  (0, 31811)	0.1019288641260505  (0, 32747)	0.14301995007136445  (0, 33409)	0.09972261181134141  (0, 34694)	0.05851566968076152  (0, 34806)	0.058171188261209233  (0, 36000)	0.13884063601389685  (0, 36394)	0.14473345798257214  (0, 36760)	0.0896506312618258  (0, 36924)	0.08820546656213189  (0, 37235)	0.09643551699279855  (0, 37404)	0.10718247347779876  (0, 39637)	0.09799396104925215  (0, 39887)	0.09683694021709374"
278,0,0,0,0,0,0,0,0,0,0,"(0, 473)	0.033181227856584915  (0, 881)	0.09502722192507869  (0, 1075)	0.05028495518814796  (0, 1373)	0.05500522198063677  (0, 1634)	0.06430722054640121  (0, 1736)	0.04205683179349501  (0, 1737)	0.06444784786225755  (0, 1879)	0.1150950374465308  (0, 2585)	0.07787309533603139  (0, 3113)	0.0863911697846573  (0, 3143)	0.046955200921517136  (0, 3599)	0.03475855426136568  (0, 3789)	0.06961644100211178  (0, 4070)	0.13389332892914127  (0, 4113)	0.06694103861926749  (0, 4421)	0.09847548337454079  (0, 4526)	0.07471829929491207  (0, 5247)	0.0401132624661771  (0, 5888)	0.13307135958682909  (0, 6014)	0.10586419848793358  (0, 6038)	0.058516461085459465  (0, 6340)	0.05480223775904836  (0, 6394)	0.04830787898871268  (0, 6577)	0.048149014988411676  (0, 6640)	0.05395064687270305  :	:  (0, 35372)	0.047088246033953104  (0, 35530)	0.032673301782875044  (0, 35893)	0.04812303350512448  (0, 35970)	0.1478622086771406  (0, 36082)	0.057681591369381974  (0, 36394)	0.10900573231015427  (0, 36752)	0.03451927025689723  (0, 36834)	0.0971537099528722  (0, 36901)	0.11012729497609235  (0, 36918)	0.06519150516231509  (0, 36924)	0.04428785442511745  (0, 37104)	0.1035134081701488  (0, 37882)	0.09666908790251619  (0, 38144)	0.05453935263713829  (0, 38224)	0.04991443422867859  (0, 38327)	0.03490360763408052  (0, 38500)	0.05257740862047351  (0, 38679)	0.04213496615494044  (0, 38756)	0.05565677211493287  (0, 38921)	0.052301850864462866  (0, 39210)	0.03940152350433321  (0, 39239)	0.04301134176726981  (0, 39244)	0.06224803773463864  (0, 39270)	0.05178864377476514  (0, 39978)	0.05732116386840689"
287,1,0,1,1,0,0,0,0,0,0,"(0, 880)	0.11467618193238804  (0, 1074)	0.1407278282630938  (0, 1216)	0.13486954342887897  (0, 2585)	0.12409945567738416  (0, 3311)	0.13768732935814298  (0, 3368)	0.14372135195074048  (0, 3403)	0.14685407893708746  (0, 3599)	0.11078325949045628  (0, 9915)	0.1104390396820744  (0, 10462)	0.09547724045451457  (0, 10649)	0.2511344760041836  (0, 11619)	0.15149030973204  (0, 12327)	0.11431027768367855  (0, 17423)	0.13089100183779512  (0, 17767)	0.1178987965591979  (0, 19565)	0.29067141675123226  (0, 21139)	0.24952831449341958  (0, 23755)	0.3543368235564592  (0, 24681)	0.16866099109229243  (0, 25494)	0.14293694770773824  (0, 28643)	0.0825144903774124  (0, 30169)	0.08090586132843243  (0, 30996)	0.20832175028258743  (0, 31954)	0.10506364954957195  (0, 32614)	0.10592365428610552  (0, 35530)	0.10413709507606014  (0, 35785)	0.178374704756054  (0, 36880)	0.11020100398443639  (0, 38499)	0.45375823227986434  (0, 39243)	0.15501514122957957  (0, 39388)	0.15376124926024934"
293,0,0,0,0,0,0,0,0,0,0,"(0, 9854)	0.5795327963119983  (0, 10462)	0.15814694752408257  (0, 10481)	0.30480156413642134  (0, 10933)	0.18609434060348165  (0, 26132)	0.16231783084742168  (0, 26837)	0.24488955587192435  (0, 30169)	0.1340111522388821  (0, 31563)	0.17363078922123343  (0, 31954)	0.17402571956319152  (0, 33536)	0.2768490791009357  (0, 33660)	0.20202054048355192  (0, 35530)	0.1724909898097642  (0, 36465)	0.29717203238901546  (0, 38692)	0.3337132756742775"
348,0,0,0,0,0,0,1,0,0,0,"(0, 472)	0.0428993702760141  (0, 2425)	0.05796436311098056  (0, 2905)	0.13162848481623274  (0, 3030)	0.08060276050321172  (0, 3984)	0.1661228108926384  (0, 4428)	0.08845120334432136  (0, 4902)	0.05227580002351941  (0, 5247)	0.05136529947351163  (0, 5567)	0.05255468688106252  (0, 5888)	0.042599690328655064  (0, 6253)	0.053390518238055455  (0, 6268)	0.13653922937174784  (0, 6640)	0.0690841622703387  (0, 7638)	0.07837023862708353  (0, 7936)	0.15021798974634523  (0, 7971)	0.06039193637702783  (0, 8133)	0.04131872187447442  (0, 8144)	0.07354737419831595  (0, 8341)	0.11167571834332572  (0, 8495)	0.08831102802864267  (0, 8501)	0.07284587701211441  (0, 8822)	0.06033602423863718  (0, 8915)	0.06046520992668771  (0, 9172)	0.07072877332005745  (0, 9398)	0.07872084399770923  :	:  (0, 34006)	0.07318165470165298  (0, 34194)	0.039177893340620336  (0, 34267)	0.07599872416449335  (0, 34674)	0.058225424493860825  (0, 34694)	0.03762211082424232  (0, 36895)	0.08362722464259058  (0, 36919)	0.15871987651119976  (0, 36921)	0.18040159966585514  (0, 36924)	0.0567108922517477  (0, 37235)	0.06200232736786059  (0, 37255)	0.08724431005744754  (0, 37257)	0.08341957292714902  (0, 38223)	0.06620929328634167  (0, 38316)	0.11044062594517604  (0, 38327)	0.04469430179961511  (0, 38368)	0.1334371957434308  (0, 38626)	0.09761126238221639  (0, 38988)	0.10507961880428923  (0, 39239)	0.055076309300352046  (0, 39243)	0.06227927165736467  (0, 39463)	0.08729716614989484  (0, 39466)	0.1663373538160268  (0, 39565)	0.0671563705027613  (0, 39714)	0.0713486708115855  (0, 39873)	0.05548277437087697"
367,0,0,0,0,0,0,1,0,0,0,"(0, 2321)	0.20862462291368047  (0, 4062)	0.04835289149057659  (0, 5832)	0.09284702817643868  (0, 5888)	0.1364439525621299  (0, 5899)	0.05289410534982373  (0, 5954)	0.08951283068975668  (0, 6470)	0.05086877883961069  (0, 8126)	0.062371135674559666  (0, 9175)	0.11660988987351349  (0, 9637)	0.10468720542645453  (0, 10205)	0.09442955547634278  (0, 10487)	0.08378852243710955  (0, 10933)	0.04819125205566831  (0, 11007)	0.041786988774116754  (0, 11433)	0.06600055169772714  (0, 11441)	0.09682225509376577  (0, 11927)	0.05522559437083513  (0, 11928)	0.10278588061555272  (0, 12087)	0.04289206727703785  (0, 12130)	0.2059658718948931  (0, 12409)	0.0863641535914341  (0, 13370)	0.09450483360287072  (0, 13537)	0.07917280697716533  (0, 14065)	0.04567626078066323  (0, 14212)	0.0719708149547167  :	:  (0, 27947)	0.08415156615761475  (0, 28050)	0.1101483390806863  (0, 28622)	0.056204263296352266  (0, 28627)	0.3328329406061104  (0, 28810)	0.0448771224179487  (0, 28873)	0.12425139366228258  (0, 28909)	0.0653800178569411  (0, 29094)	0.056329271443264876  (0, 29100)	0.09693238402977009  (0, 29667)	0.10267548521164298  (0, 29822)	0.22844553365643522  (0, 30956)	0.08919533030570803  (0, 31745)	0.043761941552621864  (0, 31753)	0.11365039325155397  (0, 32468)	0.12535550098089743  (0, 33294)	0.27145640394584125  (0, 33831)	0.06216150205024399  (0, 33959)	0.06071715022378523  (0, 34026)	0.07849719150847408  (0, 34978)	0.07731592849014052  (0, 36752)	0.09438436015153083  (0, 36787)	0.13455143324313826  (0, 37641)	0.13963028183089168  (0, 38676)	0.07348195086474249  (0, 39009)	0.08438314426355187"


original code that I used to split the datasets, I decided to do this in the processing script

In [0]:
# idx_list = list(df2['id'])
# random.seed(1234)
# random.shuffle(idx_list)
# idx_train= idx_list[0:int(len(df2)*0.50)]
# idx_val= idx_list[int(len(df2)*0.50):int(len(df2)*0.75)]
# idx_test= idx_list[int(len(df2)*0.75):len(df2)]
# train_set = df2.iloc[idx_train]
# val_set = df2.iloc[idx_val]
# test_set = df2.iloc[idx_test]

In [0]:
# train_set_split_0 = pd.DataFrame(train_set)
# train_set_split_0 = train_set_split_0[["id","features"]]
# train_set_split_0 = train_set_split_0["features"].astype('string')
# spf = pd.DataFrame(train_set_split_0).reset_index()
# spf.columns = ["id","features"]
# s_train_set_split_0 = pandas_to_spark(spf)
# s_train_set_split_0.registerTempTable("s_train_set_split")

In [0]:
# test_set_split_0 = pd.DataFrame(test_set)
# test_set_split_0 = test_set_split_0[["id","features"]]
# test_set_split_0 = test_set_split_0["features"].astype('string')
# spf_test = pd.DataFrame(test_set_split_0).reset_index()
# spf_test.columns = ["id","features"]
# s_test_set_split_0 = pandas_to_spark(spf_test)
# s_test_set_split_0.registerTempTable("s_test_set_split")

In [0]:
# valid_set_split_0 = pd.DataFrame(val_set)
# valid_set_split_0 = valid_set_split_0[["id","features"]]
# valid_set_split_0 = valid_set_split_0["features"].astype('string')
# spf_valid = pd.DataFrame(valid_set_split_0).reset_index()
# spf_valid.columns = ["id","features"]
# s_valid_set_split_0 = pandas_to_spark(spf_valid)
# s_valid_set_split_0.registerTempTable("s_valid_set_split")

In [0]:
# train_set_split_0 = pd.DataFrame(train_set)
# train_set_split_0 = train_set_split_0[["id","features"]]
# train_set_split_0 = train_set_split_0["features"].astype('string')
# spf = pd.DataFrame(train_set_split_0).reset_index()
# spf.columns = ["id","features"]
# s_train_set_split_0 = pandas_to_spark(spf)
# s_train_set_split_0.registerTempTable("s_train_set_split")

In [0]:
# %sql
# create table tempdb.s_train_set_split(
# select * 
# from s_train_set_split)

num_affected_rows,num_inserted_rows


In [0]:
# %sql
# create table tempdb.s_test_set_split(
# select * 
# from s_test_set_split)

num_affected_rows,num_inserted_rows


In [0]:
# %sql
# create table tempdb.s_valid_set_split(
# select * 
# from s_valid_set_split)

num_affected_rows,num_inserted_rows


In [0]:
# %sql
# create table tempdb.dataset_splits_lookup as (

# select * 
# from (
#       select 
#           *,
#           'valid' as type
#       from tempdb.s_valid_set_split

#       union all 

#       select 
#           *,
#           'test' as type
#       from tempdb.s_test_set_split

#       union all 

#       select 
#           *,
#           'train' as type
#       from tempdb.s_train_set_split
#       ) a
# )

num_affected_rows,num_inserted_rows


In [0]:

# from pyspark.sql.functions import col 
# df = s_df2
# path = "DATA_TFIDFV1_HADM_TOP10"
# if type(df) != pd.DataFrame:    
#     udf = udf(lambda x: Vectors.stringify(x), StringType())
#     sn_df2 = df.withColumn(col, udf(df[col]))
#     # df2.write.csv(path, header=True)
#     sn_df2.registerTempTable(path)

In [0]:
# batch_output_spark(s_df2, "tfidfv1", "DATA_TFIDFV1_HADM_TOP10", ICD9CODES_TOP10, hadmid_pickle='/dbfs/FileStore/data/TRAIN_VAL_TEST_HADMID.p')


In [0]:
# spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [0]:
s_df2 = spark.createDataFrame(df2)

In [0]:
s_df2 = pandas_to_spark(df2)

In [0]:
# batch_output_spark(s_df2, "word2vecv0", "DATA_WORD2VECV1_HADM_TOP50", ICD9CODES_TOP50,
#                  hadmid_pickle='/dbfs/FileStore/data/TRAIN_VAL_TEST_HADMID.p')
# batch_output_csv(df2, "word2vecv0", "/dbfs/FileStore/data/DATA_WORD2VECV1_HADM_TOP10CAT", ICD9CAT_TOP10,
#                  hadmid_pickle='/dbfs/FileStore/data/TRAIN_VAL_TEST_HADMID.p')
# batch_output_csv(df2, "word2vecv0", "/dbfs/FileStore/data/DATA_WORD2VECV1_HADM_TOP50CAT", ICD9CAT_TOP50,
#                  hadmid_pickle='/dbfs/FileStore/data/TRAIN_VAL_TEST_HADMID.p')

end of original processing code

In [0]:
import gensim

In [0]:
model_w2v = gensim.models.Word2Vec.load('/dbfs/FileStore/data/model_word2vec_skipgram_300dim')

In [0]:
df2 = create_WORD2VEC(df1_pd.copy(),word2vecmodel= model_w2v)