In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [2]:
# Load the file into pandas
jsonl = '/usr/local/tmp/out.jsonl'
df_orig = ''
with open(jsonl,"r") as json_file:
    data=json_file.read()
    df_orig = pd.read_json(data, lines=True)

In [3]:
df_orig.head(5)

Unnamed: 0,audience,categories,concepts,gender,id,max_age,min_age,setting,text
0,3.0,"[1040, 1030]",[20016],0.0,aa100180,64.0,13.0,0.0,polyhydramnios topic overview polyhydramnios...
1,3.0,[1030],"[20058, 20044]",0.0,aa100355,64.0,13.0,0.0,premature delivery in multiple pregnancy topi...
2,3.0,"[1018, 1030]",,0.0,aa100367,64.0,13.0,0.0,multiple pregnancy: genetic disorders and birt...
3,3.0,[1021],"[21433, 22373]",2.0,aa10253,110.0,2.0,0.0,ambulatory electrocardiogram test overview a...
4,,,,,aa102542,,,,hair growth most scalp hair grows steadily. a...


In [4]:
# Get all of the unindexed documents
df_unindexed = df_orig[pd.isnull(df_orig['concepts'])]

In [5]:
# Strip out any rows where concepts is null, NaN, ec
df_orig = df_orig[pd.notnull(df_orig['concepts'])]

In [6]:
# Convert the numbers from float to int
df_orig['audience'] = df_orig['audience'].astype(int)
df_orig['max_age'] = df_orig['max_age'].astype(int)
df_orig['min_age'] = df_orig['min_age'].astype(int)
df_orig['gender'] = df_orig['gender'].astype(int)
df_orig['setting'] = df_orig['gender'].astype(int)

In [7]:
df_orig.head(5)

Unnamed: 0,audience,categories,concepts,gender,id,max_age,min_age,setting,text
0,3,"[1040, 1030]",[20016],0,aa100180,64,13,0,polyhydramnios topic overview polyhydramnios...
1,3,[1030],"[20058, 20044]",0,aa100355,64,13,0,premature delivery in multiple pregnancy topi...
3,3,[1021],"[21433, 22373]",2,aa10253,110,2,2,ambulatory electrocardiogram test overview a...
7,3,"[1018, 1030, 1039]","[21474, 21435, 4124]",0,aa103080,64,13,0,pregnancy: should i have amniocentesis?
9,3,"[1008, 1051]","[22752, 4280]",2,aa104339spec,110,19,2,bodywork and manual therapy topic overview b...


### MultiLabelBinarizer
Unless you can insure that your train dataset includes ALL LABELS, a hack needs to be introduced.  We will build out MultiLabelBinarizer off the complete set of concepts and use this object later against the train/test slices.  This will result in warnings, but unless we make certain all labels are present in train - its the only easy thing to do.


In [8]:
# Build our binary map array for the whole collection since we can't
# be certain that we don't use a label in the test set that isn't
# present in the train set
from sklearn import preprocessing
lb = preprocessing.MultiLabelBinarizer()
Y_all = lb.fit_transform(df_orig['concepts'])
lb

MultiLabelBinarizer(classes=None, sparse_output=False)

In [9]:
ids = df_orig['id']

In [10]:
# Split into test/train
X_train, X_test, Y_train, Y_test, id_train, id_test = train_test_split(df_orig, Y_all, ids, test_size=0.1, random_state=1)

print("X Training set: ", len(X_train))
print("Y Training set: ", len(Y_train))
print("X Testing set: ", len(X_test))
print("Y Testing set: ", len(Y_test))
print("IDs Test: ", len(id_train))
print("IDs Train: ", len(id_test))

X Training set:  7943
Y Training set:  7943
X Testing set:  883
Y Testing set:  883
IDs Test:  7943
IDs Train:  883


In [11]:
#X_prime_train = X_train.drop('id',1).drop('categories',1).drop('concepts',1)
#Y_prime_train = Y_train
#X_prime_train

In [12]:
#X_prime_train.shape

In [13]:
#Y_prime_train.shape

In [15]:
X_train

Unnamed: 0,audience,categories,concepts,gender,id,max_age,min_age,setting,text
16002,2,[1010],"[21060, 20446]",2,uq2791,12,2,2,diabetes: high or low blood sugar in young chi...
1626,3,"[1002, 1030]","[23491, 20740, 4777, 4538, 4124]",0,abo3489,64,13,0,pregnancy: dealing with back pain
3785,3,"[1513, 1524, 1508]","[20786, 21781]",2,av2928,110,19,2,endoscopic ultrasound (rectal): what to expect...
313,3,[1037],[20126],2,aa166321,110,19,2,foods high in oxalate topic overview oxalate...
3123,3,[1005],[22655],2,abr9139,110,19,2,pseudobulbar affect (pba) topic overview pse...
12175,3,"[1011, 1012]","[4713, 20219]",2,te4626,110,19,2,food poisoning and safe food handling topic o...
15483,3,"[1523, 1533, 1518]",[21066],0,ug6302,18,13,0,bartholin gland cyst in teens: care instructio...
17613,3,"[1521, 1524]","[21869, 20855]",2,zu2039,110,19,2,bunionectomy: before your surgery bunionectomy...
4049,2,"[1504, 1523, 1533]",[20862],2,bz1121,18,0,2,dry skin in children: care instructions dry sk...
3981,2,"[1521, 1523, 1533, 1531]",[20307],2,bu1140,18,2,2,broken rib in children: care instructions rib ...


```('vectorizer', CountVectorizer(lowercase=True, 
                                   stop_words='english', 
                                   max_df=0.8, 
                                   min_df=2)),
```                                   
0.25254813137

```
classifier = Pipeline([
('vectorizer', CountVectorizer(lowercase=True, 
                                   stop_words='english', 
                                   max_df=0.8, 
                                   min_df=4)),

```

0.249150622877

In [29]:
type(X_train)

pandas.core.frame.DataFrame

In [26]:
type(Y_train)

numpy.ndarray

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

from nltk.corpus import stopwords

classifier = Pipeline([
('vectorizer', CountVectorizer(lowercase=True, 
                                   stop_words=stopwords.words('english'), 
                                   max_df=0.8, 
                                   min_df=.00001)),
('tfidf', TfidfTransformer()),   
('clf', OneVsRestClassifier(LinearSVC()))])

classifier.fit(X_train['text'], Y_train)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


In [22]:
Y_predicted = classifier.predict(X_test['text'])
print("Test predictions: ",len(Y_predicted))

Test predictions:  883


import numpy
for count, prediction in enumerate(Y_predicted):
    answer = Y_test[count]
    print("Prediction length: ",len(prediction))
    print("Answer length: ",len(answer))

    print("--")
    if np.array_equal(prediction, answer):
        print("!")

In [23]:
from sklearn.metrics import accuracy_score
print(len(Y_test))
print(len(Y_predicted))
print("Accuracy Score: ",accuracy_score(Y_predicted, Y_test))


883
883
Accuracy Score:  0.248018120045


for id,concept in zip()

In [None]:
inversed = lb.inverse_transform(Y_predicted)
actual_inverse = lb.inverse_transform(Y_test)

In [None]:
for id, inverse, id, actual in zip(Y_predicted, inversed, id_test, actual_inverse):
    print('%r => %s | %s' % (id, inverse, actual))

#for doc, category in zip(docs_new, predicted):
#    print('%r => %s' % (doc, category))

In [None]:
def fix_hwcvs(ids):
    out = set()
    for i,a in enumerate(ids):
        str_a = a.astype(str)
        if len(str_a) == 4:
            str_a = str("0"+str(a))
        hwcv = 'HWCV_'+str_a
        out.add(hwcv)
    return out
            
            

In [None]:
with open('/usr/local/tmp/results1.tsv', 'w') as f: 
    for id, inverse, id, actual in zip(Y_predicted, inversed, id_test, actual_inverse):
        predicted = fix_hwcvs(inverse)
        if (len(predicted) == 0):
            predicted = ""
        actual = fix_hwcvs(actual)
        
        predicted_s = str(predicted)
        actual_s = str(actual)
        
        x = id+'\t'+predicted_s+'\t'+str(actual_s)
        
        post = ""
        if len(predicted) == 0:
            post = "no prediction"
        elif predicted == actual:
            post = 'exact'
        elif predicted.issubset(actual):
            post = 'partial'
        else:
            post = 'wrong'
        out = str(x)+'\t'+post+'\n'
        f.write(out)
