In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [2]:
# Load the file into pandas
jsonl = '/usr/local/tmp/out.jsonl'
df_orig = ''
with open(jsonl,"r") as json_file:
    data=json_file.read()
    df_orig = pd.read_json(data, lines=True)

In [3]:
df_orig.head(5)

Unnamed: 0,audience,categories,concepts,gender,id,max_age,min_age,setting,text
0,3.0,"[1040, 1030]",[20016],0.0,aa100180,64.0,13.0,0.0,polyhydramnios topic overview polyhydramnios...
1,3.0,[1030],"[20058, 20044]",0.0,aa100355,64.0,13.0,0.0,premature delivery in multiple pregnancy topi...
2,3.0,"[1018, 1030]",,0.0,aa100367,64.0,13.0,0.0,multiple pregnancy: genetic disorders and birt...
3,3.0,[1021],"[21433, 22373]",2.0,aa10253,110.0,2.0,0.0,ambulatory electrocardiogram test overview a...
4,,,,,aa102542,,,,hair growth most scalp hair grows steadily. a...


In [4]:
# Get all of the unindexed documents
df_unindexed = df_orig[pd.isnull(df_orig['categories'])]

In [5]:
# Strip out any rows where concepts is null, NaN, ec
df_orig = df_orig[pd.notnull(df_orig['categories'])]

In [6]:
# Convert the numbers from float to int
df_orig['audience'] = df_orig['audience'].astype(int)
df_orig['max_age'] = df_orig['max_age'].astype(int)
df_orig['min_age'] = df_orig['min_age'].astype(int)
df_orig['gender'] = df_orig['gender'].astype(int)
df_orig['setting'] = df_orig['gender'].astype(int)

In [7]:
df_orig.head(5)

Unnamed: 0,audience,categories,concepts,gender,id,max_age,min_age,setting,text
0,3,"[1040, 1030]",[20016],0,aa100180,64,13,0,polyhydramnios topic overview polyhydramnios...
1,3,[1030],"[20058, 20044]",0,aa100355,64,13,0,premature delivery in multiple pregnancy topi...
2,3,"[1018, 1030]",,0,aa100367,64,13,0,multiple pregnancy: genetic disorders and birt...
3,3,[1021],"[21433, 22373]",2,aa10253,110,2,2,ambulatory electrocardiogram test overview a...
6,3,[1034],,2,aa102618,110,19,2,hair loss caused by lack of protein topic ove...


### MultiLabelBinarizer
Unless you can insure that your train dataset includes ALL LABELS, a hack needs to be introduced.  We will build out MultiLabelBinarizer off the complete set of concepts and use this object later against the train/test slices.  This will result in warnings, but unless we make certain all labels are present in train - its the only easy thing to do.


In [8]:
# Build our binary map array for the whole collection since we can't
# be certain that we don't use a label in the test set that isn't
# present in the train set
from sklearn import preprocessing
lb = preprocessing.MultiLabelBinarizer()
Y_all = lb.fit_transform(df_orig['categories'])
lb

MultiLabelBinarizer(classes=None, sparse_output=False)

In [9]:
ids = df_orig['id']

In [10]:
# Split into test/train
X_train, X_test, Y_train, Y_test, id_train, id_test = train_test_split(df_orig, Y_all, ids, test_size=0.3, random_state=1)

print("X Training set: ", len(X_train))
print("Y Training set: ", len(Y_train))
print("X Testing set: ", len(X_test))
print("Y Testing set: ", len(Y_test))
print("IDs Test: ", len(id_train))
print("IDs Train: ", len(id_test))

X Training set:  8151
Y Training set:  8151
X Testing set:  3494
Y Testing set:  3494
IDs Test:  8151
IDs Train:  3494


In [11]:
#X_prime_train = X_train.drop('id',1).drop('categories',1).drop('concepts',1)
#Y_prime_train = Y_train
#X_prime_train

In [12]:
#X_prime_train.shape

In [13]:
#Y_prime_train.shape

In [14]:
X_train

Unnamed: 0,audience,categories,concepts,gender,id,max_age,min_age,setting,text
13808,3,"[1032, 1040, 1049]","[23496, 5193, 5194, 21817]",0,tw3491,64,13,0,abortion: emotional recovery topic overview ...
8554,3,"[1032, 1033, 1034]",[20891],2,hw103896,110,13,2,laser surgery for genital warts surgery overv...
8652,4,"[1025, 1003, 1007, 1041, 1012, 1048]",[20216],2,hw119898,110,1,2,lead poisoning topic overview lead poisoning...
12894,3,"[1040, 1020, 1030]",[4124],0,tn9108,64,13,0,pregnancy: vegetarian diet topic overview a ...
2730,3,[1521],,2,abq6157,110,19,2,weakness: care instructions weakness: generali...
2002,3,"[1531, 1516]",[20480],2,abp2013,110,2,2,"traumatic brain injury, long-term healing: car..."
634,3,"[1032, 1040, 1030]",[21478],0,aa77493,64,13,0,contraction stress test test overview a cont...
2335,3,[1500],[20431],2,abq0133,110,13,2,latex allergy: care instructions latex allergy...
2030,3,[1505],"[4209, 20446]",2,abp2722,110,19,2,learning about insulin pumps diabetes: insulin...
7585,0,[1008],,2,hn-1029002,110,0,2,high homocysteine (holistic) about this condi...


```('vectorizer', CountVectorizer(lowercase=True, 
                                   stop_words='english', 
                                   max_df=0.8, 
                                   min_df=2)),
```                                   
0.25254813137

```
classifier = Pipeline([
('vectorizer', CountVectorizer(lowercase=True, 
                                   stop_words='english', 
                                   max_df=0.8, 
                                   min_df=4)),

```

0.249150622877

In [15]:
type(X_train)

pandas.core.frame.DataFrame

In [16]:
type(Y_train)

numpy.ndarray

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

from nltk.corpus import stopwords

classifier = Pipeline([
('vectorizer', CountVectorizer(lowercase=True, 
                                   stop_words=stopwords.words('english'), 
                                   max_df=0.8, 
                                   min_df=.01)),
('tfidf', TfidfTransformer()),   
('clf', OneVsRestClassifier(LinearSVC()))])

classifier.fit(X_train['text'], Y_train)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=0.01,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))])

In [18]:
Y_predicted = classifier.predict(X_test['text'])
print("Test predictions: ",len(Y_predicted))

Test predictions:  3494


import numpy
for count, prediction in enumerate(Y_predicted):
    answer = Y_test[count]
    print("Prediction length: ",len(prediction))
    print("Answer length: ",len(answer))

    print("--")
    if np.array_equal(prediction, answer):
        print("!")

In [None]:
from sklearn.metrics import accuracy_score
print(len(Y_test))
print(len(Y_predicted))
print("Accuracy Score: ",accuracy_score(Y_predicted, Y_test))


for id,concept in zip()

In [None]:
inversed = lb.inverse_transform(Y_predicted)
actual_inverse = lb.inverse_transform(Y_test)

In [None]:
for id, inverse, id, actual in zip(Y_predicted, inversed, id_test, actual_inverse):
    print('%r => %s | %s' % (id, inverse, actual))

#for doc, category in zip(docs_new, predicted):
#    print('%r => %s' % (doc, category))

In [None]:
def fix_hwcvs(ids):
    out = set()
    for i,a in enumerate(ids):
        str_a = a.astype(str)
        if len(str_a) == 4:
            str_a = str("0"+str(a))
        hwcv = 'cat'+str_a
        out.add(hwcv)
    return out
            
            

In [None]:
with open('/usr/local/tmp/categories.tsv', 'w') as f: 
    for id, inverse, id, actual in zip(Y_predicted, inversed, id_test, actual_inverse):
        predicted = fix_hwcvs(inverse)
        if (len(predicted) == 0):
            predicted = ""
        actual = fix_hwcvs(actual)
        
        predicted_s = str(predicted)
        actual_s = str(actual)
        
        x = id+'\t'+predicted_s+'\t'+str(actual_s)
        
        post = ""
        if len(predicted) == 0:
            post = "no prediction"
        elif predicted == actual:
            post = 'exact'
        elif predicted.issubset(actual):
            post = 'partial'
        else:
            post = 'wrong'
        out = str(x)+'\t'+post+'\n'
        f.write(out)
