In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [4]:
# Load the file into pandas
jsonl = '../jsonl/inlined.jsonl'
df_orig = ''
with open(jsonl,"r") as json_file:
    data=json_file.read()
    df_orig = pd.read_json(data, lines=True)

In [5]:
df_orig.head(5)

Unnamed: 0,audience,categories,concepts,gender,max_age,min_age,setting,text
0,3.0,"[1040, 1030]",[20016],0.0,64.0,13.0,0.0,hwcv_20016 hwcv_20016 hwcv_20016 topic over...
1,3.0,[1030],"[20058, 20044]",0.0,64.0,13.0,0.0,hwcv_20044 hwcv_20044 hwcv_20058 hwcv_20058 pr...
2,3.0,"[1018, 1030]",,0.0,64.0,13.0,0.0,hwcv_20029 : genetic disorders and hwcv_2003...
3,3.0,[1021],"[21433, 22373]",2.0,110.0,2.0,0.0,hwcv_22373 hwcv_22373 hwcv_21433 hwcv_21433 h...
4,,,,,,,,hair growth most scalp hair grows steadily. a...


In [6]:
# Get all of the unindexed documents
df_unindexed = df_orig[pd.isnull(df_orig['concepts'])]

In [7]:
# Strip out any rows where concepts is null, NaN, ec
df_orig = df_orig[pd.notnull(df_orig['concepts'])]

In [8]:
# Convert the numbers from float to int
df_orig['audience'] = df_orig['audience'].astype(int)
df_orig['max_age'] = df_orig['max_age'].astype(int)
df_orig['min_age'] = df_orig['min_age'].astype(int)
df_orig['gender'] = df_orig['gender'].astype(int)
df_orig['setting'] = df_orig['gender'].astype(int)

In [9]:
df_orig.head(5)

Unnamed: 0,audience,categories,concepts,gender,max_age,min_age,setting,text
0,3,"[1040, 1030]",[20016],0,64,13,0,hwcv_20016 hwcv_20016 hwcv_20016 topic over...
1,3,[1030],"[20058, 20044]",0,64,13,0,hwcv_20044 hwcv_20044 hwcv_20058 hwcv_20058 pr...
3,3,[1021],"[21433, 22373]",2,110,2,2,hwcv_22373 hwcv_22373 hwcv_21433 hwcv_21433 h...
7,3,"[1018, 1030, 1039]","[21474, 21435, 4124]",0,64,13,0,hwcv_4124 hwcv_4124 hwcv_21435 hwcv_21435 hwcv...
9,3,"[1008, 1051]","[22752, 4280]",2,110,19,2,hwcv_4280 hwcv_4280 hwcv_22752 hwcv_22752 hwc...


### MultiLabelBinarizer
Unless you can insure that your train dataset includes ALL LABELS, a hack needs to be introduced.  We will build out MultiLabelBinarizer off the complete set of concepts and use this object later against the train/test slices.  This will result in warnings, but unless we make certain all labels are present in train - its the only easy thing to do.


In [10]:
# Build our binary map array for the whole collection since we can't
# be certain that we don't use a label in the test set that isn't
# present in the train set
from sklearn import preprocessing
lb = preprocessing.MultiLabelBinarizer()
Y_all = lb.fit_transform(df_orig['concepts'])
lb

MultiLabelBinarizer(classes=None, sparse_output=False)

In [11]:
ids = df_orig['id']

KeyError: 'id'

In [15]:
# Split into test/train
#X_train, X_test, Y_train, Y_test, id_train, id_test = train_test_split(df_orig, Y_all, ids, test_size=0.1, random_state=1)
X_train, X_test, Y_train, Y_test = train_test_split(df_orig, Y_all, test_size=0.2, random_state=1)

print("X Training set: ", len(X_train))
print("Y Training set: ", len(Y_train))
print("X Testing set: ", len(X_test))
print("Y Testing set: ", len(Y_test))
#print("IDs Test: ", len(id_train))
#print("IDs Train: ", len(id_test))

X Training set:  2743
Y Training set:  2743
X Testing set:  686
Y Testing set:  686


In [16]:
#X_prime_train = X_train.drop('id',1).drop('categories',1).drop('concepts',1)
#Y_prime_train = Y_train
#X_prime_train

In [17]:
#X_prime_train.shape

In [18]:
#Y_prime_train.shape

In [19]:
X_train

Unnamed: 0,audience,categories,concepts,gender,max_age,min_age,setting,text
819,3,"[1500, 1533]",[20442],2,110,13,2,hwcv_20442 hwcv_20442 hwcv_20442 : care instr...
625,2,"[1025, 1044]",[4579],2,64,13,2,hwcv_4579 hwcv_4579 caring for more than one b...
2432,4,"[1034, 1012]","[20866, 4748, 20795, 20796, 20798]",2,110,0,2,hwcv_20798 hwcv_20798 hwcv_20796 hwcv_20796 hw...
1771,2,"[1025, 1026, 1007]","[20192, 22985, 4223]",2,12,0,2,hwcv_4223 hwcv_4223 hwcv_22985 hwcv_22985 hwcv...
973,3,"[1506, 1500, 1533]",[20442],2,110,6,2,hwcv_20442 hwcv_20442 using a hwcv_05533 : ca...
634,3,"[1032, 1040, 1030]",[21478],0,64,13,0,hwcv_21478 hwcv_21478 contraction hwcv_21316 ...
2710,2,"[1504, 1523, 1524]","[21170, 22039]",2,18,0,2,hwcv_22039 hwcv_22039 hwcv_21170 hwcv_21170 h...
874,3,"[1524, 1502]",[21835],2,110,19,2,hwcv_21835 hwcv_21835 learning about life with...
2276,3,"[1043, 1013]","[4234, 22252, 22124, 5103, 20144, 22097]",2,110,19,2,hwcv_22097 hwcv_22097 hwcv_20144 hwcv_20144 hw...
1846,2,"[1504, 1523, 1524, 1531]","[22692, 4750]",2,18,0,2,hwcv_4750 hwcv_4750 hwcv_22692 hwcv_22692 hwc...


```('vectorizer', CountVectorizer(lowercase=True, 
                                   stop_words='english', 
                                   max_df=0.8, 
                                   min_df=2)),
```                                   
0.25254813137

```
classifier = Pipeline([
('vectorizer', CountVectorizer(lowercase=True, 
                                   stop_words='english', 
                                   max_df=0.8, 
                                   min_df=4)),

```

0.249150622877

In [20]:
type(X_train)

pandas.core.frame.DataFrame

In [21]:
type(Y_train)

numpy.ndarray

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

from nltk.corpus import stopwords

classifier = Pipeline([
('vectorizer', CountVectorizer(lowercase=True, 
                                   stop_words=stopwords.words('english'), 
                                   max_df=0.8, 
                                   min_df=.00001)),
('tfidf', TfidfTransformer()),   
('clf', OneVsRestClassifier(LinearSVC()))])

classifier.fit(X_train['text'], Y_train)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classe

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=1e-05,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=[...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))])

In [23]:
Y_predicted = classifier.predict(X_test['text'])
print("Test predictions: ",len(Y_predicted))

Test predictions:  686


import numpy
for count, prediction in enumerate(Y_predicted):
    answer = Y_test[count]
    print("Prediction length: ",len(prediction))
    print("Answer length: ",len(answer))

    print("--")
    if np.array_equal(prediction, answer):
        print("!")

In [24]:
from sklearn.metrics import accuracy_score
print(len(Y_test))
print(len(Y_predicted))
print("Accuracy Score: ",accuracy_score(Y_predicted, Y_test))


686
686
Accuracy Score:  0.0918367346939


for id,concept in zip()

In [None]:
inversed = lb.inverse_transform(Y_predicted)
actual_inverse = lb.inverse_transform(Y_test)

In [None]:
for id, inverse, id, actual in zip(Y_predicted, inversed, id_test, actual_inverse):
    print('%r => %s | %s' % (id, inverse, actual))

#for doc, category in zip(docs_new, predicted):
#    print('%r => %s' % (doc, category))

In [None]:
def fix_hwcvs(ids):
    out = set()
    for i,a in enumerate(ids):
        str_a = a.astype(str)
        if len(str_a) == 4:
            str_a = str("0"+str(a))
        hwcv = 'HWCV_'+str_a
        out.add(hwcv)
    return out
            
            

In [None]:
with open('/usr/local/tmp/results1.tsv', 'w') as f: 
    for id, inverse, id, actual in zip(Y_predicted, inversed, id_test, actual_inverse):
        predicted = fix_hwcvs(inverse)
        if (len(predicted) == 0):
            predicted = ""
        actual = fix_hwcvs(actual)
        
        predicted_s = str(predicted)
        actual_s = str(actual)
        
        x = id+'\t'+predicted_s+'\t'+str(actual_s)
        
        post = ""
        if len(predicted) == 0:
            post = "no prediction"
        elif predicted == actual:
            post = 'exact'
        elif predicted.issubset(actual):
            post = 'partial'
        else:
            post = 'wrong'
        out = str(x)+'\t'+post+'\n'
        f.write(out)
