In [0]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import multiprocessing
from tqdm import tqdm
from sklearn import utils
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from gensim.parsing.porter import PorterStemmer
from textblob import TextBlob

In [0]:
train = pd.read_csv('task3_all.tsv', sep="\t", header=None)
train.columns = train.iloc[0]
train = train.drop(train.index[0])
train = train.astype(str)

test = pd.read_csv('task3_validation.tsv', sep="\t", header=None)
test.columns = test.iloc[0]
test = test.drop(test.index[0])
test = test.astype(str)

train = train[train.type == 'ADR']
test = test[test.type == 'ADR']

from bs4 import BeautifulSoup
import re

def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = text.lower()
    return text
train['extraction'] = train['extraction'].apply(cleanText)
test['extraction'] = test['extraction'].apply(cleanText)



In [0]:
test.head()

Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term
1,332317478170546176,28,37,ADR,allergies,avelox,"do you have any medication allergies? ""asthma!...",10013661,drug allergy
2,347806215776116737,31,46,ADR,hurt your liver,avelox,"@ashleylvivian if #avelox has hurt your liver,...",10024668,liver damage
3,350336129817509888,48,50,ADR,ad,baclofen,"apparently, baclofen greatly exacerbates the ""...",10003731,attention deficit disorder
4,350336129817509888,88,93,ADR,focus,baclofen,"apparently, baclofen greatly exacerbates the ""...",10003738,attention impaired
5,332540699692130304,11,15,ADR,died,cipro,pt of mine died from cipro rt @ciproispoison: ...,10011906,death


In [0]:
train.meddra_code.value_counts()

10047896    74
10073281    65
10048010    39
10043890    36
10016370    36
10041349    35
10019211    35
10041001    33
10033371    31
10016365    30
10022437    27
10016384    27
10001718    21
10041014    20
10001125    19
10028813    18
10041000    17
10043087    15
10012336    15
10004969    15
10003988    14
10016336    14
10027374    14
10042661    14
10049278    14
10013649    14
10012378    14
10019158    13
10071175    13
10041017    12
            ..
10048013     1
10064160     1
10044124     1
10038001     1
10035805     1
10020197     1
10009696     1
10046571     1
10001488     1
10038744     1
10043498     1
10016821     1
10016344     1
10040559     1
10001639     1
10065015     1
10044573     1
10077275     1
10028322     1
10049183     1
10017375     1
10033434     1
10042076     1
10041005     1
10013082     1
10036507     1
10044698     1
10012790     1
10043439     1
10008477     1
Name: meddra_code, Length: 475, dtype: int64

In [0]:
import nltk
from nltk.corpus import stopwords
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['extraction']), tags=[r.meddra_code]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['extraction']), tags=[r.meddra_code]), axis=1)


In [0]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [0]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

logreg = LogisticRegression(n_jobs=1, C=1e5, max_iter=100, class_weight = 'balanced')


 # Distributed Model

In [0]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])


100%|██████████| 1829/1829 [00:00<00:00, 1311123.23it/s]


In [0]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 1829/1829 [00:00<00:00, 1372094.80it/s]
100%|██████████| 1829/1829 [00:00<00:00, 2721313.24it/s]
100%|██████████| 1829/1829 [00:00<00:00, 2771453.04it/s]
100%|██████████| 1829/1829 [00:00<00:00, 2681363.86it/s]
100%|██████████| 1829/1829 [00:00<00:00, 2647129.75it/s]
100%|██████████| 1829/1829 [00:00<00:00, 2862455.98it/s]
100%|██████████| 1829/1829 [00:00<00:00, 1581076.26it/s]
100%|██████████| 1829/1829 [00:00<00:00, 2931364.93it/s]
100%|██████████| 1829/1829 [00:00<00:00, 2354629.23it/s]
100%|██████████| 1829/1829 [00:00<00:00, 1943598.18it/s]
100%|██████████| 1829/1829 [00:00<00:00, 2444672.41it/s]
100%|██████████| 1829/1829 [00:00<00:00, 2543561.68it/s]
100%|██████████| 1829/1829 [00:00<00:00, 2195587.30it/s]
100%|██████████| 1829/1829 [00:00<00:00, 1640235.62it/s]
100%|██████████| 1829/1829 [00:00<00:00, 2705954.86it/s]
100%|██████████| 1829/1829 [00:00<00:00, 2353906.72it/s]
100%|██████████| 1829/1829 [00:00<00:00, 2788579.43it/s]
100%|██████████| 1829/1829 [00:

CPU times: user 1.24 s, sys: 61.3 ms, total: 1.3 s
Wall time: 1.39 s


In [0]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.5013698630136987
Testing F1 score: 0.5059975837574548


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
