In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os, string, collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import utils
from utils import *

from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn import metrics
from keras.wrappers.scikit_learn import KerasClassifier

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, load_model
from keras.layers.embeddings import Embedding
from keras.layers import Flatten, Dense, Dropout, Convolution1D, MaxPooling1D, SpatialDropout1D, Input 
from keras.layers import GlobalMaxPooling1D, concatenate, LSTM, Bidirectional
from keras.optimizers import Adam
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [4]:
PATH = os.getcwd()

In [5]:
# df = pd.read_csv(f'{PATH}/data/Airline-Sentiment-2-w-AA.csv', usecols=['text', 'airline_sentiment'], encoding='ISO-8859-1')

In [8]:
df = pd.read_csv("data.csv", usecols=['Tweet Text', 'Medical relevance'])

In [9]:
df.shape

(2099, 2)

Encode categorical label class into numerical

In [10]:
le = LabelEncoder()
df['target'] = le.fit_transform(df['Medical relevance'])

Text cleaning

In [11]:
tc = TextCleaner()
df['clean_text'] = tc.transform(df['Tweet Text'])

Tokenization

In [12]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s): 
    return re_tok.sub(r' \1 ', s).split()

In [13]:
df['tokenized'] = df['clean_text'].apply(lambda row: tokenize(row))

Stopwords removing

In [15]:
stop = set(stopwords.words('english'))
stop.update(['amp', 'rt', 'cc'])
stop = stop - set(['no', 'not'])

In [16]:
def remove_stopwords(row):
    return [t for t in row if t not in stop]

In [17]:
df['tokenized'] = df['tokenized'].apply(lambda row: remove_stopwords(row))

In [18]:
pd.set_option('display.max_colwidth', -1)

In [19]:
df[['Tweet Text', 'tokenized']].head()

Unnamed: 0,Tweet Text,tokenized
0,RT @JarrodTheLord: Yall creating life and cloning animals but want us to believe there is no cure for Cancer or Aids... alright https://t.c‰Û¡ÌÝ_,"[creating, life, cloning, animals, want, us, believe, no, cure, cancer, aids, alright]"
1,RT @TommySobiesski: you could give GameStop the cure to cancer &amp; they‰Û¡ÌÝå»d offer you $3.89 https://t.co/REjIez4G7D,"[could, give, gamestop, cure, cancer, theyd, offer]"
2,"This last year has been the hardest of my life - to all the cancer researchers, doctors, nurses and caregivers who are so selflessly dedicated to finding a cure and healing patients, you are my heroes and I am grateful for the hope and strength you continue to give me &amp; my family","[last, year, hardest, life, cancer, researchers, doctors, nurses, caregivers, selflessly, dedicated, finding, cure, healing, patients, heroes, grateful, hope, strength, continue, give, family]"
3,The year is 2043. You opened your newspaper and read the headlines- - World Health Organization releases a cure for Cancer. - Global warming threat- eliminated! - Enrile celebrating his birthday. - Penguins can fly. .. and you've never been so happy.,"[year, opened, newspaper, read, headlines, world, health, organization, releases, cure, cancer, global, warming, threat, eliminated, enrile, celebrating, birthday, penguins, fly, youve, never, happy]"
4,RT @gorskon: Here we go again. Yet another dubious cancer cure video. Watch to the end and see! https://t.co/JObs8QeYiS,"[go, yet, another, dubious, cancer, cure, video, watch, end, see]"


Vocabulary creation

In [20]:
def update_vocab_counter(row):
    for word in row:
        vocab_counter[word] += 1

In [21]:
vocab_counter = collections.Counter()
df['tokenized'].apply(update_vocab_counter);
vocab = sorted(vocab_counter, key=vocab_counter.get, reverse=True)

In [22]:
len(vocab)

8974

We limit the dictionary size to the top 5000 most frequent tokens

In [23]:
max_words = 5000

Dictionary that map each token with their id

In [24]:
w2id = {w:i for i, w in enumerate(vocab[:max_words])}

We will replace each token out of top 5000 with 'unk'

In [25]:
w2id['unk'] = -1

We transform each token by their id

In [26]:
w2id["full"]

287

In [27]:
def transform_to_ids(row):
    return [w2id[w] if w in w2id else w2id['unk'] for w in row]

In [28]:
df['tokenized_int'] = df['tokenized'].apply(lambda x: transform_to_ids(x))

Tweets length

In [29]:
lens = df['tokenized_int'].apply(lambda x: len(x))

In [30]:
min(lens), max(lens), np.mean(lens)

(2, 66, 21.681753215817057)

We set 20 as max length

In [31]:
maxlen = 60

Train, test split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(df['tokenized_int'].values, df['target'].values, test_size=0.2, random_state=0)

Since we need that each document contains a fixed number of tokens (20), we fill with -1 (id that represents 'unk') every token with size < 20

In [33]:
x_train = pad_sequences(X_train, maxlen=maxlen, value=-1)
x_test = pad_sequences(X_test, maxlen=maxlen, value=-1)

We one-hot encode target classes

In [34]:
dummy_y = np_utils.to_categorical(y_train)
dummy_y_test = np_utils.to_categorical(y_test)

In [65]:
x_train.shape

(1574, 60)

In [66]:
x_test.shape

(525, 60)

In [67]:
len(y_train)

1574

In [129]:
from sklearn.tree import DecisionTreeClassifier

In [130]:
clf = DecisionTreeClassifier(random_state=0)

In [131]:
cross_val_score(clf, x_train, y_train, cv=10)

array([0.77380952, 0.79761905, 0.78571429, 0.75      , 0.72619048,
       0.76190476, 0.8452381 , 0.75      , 0.76190476, 0.77245509])

In [132]:
clf.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [133]:
y_pred = clf.predict(x_test)

In [49]:
 from sklearn.metrics import confusion_matrix

In [136]:
confusion_matrix(y_test, y_pred)

array([[309,  59],
       [ 34,  18]], dtype=int64)

In [54]:
from sklearn.metrics import accuracy_score

In [139]:
accuracy_score(y_test, y_pred)

0.7785714285714286

In [89]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [90]:
# fit model no training data
model = XGBClassifier()
model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [93]:
len(predictions)

525

In [97]:
np.sum(predictions == 0)

0

In [96]:
y_pred[:4]

array([0, 0, 0, 0], dtype=int64)

In [40]:
from keras.models import Sequential
from keras.layers import Dense, LSTM

In [45]:
model = Sequential()
model.add(Embedding(20000, 100, input_length=60))
model.add(LSTM(60, dropout=0.4, recurrent_dropout=0.4))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [46]:
# data_token = df['tokenized']
# data_x = pad_sequences(sequences, maxlen=50)

In [47]:
model.fit(x_train , y_train, validation_split=0.2, epochs=20)

Train on 1343 samples, validate on 336 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f6415b20550>

In [48]:
pred_y_lstm = model.predict(x_test)

In [52]:
pred_y_lstm = np.round(pred_y_lstm)

In [53]:
confusion_matrix(y_test, pred_y_lstm)

array([[351,  17],
       [ 28,  24]])

In [55]:
accuracy_score(y_test,pred_y_lstm)

0.8928571428571429