In [1]:
from preprocessing import *
from training import *
from fastai.layers import CrossEntropyFlat
import pickle
from fastai.text import get_text_classifier
import dill
from fastai.basic_data import load_data

In [2]:
def save(learner, path) :
    """Write a training checkpoint for `learner` to `path`.

    The checkpoint is a dict with two entries: 'model' holds
    learner.model.state_dict() and 'opt' holds learner.opt.state_dict().
    It is serialized with torch.save.
    """
    checkpoint = dict(
        model=learner.model.state_dict(),
        opt=learner.opt.state_dict(),
    )
    torch.save(checkpoint, path)

In [3]:
def load(learner, path) :
    """Restore a checkpoint written by `save` into `learner`.

    Reads the dict stored at `path` with torch.load and loads its 'model'
    entry into learner.model and its 'opt' entry into learner.opt.

    The state is applied to the `learner` argument itself, not to whatever
    object happens to be bound to a notebook-level name, so the function
    works for any learner and on a fresh kernel.
    """
    state = torch.load(path)
    learner.model.load_state_dict(state['model'])
    learner.opt.load_state_dict(state['opt'])

# Language model tuning

In [4]:
# Build the language-model corpus from the 'tweet' column of the training CSV.
# The split rule handed to SplitData is random_splitter with pctg=0.2;
# presumably that holds out roughly 20% of the items for validation -- TODO confirm.
tl = TextList.from_csv('data/train_full_m_shuffled.csv', 'tweet')
sd = SplitData.split_by_func(tl, partial(random_splitter, pctg=0.2))

In [3]:
# Input processors: a tokenizer (max_workers=8) and a numericalizer.
# No vocab is passed here; proc_num.vocab is read after labelling, so the
# vocabulary is presumably built while the data is processed -- TODO confirm.
proc_tok,proc_num = TokenizeProcessor(max_workers=8),NumericalizeProcessor()

In [4]:
# Every item gets the constant label 0: a dummy target, presumably because the
# language model is trained on the text alone -- TODO confirm.
# proc_x passes the tokenizer and the numericalizer as the input processors.
ll = label_by_func(sd, lambda x: 0, proc_x = [proc_tok,proc_num])

HBox(children=(IntProgress(value=0, max=1001), HTML(value='')))




HBox(children=(IntProgress(value=0, max=250), HTML(value='')))




In [5]:
# Cache the processed language-model data and the vocabulary on disk so later
# sessions can skip tokenization. The files are opened in `with` blocks so the
# handles are flushed and closed as soon as each dump finishes.
with open('data/ll_lm.pkl', 'wb') as f:
    pickle.dump(ll, f)
with open('data/vocab_lm.pkl', 'wb') as f:
    pickle.dump(proc_num.vocab, f)

In [6]:
# Reload the cached language-model data and vocabulary.
# pickle.load can execute arbitrary code: only load files produced by this
# notebook, never pickles from an untrusted source.
with open('data/ll_lm.pkl', 'rb') as f:
    ll = pickle.load(f)
with open('data/vocab_lm.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [3]:
# Both values are forwarded to get_lm_dls; presumably the batch size and the
# back-propagation-through-time sequence length -- TODO confirm.
bs,bptt = 64,70

In [4]:
train_dl, valid_dl = get_lm_dls(ll.train, ll.valid, bs, bptt)

In [5]:
data = Databunch(train_dl, valid_dl)

In [6]:
# Build the language model for this vocabulary. Judging by its name,
# load_pretrained_lm starts from pretrained weights -- TODO confirm how tokens
# missing from the pretrained vocabulary are initialised.
model = load_pretrained_lm(vocab)

In [7]:
# Learning rate handed to Adam.
lr = 0.01

# Adam over the parameter groups returned by get_model_param_groups(model);
# presumably one group per layer block so freeze_to() can act on them -- TODO confirm.
opt = torch.optim.Adam(get_model_param_groups(model), lr=lr)

# Loss function (CrossEntropyFlat is imported from fastai.layers at the top).
loss_func = CrossEntropyFlat()

# Bundle model, optimizer, loss function and data into a Learner.
learn = Learner(model, opt, loss_func, data)

In [8]:
# First stage of gradual unfreezing.
# NOTE(review): presumably freeze_to(-1) leaves only the last parameter group
# trainable (the fastai convention) -- confirm against the Learner implementation.
learn.freeze_to(-1)

In [9]:
# Train with fit_awd_lstm. The first argument (1) is presumably the number of
# epochs: the log below reports a single "Epoch 0".
info = fit_awd_lstm(1, learn)

HBox(children=(IntProgress(value=0, description='Epoch 0 training', max=10050, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Epoch 0 validation', max=1253, style=ProgressStyle(descriptio…

Epoch 0 training loss : 4.204, train accuracy : 0.349, validation loss : 3.701, valid accuracy : 0.391


In [13]:
# Checkpoint the model and optimizer state dicts after the first stage.
save(learn, 'data/my_models/lm_1_epoch.pkl')

In [10]:
# Second stage of gradual unfreezing.
# NOTE(review): presumably the last two parameter groups become trainable -- confirm.
learn.freeze_to(-2)

In [23]:
info = fit_awd_lstm(1, learn)

HBox(children=(IntProgress(value=0, description='Epoch 0 training', max=10050, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Epoch 0 validation', max=1253, style=ProgressStyle(descriptio…

Epoch 0 training loss : 3.650, train accuracy : 0.401, validation loss : 3.437, valid accuracy : 0.422


In [24]:
save(learn, 'data/my_models/lm_2_epoch.pkl')

In [25]:
# Final stage: presumably every parameter group becomes trainable -- confirm.
learn.unfreeze()

In [26]:
info = fit_awd_lstm(1, learn)

HBox(children=(IntProgress(value=0, description='Epoch 0 training', max=10050, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Epoch 0 validation', max=1253, style=ProgressStyle(descriptio…

Epoch 0 training loss : 3.381, train accuracy : 0.429, validation loss : 3.263, valid accuracy : 0.442


In [27]:
save(learn, 'data/my_models/lm_3_epoch.pkl')

# ---


In [16]:
# Export the fine-tuned language model's encoder. The classifier cells below
# read this same path with load_encoder_clas.
save_encoder_lm(learn.model, 'data/my_encoders/lm_1enco.pth')

# Classifier

### On sample

In [4]:
# Load the training CSV; the cells below read its 'tweet' and 'label' columns.
df = pd.read_csv('data/train_full_m_shuffled.csv')

In [5]:
# Draw a reproducible 5% sample of the rows: seed NumPy's global RNG, permute
# the row positions and keep the first int(pc * n_rows) of them.
pc = 0.05
np.random.seed(23)
permu = np.random.permutation(df.shape[0])
idxs = permu[:int(pc*df.shape[0])]
df_sample = df.iloc[idxs]

In [6]:
df_sample.shape[0]

125000

In [7]:
# Reuse the vocabulary saved during language-model preparation.
# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open('data/vocab_lm.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [8]:
# Wrap the sampled tweets in a TextList and split it with random_splitter
# (pctg=0.2; presumably ~20% of the items go to validation -- TODO confirm).
tl_sample = TextList.from_df(df_sample, 'tweet')
sd = SplitData.split_by_func(tl_sample, partial(random_splitter, pctg=0.2))

In [9]:
# The numericalizer is given the language-model vocabulary, presumably so the
# token ids match the ones the fine-tuned encoder was trained on -- TODO confirm.
proc_tok,proc_num = TokenizeProcessor(max_workers=8),NumericalizeProcessor(vocab=vocab)

In [10]:
# Map each tweet text to its label. The two columns are iterated as arrays,
# which avoids a pair of positional .iloc lookups per row (the dominant cost
# of a row-by-row loop on a large frame). If the same tweet text occurs more
# than once, the label of its last occurrence wins.
tweet_to_label = {}
it = tqdm_notebook(zip(df_sample['tweet'].values, df_sample['label'].values), total=df_sample.shape[0])
for tweet, label in it :
    tweet_to_label[tweet] = label

HBox(children=(IntProgress(value=0, max=125000), HTML(value='')))




In [11]:
# Label each item by looking it up in tweet_to_label. This assumes the value
# passed to the labelling function is the raw tweet string used as the dict
# key; any item missing from the dict raises KeyError.
ll_sample = label_by_func(sd, lambda x: tweet_to_label[x], proc_x = [proc_tok,proc_num])

HBox(children=(IntProgress(value=0, max=51), HTML(value='')))




HBox(children=(IntProgress(value=0, max=13), HTML(value='')))




In [12]:
# Cache the processed sample so tokenization does not have to be repeated.
# The `with` block closes (and flushes) the file once the dump is done.
with open('data/ll_clas_sample.pkl', 'wb') as f:
    pickle.dump(ll_sample, f)

In [20]:
# Reload the cached classifier sample and the language-model vocabulary.
# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open('data/ll_clas_sample.pkl', 'rb') as f:
    ll_sample = pickle.load(f)
with open('data/vocab_lm.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [21]:
# Classifier dataloaders for the sample; 64 is presumably the batch size -- TODO confirm.
# NOTE(review): `traind_dl` looks like a typo for `train_dl`; the next cell
# uses the same spelling, so the name is left as is.
traind_dl, valid_dl = get_clas_dls(ll_sample.train, ll_sample.valid, 64)

In [22]:
data = Databunch(traind_dl, valid_dl)

In [23]:
# fastai text classifier on the AWD_LSTM architecture, sized to the
# language-model vocabulary, with 2 output classes.
model = get_text_classifier(AWD_LSTM, len(vocab), 2)

In [24]:
# Load the encoder exported by save_encoder_lm (same path) into the classifier.
load_encoder_clas(model, 'data/my_encoders/lm_1enco.pth')

In [25]:
# Learning rate handed to Adam.
lr = 0.01

# Adam over the parameter groups returned by get_class_model_param_groups(model);
# presumably one group per layer block so freeze_to() can act on them -- TODO confirm.
opt = torch.optim.Adam(get_class_model_param_groups(model), lr=lr)

# Loss function (CrossEntropyFlat is imported from fastai.layers at the top).
loss_func = CrossEntropyFlat()

# Bundle classifier, optimizer, loss function and the sample data into a Learner.
learn = Learner(model, opt, loss_func, data)

In [26]:
# Baseline before any classifier training: the log below reports loss 0.693
# and accuracy 0.496.
validate(learn, lm=False)

HBox(children=(IntProgress(value=0, description='Validation', max=195, style=ProgressStyle(description_width='…

Loss : 0.693, Accuracy : 0.496


In [27]:
# Start of gradual unfreezing for the classifier; the following cells call
# freeze_to(-2), freeze_to(-3) and unfreeze(), each followed by one fit.
# NOTE(review): presumably freeze_to(-1) trains only the last parameter group -- confirm.
learn.freeze_to(-1)

In [28]:
info = fit_awd_lstm(1, learn, lm=False)

HBox(children=(IntProgress(value=0, description='Epoch 0 training', max=1565, style=ProgressStyle(description_…

HBox(children=(IntProgress(value=0, description='Epoch 0 validation', max=195, style=ProgressStyle(description…

Epoch 0 training loss : 0.518, train accuracy : 0.740, validation loss : 0.417, valid accuracy : 0.803


In [29]:
learn.freeze_to(-2)

In [30]:
info = fit_awd_lstm(1, learn, lm=False)

HBox(children=(IntProgress(value=0, description='Epoch 0 training', max=1565, style=ProgressStyle(description_…

HBox(children=(IntProgress(value=0, description='Epoch 0 validation', max=195, style=ProgressStyle(description…

Epoch 0 training loss : 0.443, train accuracy : 0.794, validation loss : 0.350, valid accuracy : 0.843


In [31]:
learn.freeze_to(-3)

In [32]:
info = fit_awd_lstm(1, learn, lm=False)

HBox(children=(IntProgress(value=0, description='Epoch 0 training', max=1565, style=ProgressStyle(description_…

HBox(children=(IntProgress(value=0, description='Epoch 0 validation', max=195, style=ProgressStyle(description…

Epoch 0 training loss : 0.411, train accuracy : 0.816, validation loss : 0.335, valid accuracy : 0.847


In [33]:
learn.unfreeze()

In [34]:
info = fit_awd_lstm(1, learn, lm=False)

HBox(children=(IntProgress(value=0, description='Epoch 0 training', max=1565, style=ProgressStyle(description_…

HBox(children=(IntProgress(value=0, description='Epoch 0 validation', max=195, style=ProgressStyle(description…

Epoch 0 training loss : 0.392, train accuracy : 0.826, validation loss : 0.331, valid accuracy : 0.853


### On full data

In [35]:
df = pd.read_csv('data/train_full_m_shuffled.csv')

In [40]:
# Same preparation as for the sample, now on the whole frame: wrap the 'tweet'
# column in a TextList and split it with random_splitter (pctg=0.2).
tl = TextList.from_df(df, 'tweet')
sd = SplitData.split_by_func(tl, partial(random_splitter, pctg=0.2))

In [37]:
# Reuse the vocabulary saved during language-model preparation.
# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open('data/vocab_lm.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [38]:
# Tokenizer (max_workers=8) and a numericalizer bound to the language-model vocabulary.
proc_tok,proc_num = TokenizeProcessor(max_workers=8),NumericalizeProcessor(vocab=vocab)

In [36]:
# Map each tweet text to its label for the full frame. The two columns are
# iterated as arrays, which avoids a pair of positional .iloc lookups per row
# (the dominant cost of a row-by-row loop over millions of rows). If the same
# tweet text occurs more than once, the label of its last occurrence wins.
tweet_to_label = {}
it = tqdm_notebook(zip(df['tweet'].values, df['label'].values), total=df.shape[0])
for tweet, label in it :
    tweet_to_label[tweet] = label

HBox(children=(IntProgress(value=0, max=2500000), HTML(value='')))

In [41]:
# Label each item by looking it up in tweet_to_label. This assumes the value
# passed to the labelling function is the raw tweet string used as the dict
# key; any item missing from the dict raises KeyError.
ll = label_by_func(sd, lambda x: tweet_to_label[x], proc_x = [proc_tok,proc_num])

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=251), HTML(value='')))

In [42]:
# Cache the full-data labelled lists. The object written must be `ll` (built
# from the whole frame in the cell above), not `ll_sample`: 'data/ll_clas.pkl'
# is read back as the full dataset by the next cell.
with open('data/ll_clas.pkl', 'wb') as f:
    pickle.dump(ll, f)

In [6]:
# Reload the cached full-data labelled lists and the language-model vocabulary.
# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open('data/ll_clas.pkl', 'rb') as f:
    ll = pickle.load(f)
with open('data/vocab_lm.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [5]:
traind_dl, valid_dl = get_clas_dls(ll.train, ll.valid, 64)

In [4]:
# Load a previously saved data object named 'data_cals_wt_2_james' from the
# 'data' directory with fastai's load_data.
# NOTE(review): nothing in this notebook creates that file -- document where it comes from.
data_cals_wt = load_data('data', 'data_cals_wt_2_james')

In [15]:
# Databunch over the dataloaders built from `ll`.
# NOTE(review): the Learner further down is constructed with data_cals_wt, not
# with this `data` object -- confirm which one is meant to be used.
data = Databunch(traind_dl, valid_dl)

In [7]:
model = get_text_classifier(AWD_LSTM, len(vocab), 2)

In [8]:
load_encoder_clas(model, 'data/my_encoders/lm_1enco.pth')

In [9]:
# Pickle the whole classifier object (encoder already loaded) to disk.
# The `with` block closes (and flushes) the file once the dump is done.
# NOTE(review): the file carries a .pth extension but holds a pickle of the
# full model, not a state dict -- keep that in mind when loading it.
with open('data/enc.pth', 'wb') as f:
    pickle.dump(model, f)

In [10]:
# Learning rate handed to Adam.
lr = 0.01

# Adam over the parameter groups returned by get_class_model_param_groups(model);
# presumably one group per layer block so freeze_to() can act on them -- TODO confirm.
opt = torch.optim.Adam(get_class_model_param_groups(model), lr=lr)

# Loss function (CrossEntropyFlat is imported from fastai.layers at the top).
loss_func = CrossEntropyFlat()

# NOTE(review): the data passed here is data_cals_wt (loaded from disk with
# load_data), not the `data` Databunch built from traind_dl/valid_dl -- confirm
# this is intended.
learn = Learner(model, opt, loss_func, data_cals_wt)

In [13]:
len(data_cals_wt.valid_dl)

40

In [11]:
validate(learn, lm=False)

HBox(children=(IntProgress(value=0, description='Validation', max=40, style=ProgressStyle(description_width='i…

Loss : 0.696, Accuracy : 0.506


In [16]:
 [t.shape for t in learn.opt.param_groups[3]['params']]

[torch.Size([1600, 400]),
 torch.Size([1600, 1150]),
 torch.Size([1600, 400]),
 torch.Size([1600]),
 torch.Size([1600])]

In [14]:
learn.freeze_to(-1)

In [15]:
info = fit_awd_lstm(1, learn, lm=False)

HBox(children=(IntProgress(value=0, description='Epoch 0 training', max=39023, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Epoch 0 validation', max=40, style=ProgressStyle(description_…

Epoch 0 training loss : 0.508, train accuracy : 0.748, validation loss : 0.380, valid accuracy : 0.822


In [51]:
#save(learn, 'data/models/my_class_1epoch.pth')

In [21]:
#load(learn, 'data/models/my_class_1epoch.pth')

In [22]:
validate(learn, lm=False)

HBox(children=(IntProgress(value=0, description='Validation', max=3908, style=ProgressStyle(description_width=…

Loss : 0.642, Accuracy : 0.632


In [16]:
learn.freeze_to(-2)

In [None]:
info = fit_awd_lstm(1, learn, lm=False)

HBox(children=(IntProgress(value=0, description='Epoch 0 training', max=39023, style=ProgressStyle(description…

In [54]:
#save(learn, 'data/models/my_class_2epoch.pth')

In [57]:
load(learn, 'data/models/my_class_2epoch.pth')

In [59]:
validate(learn, lm=False)

HBox(children=(IntProgress(value=0, description='Validation', max=3910, style=ProgressStyle(description_width=…

Loss : 0.317, Accuracy : 0.862


In [60]:
learn.freeze_to(-3)

In [62]:
info = fit_awd_lstm(1, learn, lm=False, n_max=0.001)

HBox(children=(IntProgress(value=0, description='Epoch 0 training', max=31245, style=ProgressStyle(description…

HBox(children=(IntProgress(value=0, description='Epoch 0 validation', max=3910, style=ProgressStyle(descriptio…

Epoch 0 training loss : 0.371, train accuracy : 0.838, validation loss : 0.302, valid accuracy : 0.869


In [63]:
#save(learn, 'data/models/my_class_3epoch.pth')

In [None]:
load(learn, 'data/models/my_class_3epoch.pth')

In [64]:
learn.unfreeze()

In [65]:
info = fit_awd_lstm(1, learn, lm=False, n_max=0.001)

HBox(children=(IntProgress(value=0, description='Epoch 0 training', max=31245, style=ProgressStyle(description…

HBox(children=(IntProgress(value=0, description='Epoch 0 validation', max=3910, style=ProgressStyle(descriptio…

Epoch 0 training loss : 0.353, train accuracy : 0.848, validation loss : 0.301, valid accuracy : 0.870


In [67]:
save(learn, 'data/models/my_class_4epoch.pth')

In [10]:
load(learn, 'data/models/my_class_4epoch.pth')

In [11]:
# Inspect optimizer parameter group 4 after reloading the checkpoint; the
# output shows its lr and Adam settings.
# NOTE(review): the displayed dict also contains every parameter tensor of the
# group, which makes the output very long -- consider showing only the
# hyper-parameters.
learn.opt.param_groups[4]

{'lr': 3.125e-05,
 'betas': (0.9, 0.999),
 'eps': 1e-08,
 'weight_decay': 0,
 'amsgrad': False,
 'params': [Parameter containing:
  tensor([ 0.2782, -1.3711, -0.3493,  ...,  2.9088,  3.7730,  2.4020],
         requires_grad=True), Parameter containing:
  tensor([-0.9768,  1.5762, -0.5757,  ...,  0.5209,  0.0652,  0.6209],
         requires_grad=True), Parameter containing:
  tensor([[ 0.1745,  0.7057, -0.4060,  ..., -1.6285, -1.3912,  0.1869],
          [-1.3980,  0.2979,  0.6931,  ...,  1.4607,  0.3595,  1.1824],
          [-0.2787,  0.1445, -0.3769,  ...,  0.6551, -0.8367,  0.6143],
          ...,
          [-1.5518,  1.0422, -0.7327,  ..., -3.7855, -0.6925, -2.3094],
          [-0.7724,  1.6011, -0.7478,  ..., -0.6108,  0.2634,  2.2824],
          [-0.0216,  0.4520,  1.4221,  ..., -0.4907,  0.0192, -1.0327]],
         requires_grad=True), Parameter containing:
  tensor([-0.5904,  0.9099,  2.2925, -0.5473, -2.3628, -0.9704,  0.7392,  1.0951,
          -0.8352, -0.0456, -0.3574, -1.86