In [670]:
import torch
from torch import nn
from torch import optim
import utils
import pandas as pd
import datetime
import ciso8601
import time
import numpy as np
from sklearn import model_selection

In [671]:
path = "~/Downloads/coronavirus_data/"

In [672]:
contacts = pd.read_csv('../contacts_15meters_30sec.csv')

In [673]:
targets =  pd.read_csv(path + 'infusr.csv')

In [674]:
def get_timestamp(date_string):
    ts = ciso8601.parse_datetime(date_string)
    return int(time.mktime(ts.timetuple()))
START_DATE = get_timestamp("2020-04-13T23:59:59.000000")
END_DATE = get_timestamp("2020-04-24T23:59:59.000000")
DAY_LENGTH = 86400
# уникальные юзеры
unique_users = pd.unique(contacts['user1'])
# Формируем сетку
time_grid = np.arange(START_DATE, END_DATE, DAY_LENGTH)

In [675]:
targets = targets.rename(columns={'user_id': 'user2', 'time': 'contact_time'})

In [676]:
contacts_w_targets = contacts.join(targets.set_index('user2'), on='user2', how='left')

In [677]:
def add_to_learning_grid(dict_, time, user1, user2):
    key = user1 + '/' + str(time)
    if key not in dict_.keys():
        dict_[key] = set()
    dict_[key].add(user2)

In [678]:
contacts_dict = {}
infected_dict = {}
for i, row in contacts_w_targets.iterrows():
    time_hash = row['time']
    time = time_hash*30
    user1 = row['user1']
    user2 = row['user2']
    close_time_elems = time_grid[((time_grid < time + 7*DAY_LENGTH) & (time_grid > time))]
    for time_elem in close_time_elems:
        add_to_learning_grid(contacts_dict, time_elem, user1, user2)
    contact_time = row['contact_time']
    if contact_time == contact_time:
        close_time_elems_before = time_grid[
            ((time < time_grid + 5*DAY_LENGTH) & (time_grid < time))
        ]
        close_time_elems_after = time_grid[
            ((time_grid < time + 3*DAY_LENGTH) & (time_grid > time))
        ]
        for time_elem in close_time_elems_before:
            add_to_learning_grid(infected_dict, time_elem, user1, user2)
        for time_elem in close_time_elems_after:
            add_to_learning_grid(infected_dict, time_elem, user1, user2)
    

In [679]:
learning_list = []
for time_step in time_grid:
    for user in unique_users:
        dict_elem = {'user_id': user, 'grid_time': time_step}
        key = user + '/' + str(time_step)
        if key in contacts_dict.keys():
            dict_elem['contacts_num'] = len(contacts_dict[key])
        else:
            dict_elem['contacts_num']  = 0
        if key in infected_dict.keys():
            dict_elem['infected_contacts_num'] = len(infected_dict[key])
        else:
            dict_elem['infected_contacts_num']  = 0
        learning_list.append(dict_elem)

In [680]:
learning_data = pd.DataFrame(learning_list)

In [681]:
learning_data.shape

(2750, 4)

In [682]:
targets =  pd.read_csv(path + 'infusr.csv')

In [683]:
learning_data = learning_data.join(targets.set_index('user_id'), on='user_id', how='left')

In [684]:
def calc_target(inf_time, grid_time):
    if inf_time == inf_time:
        if 0 <= grid_time - inf_time <= 5*DAY_LENGTH:
            return 1
        if 0 < inf_time - grid_time <= 3*DAY_LENGTH:
            return 1
    return 0

In [685]:
learning_data['target'] = learning_data.apply(
    lambda x: calc_target(x['time'], x['grid_time']),  axis=1
)

In [686]:
def calc_p_t(contacts_num):
    logsigmoid = nn.LogSigmoid()
    return logsigmoid(torch.tensor(contacts_num/8)).item()

In [687]:
learning_data['log_p_t'] = learning_data.apply(
    lambda x: calc_p_t(x['contacts_num']),  axis=1
)

In [688]:
def calc_prev_p_t(user_id, grid_time):
    prev_time = grid_time - DAY_LENGTH
    row = learning_data[(
        (learning_data['grid_time'] == prev_time) & 
        (learning_data['user_id'] == user_id))]
    if row.shape[0] > 0:
        return row['log_p_t'].item()
    return 0

In [689]:
learning_data['prev_log_p_t'] = learning_data.apply(
    lambda x: calc_prev_p_t(x['user_id'], x['grid_time']),  axis=1
)

In [690]:
features = learning_data[[el for el in learning_data.columns if el != 'target']]

In [691]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(
features, learning_data['target'], test_size=0.3
)

In [692]:
learning_data_new = x_train.copy()
learning_data_new['target'] =y_train 

In [693]:
learning_data_new = learning_data_new[
    ((learning_data_new['contacts_num'] > 0) | (learning_data_new['target'] == 1))]

In [694]:
learning_data_new.shape

(1150, 8)

In [696]:
from sklearn.utils import shuffle
learning_data_new = shuffle(learning_data_new)

In [697]:
def yield_batches(batch_size, dataset):
    datast_size = dataset.shape[0]
    step = np.arange(0, datast_size, batch_size)
    for i in range(1, len(step)):
        slice_ = dataset.iloc[step[i-1]:step[i]]
        yield torch.FloatTensor(slice_[['contacts_num', 'infected_contacts_num' ]
        ].values), torch.FloatTensor(slice_['prev_log_p_t'].values),torch.LongTensor(
            slice_['target'].values)

In [717]:
class LogReg(nn.Module):
    def __init__(self, input_dim):
        super(LogReg, self).__init__()
        self.register_parameter(name='bias',
                                param=torch.nn.Parameter(torch.randn(1, 1)))
        self.register_parameter(name='weight', param=torch.nn.Parameter(
            torch.randn(input_dim, 1)))
        self.logsigmoid = nn.LogSigmoid()

    def forward(self, input_, logp):
        weight = torch.exp(self.weight)
        bias = torch.exp(self.bias)
        out = input_.matmul(weight) - bias + logp.unsqueeze(1)
        out_positive = self.logsigmoid(out)
        out_negative = self.logsigmoid(out) - out

        return torch.cat([out_negative, out_positive], dim=1)

In [718]:
class CustomLoss(nn.Module):
    def __init__(self, weight):
        super(CustomLoss, self).__init__()
        self.weight = weight.unsqueeze(0)

    def forward(self, input_, target):
        target = target.unsqueeze(1)
        targets = torch.cat([target, 1 - target], dim=1).float()
        return (-input_ * targets*self.weight).sum()

In [719]:
net = LogReg(2)

In [720]:
net.weight

Parameter containing:
tensor([[ 1.5300],
        [-0.2431]], requires_grad=True)

In [721]:
iterator = yield_batches(8, learning_data_new)

In [722]:
y = torch.randn(8, 2)
logp = torch.randn(8)
pred = net(y, logp)
target = torch.tensor([1, 0 ,1, 0, 1, 0, 1, 0])
CustomLoss(weight=torch.FloatTensor([1, 6]))(pred, target)

tensor(62.0054, grad_fn=<SumBackward0>)

In [723]:
def recalc_log_p(net):
    learning_data_old['log_p_t'] = pd.Series(net(
        torch.FloatTensor(x_train[['contacts_num', 'infected_contacts_num']].values),
        torch.FloatTensor(x_train['prev_log_p_t'].values)
    ).detach().numpy()[:, 1])
    learning_data_new['prev_log_p_t'] = x_train.apply(
    lambda x: calc_prev_p_t(x['user_id'], x['grid_time']),  axis=1
)

In [724]:
PRINT_TEMPLATE = (
    'iteration={iteration:}\tavg_loss={avg_loss:.5f}\t'
)

net.train()
training_logs = []
avg_loss = 0
learning_rate = 0.05
loss_function = CustomLoss(weight=torch.FloatTensor([1, 1]))
print_every = 3
optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate)
iteration = 0
for  feats, logp, targets in iterator:
    optimizer.zero_grad()
    predictions = net(feats, logp)
    loss = loss_function(predictions, target)
    loss.backward()
    optimizer.step()
    avg_loss += loss.item()
    recalc_log_p(net)
    if iteration % print_every == print_every - 1:
        print(PRINT_TEMPLATE.format(
            iteration=iteration,
            avg_loss=avg_loss/print_every
        ))
        avg_loss = 0
    iteration += 1

iteration=2	avg_loss=9.64565	
iteration=5	avg_loss=6.25140	
iteration=8	avg_loss=8.21641	
iteration=11	avg_loss=5.78216	
iteration=14	avg_loss=5.74387	
iteration=17	avg_loss=6.48955	
iteration=20	avg_loss=6.53631	
iteration=23	avg_loss=5.68565	
iteration=26	avg_loss=5.26053	
iteration=29	avg_loss=6.18987	
iteration=32	avg_loss=6.29299	
iteration=35	avg_loss=5.96458	
iteration=38	avg_loss=5.77903	
iteration=41	avg_loss=5.75628	
iteration=44	avg_loss=6.09969	
iteration=47	avg_loss=5.40546	
iteration=50	avg_loss=5.42493	
iteration=53	avg_loss=5.76015	
iteration=56	avg_loss=5.34993	
iteration=59	avg_loss=7.70770	
iteration=62	avg_loss=5.48883	
iteration=65	avg_loss=6.04001	
iteration=68	avg_loss=5.56634	
iteration=71	avg_loss=6.99431	
iteration=74	avg_loss=5.37838	
iteration=77	avg_loss=6.57014	
iteration=80	avg_loss=5.65762	
iteration=83	avg_loss=10.07633	
iteration=86	avg_loss=6.47464	
iteration=89	avg_loss=5.78040	
iteration=92	avg_loss=5.80271	
iteration=95	avg_loss=5.77408	
iteration=

In [725]:
p_t_test = pd.Series(torch.exp(net(
        torch.FloatTensor(x_test[['contacts_num', 'infected_contacts_num']].values),
        torch.FloatTensor(x_test['prev_log_p_t'].values)
    )).detach().numpy()[:, 1])

In [654]:
learning_data_old['p_t'] = pd.Series(torch.exp(net(
        torch.FloatTensor(learning_data_old[['contacts_num', 'infected_contacts_num']].values),
        torch.FloatTensor(learning_data_old['prev_log_p_t'].values)
    )).detach().numpy()[:, 1])

In [726]:
targets = y_test.values
predictions = p_t_test.values

In [727]:
for treshold in np.arange(0, 1, 0.05):
    tp = ((targets == 1) & (predictions > treshold )).sum()
    fp = ((targets == 0) & (predictions > treshold)).sum()
    tn = ((targets == 0) & (predictions < treshold )).sum()
    fn = ((targets == 1) & (predictions < treshold )).sum()
    print('treshold:', treshold, 'precision:',  tp/(tp + fp), 'recall:',  tp/(tp + fn))

treshold: 0.0 precision: 0.10909090909090909 recall: 1.0
treshold: 0.05 precision: 0.10909090909090909 recall: 1.0
treshold: 0.1 precision: 0.10909090909090909 recall: 1.0
treshold: 0.15000000000000002 precision: 0.10909090909090909 recall: 1.0
treshold: 0.2 precision: 0.10909090909090909 recall: 1.0
treshold: 0.25 precision: 0.10909090909090909 recall: 1.0
treshold: 0.30000000000000004 precision: 0.125 recall: 0.7333333333333333
treshold: 0.35000000000000003 precision: 0.12571428571428572 recall: 0.7333333333333333
treshold: 0.4 precision: 0.1780821917808219 recall: 0.43333333333333335
treshold: 0.45 precision: 0.19047619047619047 recall: 0.4
treshold: 0.5 precision: 0.21621621621621623 recall: 0.17777777777777778
treshold: 0.55 precision: 0.2 recall: 0.08888888888888889
treshold: 0.6000000000000001 precision: 0.1724137931034483 recall: 0.05555555555555555
treshold: 0.65 precision: 0.3 recall: 0.03333333333333333
treshold: 0.7000000000000001 precision: 0.3333333333333333 recall: 0.033

In [716]:
x_train.to_csv('x_train.csv')
y_train.to_csv('y_train.csv')
x_test.to_csv('x_test.csv')
y_test.to_csv('y_test.csv')

In [731]:
torch.exp(net.weight)

tensor([[0.3931],
        [0.3455]], grad_fn=<ExpBackward>)

In [732]:
torch.exp(net.bias)

tensor([[0.2439]], grad_fn=<ExpBackward>)

In [662]:
for treshold in np.arange(0, 1, 0.05):
    tp = ((targets == 1) & (predictions > treshold )).sum()
    fp = ((targets == 0) & (predictions > treshold)).sum()
    tn = ((targets == 0) & (predictions < treshold )).sum()
    fn = ((targets == 1) & (predictions < treshold )).sum()
    print('treshold:', treshold, 'precision:',  tp/(tp + fp), 'recall:',  tp/(tp + fn))

treshold: 0.0 precision: 0.1290909090909091 recall: 1.0
treshold: 0.05 precision: 0.1290909090909091 recall: 1.0
treshold: 0.1 precision: 0.1290909090909091 recall: 1.0
treshold: 0.15000000000000002 precision: 0.1290909090909091 recall: 1.0
treshold: 0.2 precision: 0.1290909090909091 recall: 1.0
treshold: 0.25 precision: 0.1290909090909091 recall: 1.0
treshold: 0.30000000000000004 precision: 0.1290909090909091 recall: 1.0
treshold: 0.35000000000000003 precision: 0.1290909090909091 recall: 1.0
treshold: 0.4 precision: 0.1290909090909091 recall: 1.0
treshold: 0.45 precision: 0.1525917297612114 recall: 0.7380281690140845
treshold: 0.5 precision: 0.2206148282097649 recall: 0.3436619718309859
treshold: 0.55 precision: 0.218562874251497 recall: 0.2056338028169014
treshold: 0.6000000000000001 precision: 0.2571428571428571 recall: 0.1267605633802817
treshold: 0.65 precision: 0.30357142857142855 recall: 0.04788732394366197
treshold: 0.7000000000000001 precision: 0.3611111111111111 recall: 0.036

  
