In [None]:
# Mount Google Drive at /content/drive (Colab only); the project path used below lives on this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Путь к папке, в которой лежат папки с данными и логи.

In [None]:
# Project root on the mounted Drive: the data folders, param.yaml / PCA.yaml, models/ and models.log
# are all addressed by string concatenation with this prefix, so the trailing slash is required.
path = '/content/drive/MyDrive/project/'

In [None]:
import torch
import pandas as pd
from torch import nn, optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import copy
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
torch.manual_seed(1) 

<torch._C.Generator at 0x7f21697a3f70>

Настроим логирование

In [None]:
import logging
def start_log(path):
    """Configure the root logger to append INFO-level records to ``<path>models.log``.

    Parameters
    ----------
    path : str
        Directory prefix, concatenated directly with ``'models.log'`` (so it must
        end with a path separator).

    NOTE(review): ``logging.basicConfig`` is documented as doing nothing when the
    root logger already has handlers, so only the first effective call in a
    session takes effect - confirm the file is actually written in this environment.
    """
    log_settings = {
        'filename': path + 'models.log',
        'filemode': 'a',
        'format': '%(asctime)s - %(message)s',
        'datefmt': '%d-%b-%y %H:%M:%S',
        'level': logging.INFO,
    }
    logging.basicConfig(**log_settings)

# Датасет

Функция для загрузки датасета для данной позиции

In [None]:
def load_data(position, path, replaced):
  """Read the train / test / validation CSV files of a single position.

  Parameters
  ----------
  position : str
      File-name prefix, e.g. ``'Wingers'``.
  path : str
      Project root (must end with a separator; paths are built by concatenation).
  replaced : bool
      If true, read from ``'data replaced/'``, otherwise from ``'data/'``.

  Returns
  -------
  tuple of pandas.DataFrame
      ``(train, test, val)`` - note that test comes before val.
  """
  folder = 'data replaced/' if replaced else 'data/'
  prefix = path + folder + position
  train_df, test_df, val_df = (
      pd.read_csv(prefix + suffix)
      for suffix in ('_train.csv', '_test.csv', '_val.csv')
  )
  return train_df, test_df, val_df
  

Функция для загрузки датасета по всем позициям

In [None]:
def load_all(path, replaced):
  """Load and stack the train / test / validation splits of every position.

  Calls ``load_data`` once per position and concatenates the results row-wise.
  ``DataFrame.append`` (used previously) no longer exists in pandas 2.x, so the
  per-position frames are collected first and concatenated in one go.

  Parameters
  ----------
  path : str
      Project root passed through to ``load_data``.
  replaced : bool
      Passed through to ``load_data``.

  Returns
  -------
  tuple of pandas.DataFrame
      ``(train, test, val)``. Row labels of the individual files are kept as they
      are (no re-indexing), so the resulting index may contain duplicates.
  """
  positions = ['Centerbacks', 'Forwards', 'Fullbacks', 'Midfielders', 'Wingers']
  per_position = [load_data(position, path, replaced) for position in positions]
  # Transpose [(train, test, val), ...] into (trains, tests, vals).
  train_parts, test_parts, val_parts = zip(*per_position)
  return (
      pd.concat(list(train_parts)),
      pd.concat(list(test_parts)),
      pd.concat(list(val_parts)),
  )

Функция для корректировки показателей на игровое время (пересчёт на 90 минут)

In [None]:
def time_adj(df, attr_x, attr_y):
  """Rescale the given columns of ``df`` in place to per-90-minutes values.

  Each column in ``attr_x`` becomes ``value * 90 / minutes_x`` and each column in
  ``attr_y`` becomes ``value * 90 / minutes_y``. The minutes columns are only
  accessed when the corresponding list is non-empty. Returns ``None``.
  """
  for columns, minutes_column in ((attr_x, 'minutes_x'), (attr_y, 'minutes_y')):
    for column in columns:
      scaled = df[column] * 90
      df[column] = scaled / df[minutes_column]

Функция для обработки датасета

In [None]:
def preprocess(df, attr_list, to_90_attr, target, pca_attributes, encode_countries = True, do_PCA = True):
    """Select the model columns, drop incomplete rows and split features from the target.

    Parameters
    ----------
    df : pandas.DataFrame
        One raw split (train, test or validation).
    attr_list : list of str
        Feature columns to keep. The list is not modified.
    to_90_attr : list of str
        Base names of attributes to rescale per 90 minutes via ``time_adj``.
    target : str
        Name of the target column.
    pca_attributes : list of str
        Columns that are compressed into 4 principal components when ``do_PCA``.
    encode_countries : bool
        If true, ``country_x`` / ``country_y`` are kept and one-hot encoded.
    do_PCA : bool
        If true, ``pca_attributes`` are replaced by 4 principal components
        (each divided by 100).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Feature frame and a one-column target frame, sharing the same index.

    NOTE(review): the PCA is fitted on whichever frame is passed in, so every
    split gets its own projection, and ``get_dummies`` is likewise applied per
    split - confirm that this is intended.
    """
    # Work on a copy: extending `attr_list` itself would append the country
    # columns to the caller's list again on every call.
    columns_x = list(attr_list)
    target_cols = [target]
    if encode_countries:
        # Teams have no league yet, only players, so the countries are added here for now.
        columns_x += ['country_x', 'country_y']
    # NOTE(review): `pl_attr` is not defined anywhere in the visible notebook; it is
    # only evaluated when `to_90_attr` is non-empty - confirm where it should come from.
    attr_x = [name + "_x" for name in to_90_attr if (name in pl_attr)]
    attr_y = [name + "_y" for name in to_90_attr if (name in target_cols)]
    df = df[columns_x + target_cols + pca_attributes].dropna()  # drop unused columns and NaN rows
    time_adj(df, attr_x, attr_y)  # rescale to 90 minutes of playing time

    if do_PCA:
        pca = PCA(n_components=4)
        components = pca.fit_transform(df[pca_attributes]) / 100
        df = df[columns_x + target_cols].copy()
        # Assign by position so each component row stays on the row it was computed
        # from. Building a separate frame with a fresh 0..n-1 index and concatenating
        # it misaligned the rows after dropna() and produced spurious NaNs.
        for k in range(components.shape[1]):
            df['principal component {}'.format(k + 1)] = components[:, k]
    df = df.dropna()
    df_y = df[target_cols]
    df = df.drop(columns=target_cols)
    if encode_countries:
        df = pd.get_dummies(df, columns= ['country_x', 'country_y'])  # one-hot encode the leagues
    return df, df_y

Функция возвращающая готовые для обучения датафреймы

In [None]:
def create_dataframes(X_train, X_test, X_val, encode_countries, target, attributes, attr_to_90, pca_attributes, PCA = True):
    """Run ``preprocess`` on the three splits and log what was prepared.

    The splits are processed in the order train, test, validation with identical
    settings. Afterwards the attribute list, the target name and the target means
    of the three splits are written to the log.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_test, y_val)``.
    """
    def prepare(split):
        # Same settings for every split; only the input frame differs.
        return preprocess(split, attributes, attr_to_90, target, pca_attributes, encode_countries, do_PCA = PCA)

    features_train, y_train = prepare(X_train)
    features_test, y_test = prepare(X_test)
    features_val, y_val = prepare(X_val)

    logging.info("================================================================")
    logging.info("List of attributes: {}".format(attributes))
    logging.info("Target: {}".format(target))
    means = (y_train.mean(), y_val.mean(), y_test.mean())
    logging.info("Target mean values: train = {}, val = {}, test = {}".format(*means))
    return (features_train, features_test, features_val, y_train, y_test, y_val)

Итоговая функция для создания датафреймов, вызывающая все вышеперечисленные функции в нужном порядке

In [None]:
def dataframes(position, target, path, replaced = True, encode_countries = False, PCA = True):
    """Build the ready-to-train frames for one position and one target.

    Parameters
    ----------
    position : str
        A position name understood by ``load_data``, or ``'All'`` to stack every position.
    target : str
        Target column; must be a key of ``param.yaml``.
    path : str
        Project root containing the data folders, ``param.yaml`` and ``PCA.yaml``.
    replaced : bool
        Passed through to the loaders.
    encode_countries : bool
        Passed through to ``create_dataframes``.
    PCA : bool
        If true, the PCA attribute list of ``position`` is read from ``PCA.yaml``.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_test, y_val)`` from ``create_dataframes``.

    Raises
    ------
    ValueError
        If ``target`` has no attribute list in ``param.yaml``.
    """
    # Load the data set for the requested position(s).
    if position == 'All':
        X_train, X_test, X_val = load_all(path, replaced)
    else:
        X_train, X_test, X_val = load_data(position, path, replaced)

    # Dictionary with the most important attributes for predicting each target.
    # safe_load: yaml.load() without a Loader is rejected by PyYAML 6 and can construct
    # arbitrary objects on older versions. Assumes the file holds plain mappings/lists.
    with open(path + 'param.yaml') as f:
        attr_dict = yaml.safe_load(f)
    if target not in attr_dict:
        raise ValueError('There is no attributes list in the dict for this target')
    attributes = attr_dict[target]

    # Dictionary with the attributes that go into the PCA.
    if PCA:
        with open(path + 'PCA.yaml') as f:
            pca_dict = yaml.safe_load(f)
        # Attributes already used as direct features are not fed into the PCA again.
        pca_attributes = [attr for attr in pca_dict[position] if (attr not in attributes)]
    else:
        pca_attributes = []

    # Build all required frames; no attribute is rescaled per 90 minutes here (empty list).
    return create_dataframes(X_train, X_test, X_val, encode_countries, target, attributes, [], pca_attributes, PCA = PCA)

# Модель 3

Создадим даталоадер

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a feature frame and a target frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; converted once to a float32 array, so boolean columns are
        accepted as well.
    y : pandas.DataFrame or pandas.Series
        Target values; stored as a float32 column vector of shape ``(n, 1)``.
    device : str or torch.device
        Device every returned item is moved to (default ``'cuda'``).
    """
    def __init__(self, X, y, device='cuda'):
        super().__init__()

        self.len = len(y)
        self.X = np.asarray(X.values, dtype=np.float32)
        self.y = np.asarray(y.values, dtype=np.float32).reshape(-1, 1)
        self.device = device

    def __getitem__(self, index):
        """Return ``(features, target)`` for ``index`` as float32 tensors on ``self.device``."""
        # Use the device given to the constructor; the previous code read the
        # module-level `device` and silently ignored the argument.
        features = torch.tensor(self.X[index], dtype=torch.float32).to(self.device)
        target = torch.tensor(self.y[index], dtype=torch.float32).to(self.device)

        return features, target

    def __len__(self):
        return self.len

# Kept so that existing cells referring to the module-level name keep working;
# Dataset itself no longer reads it.
device = 'cuda'

Функция для создания даталоадеров

In [None]:
def create_dataloaders(X_train, X_val, X_test, y_train, y_val, y_test):
  """Wrap the three splits in ``Dataset`` objects and return their loaders.

  All loaders use a batch size of 1024; only the training loader shuffles.

  Returns
  -------
  tuple of DataLoader
      ``(train_loader, valid_loader, test_loader)``.
  """
  batch = 1024
  # (features, targets, shuffle) in the order the loaders are returned.
  specs = (
      (X_train, y_train, True),
      (X_val, y_val, False),
      (X_test, y_test, False),
  )
  loaders = []
  for features, targets, shuffle in specs:
    loaders.append(DataLoader(Dataset(features, targets), batch_size=batch, shuffle=shuffle))
  return tuple(loaders)

Класс нейронной сети

In [None]:
class NN(nn.Module):
    """Fully connected regressor: two hidden ReLU layers and a single linear output.

    Parameters
    ----------
    input_size : int
        Number of input features.
    first_layer_size, second_layer_size : int
        Widths of the two hidden layers.
    """
    def __init__(self, input_size, first_layer_size, second_layer_size):
        super().__init__()
        widths = (input_size, first_layer_size, second_layer_size)
        layers = []
        # Hidden blocks: Linear followed by ReLU for each consecutive pair of widths.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(second_layer_size, 1))
        # The attribute name is part of the state_dict keys used by saved checkpoints.
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw network output for ``x``; the last dimension has size 1."""
        return self.linear_relu_stack(x)

Функция для обучения модели

In [None]:
def train(ep_count, epochs, Trainloader, Validloader, Testloader, optimizer, model, criterion):
  """Train ``model`` for ``epochs`` epochs and report the test loss afterwards.

  Parameters
  ----------
  ep_count : int
      Number of epochs the model has already been trained for (used in messages).
  epochs : int
      Number of epochs to run in this call.
  Trainloader, Validloader, Testloader : iterable of ``(X, y)`` batches supporting ``len``
  optimizer : torch.optim.Optimizer
  model : torch.nn.Module
  criterion : callable ``(pred, y) -> scalar tensor``

  Returns
  -------
  (int, list of float, list of float)
      Updated epoch counter plus the mean per-batch training and validation
      losses, recorded on every 10th epoch only. Both lists are therefore empty
      when ``epochs < 10``.
  """
  train_losses, valid_losses = [], []
  for i in range(epochs):
    train_loss = 0.0
    for X, y in Trainloader:
      optimizer.zero_grad()
      pred = model(X)
      loss = criterion(pred, y)
      loss.backward()
      optimizer.step()
      train_loss += loss.item()

    # Validation pass after every epoch (this used to sit in a `for ... else`,
    # which always ran because the loop never breaks).
    valid_loss = 0.0
    model.eval()
    with torch.no_grad():
      for X, y in Validloader:
        # .item() keeps plain floats in the history instead of device tensors,
        # consistent with the training loss above.
        valid_loss += criterion(model(X), y).item()
    model.train()

    if (i + 1) % 10 == 0:
      train_losses.append(train_loss / len(Trainloader))
      valid_losses.append(valid_loss / len(Validloader))
      if (i + 1) % 20 == 0:
        print(f"Epoch: {i + 1 + ep_count} ", f"Training Loss: {train_losses[-1]:.4f} ", f"Valid Loss: {valid_losses[-1]:.4f}")
      if (i + 1) % 200 == 0:
        logging.info(f"Epoch: {i + 1 + ep_count} " + f"Training Loss: {train_losses[-1]:.4f} " f"Valid Loss: {valid_losses[-1]:.4f}")

  ep_count += epochs
  test(model, Testloader, criterion)  # evaluate on the test set
  return ep_count, train_losses, valid_losses

Функция для подсчета ошибки на тесте

In [None]:
 def test(mod, Testloader, criterion): 
  test_loss = 0
  mod.eval()
  with torch.no_grad():
    for X, y in Testloader:
      pred = mod(X)
      test_loss += criterion(pred, y)

  mod.train()
  print("Test loss {}".format(test_loss))        
  logging.info("Test loss {}".format(test_loss))

Функция для подсчета MAPE

In [None]:
def MAPE(output, target):
  """Mean absolute percentage error with every element's error capped at 1.5.

  A small constant (1e-6) is added to the denominator to avoid dividing by an
  exact zero target.
  """
  relative_error = (target - output) / (target + 0.000001)
  capped = torch.minimum(relative_error.abs(), torch.tensor(1.5))
  return capped.mean()

График ошибки во время последнего обучения

In [None]:
def make_plot(train_losses=None, valid_losses=None):
    """Plot training and validation loss histories on a logarithmic y axis.

    Parameters
    ----------
    train_losses, valid_losses : sequence of numbers or 0-dim tensors, optional
        Loss histories to draw. When an argument is omitted, the module-level
        variable of the same name is used, which is what the function relied on
        before it accepted arguments.

    Raises
    ------
    NameError
        If a history is neither passed in nor defined at module level.
    """
    module_scope = globals()
    if train_losses is None:
        if 'train_losses' not in module_scope:
            raise NameError("name 'train_losses' is not defined; pass the loss history explicitly")
        train_losses = module_scope['train_losses']
    if valid_losses is None:
        if 'valid_losses' not in module_scope:
            raise NameError("name 'valid_losses' is not defined; pass the loss history explicitly")
        valid_losses = module_scope['valid_losses']

    # float() also accepts 0-dim tensors (including ones stored on the GPU),
    # which matplotlib cannot convert on its own.
    plt.plot([float(v) for v in train_losses], label='Training loss')
    plt.plot([float(v) for v in valid_losses], label='Validation loss')
    plt.yscale("log")
    plt.legend(frameon=False)

Общая функция: принимает даталоадеры, возвращает модель, обученную на них.
Опционально принимает гиперпараметры, т.к. я пока не дописал автоподбор.

In [None]:
def run(Trainloader, Validloader, Testloader, inp_layer, layer1, layer2, lr = 0.001, criterion = MAPE,
        optimizer = optim.AdamW, scheduler = None, epochs = 200, plot = False):
    """Create a fresh ``NN`` and train it in rounds of ``epochs`` until it stops improving.

    After every round the last recorded validation loss is compared with the best
    one so far. Training stops after two consecutive rounds without improvement,
    or early when more than 2000 epochs were run and the validation loss is still
    above 1, or when more than 10000 epochs were run.

    Parameters
    ----------
    Trainloader, Validloader, Testloader : DataLoader
    inp_layer, layer1, layer2 : int
        Input size and hidden layer sizes of the network.
    lr : float
    criterion : callable ``(pred, y) -> scalar tensor``
    optimizer : optimizer class, instantiated as ``optimizer(model.parameters(), lr=lr)``
    scheduler : optional
        Only its ``state_dict()`` is logged; it is not stepped during training.
    epochs : int
        Epochs per round. ``train`` records losses on every 10th epoch only, so
        values below 10 leave nothing to compare.
    plot : bool
        If true, the loss curves of the last round are drawn with ``make_plot``.

    Returns
    -------
    (torch.nn.Module, int)
        Deep copy of the model with the best validation loss, and the total epoch count.
    """
    # make_plot() looks these two names up at module level, so the histories of
    # the last round are published there. Without this, plot=True raised NameError.
    global train_losses, valid_losses

    # Initialise the model (placed on the GPU, like the batches produced by Dataset by default).
    model = NN(inp_layer, layer1, layer2).to('cuda')
    optimizer = optimizer(model.parameters(), lr=lr)
    ep_count = 0
    scheduler_state = (scheduler.state_dict() if scheduler  else 'None')
    logging.info("----------------------------------------------------------------------------------")
    logging.info('New model, model {}, criterion {}, optimizer {}'.format(model, criterion, optimizer))
    logging.info('scheduler {}'.format(scheduler))
    logging.info('{}'.format(scheduler_state))

    # Train the model round by round.
    no_growth = 0
    best_loss = float('inf')
    best_model = copy.deepcopy(model)
    while no_growth < 2:
        ep_count, train_losses, valid_losses = train(ep_count, epochs, Trainloader, Validloader, Testloader, optimizer, model, criterion)
        last_valid = valid_losses[-1]
        if last_valid < best_loss:
            best_model = copy.deepcopy(model)
            best_loss = last_valid
            no_growth = 0
        else:
            no_growth += 1
        # Give up on a run that is still poor after 2000 epochs, and cap the total training length.
        if (ep_count > 2000 and last_valid > 1) or ep_count > 10000:
            break

    # Plot on every exit path (the early exits used to skip it).
    if plot:
        make_plot()

    return best_model, ep_count

Финальная функция: принимает позицию и целевую переменную, возвращает лучшую модель, размер входного слоя, размеры двух скрытых слоёв, предсказания на валидации, предсказания на тесте, MAPE на валидации и MAPE на тесте. Если модель выдаёт плохое качество, функция пытается обучить её ещё раз (потому что иногда модель застревает в «плохом» локальном минимуме).

In [None]:
def final(position, target, path, replaced = True, encode_countries = False, criterion = MAPE, PCA = True):
    """Train models for ``(position, target)`` until one is good enough and return the best.

    A model is accepted when its validation error is below ``0.1 * (2 + i)``, where
    ``i`` counts the failed attempts so far. From the second failed attempt on, the
    hidden sizes become ``(n_features, n_features // 2)``; the learning rate is
    doubled after every attempt that ran for at least 4000 epochs.

    Validation and test errors and the returned predictions are computed on the
    first batch of the respective loader only (the batch size is set in
    ``create_dataloaders``).

    Returns
    -------
    tuple
        ``(best_model, n_features, layer1, layer2, pred_val, pred_test, val_error, test_error)``
        where the layer sizes are those of the best model and the errors are floats.

    Raises
    ------
    RuntimeError
        If no attempt produced a validation error below the initial bound of 2
        (for example because the loss is NaN).
    """
    start_log(path)
    X_train, X_test, X_val, y_train, y_test, y_val = dataframes(position, target, path, replaced = replaced, encode_countries = encode_countries, PCA = PCA)
    Trainloader, Validloader, Testloader = create_dataloaders(X_train, X_val, X_test, y_train, y_val, y_test)

    layer1, layer2 = 32, 16
    lr = 0.001
    i = 0

    best_ev = 2
    best_model = None
    while True:
        model, ep_count = run(Trainloader, Validloader, Testloader, X_train.shape[1], layer1, layer2, lr = lr, criterion = criterion)
        model.eval()
        # Evaluation needs no autograd graph.
        with torch.no_grad():
            X, y = next(iter(Validloader))
            err_val = criterion(model(X), y).item()

        # Update the best model so far.
        if err_val < best_ev:
            best_model = copy.deepcopy(model)
            best_ev = err_val
            with torch.no_grad():
                X, y = next(iter(Testloader))
                pred_test = best_model(X)
                best_et = criterion(pred_test, y)
            bl1 = layer1  # remember the layer sizes of the best model
            bl2 = layer2

        model.train()
        if best_ev < 0.1 * (2 + i):
            break
        i += 1
        if i >= 2:
            layer1 = X_train.shape[1]
            layer2 = layer1 // 2
        if ep_count >= 4000:
            lr *= 2

    if best_model is None:
        # Reached only when the acceptance bound grew past the initial best_ev
        # without any model ever beating it.
        raise RuntimeError('No model reached a validation error below 2 for {} / {}'.format(position, target))

    with torch.no_grad():
        X, y = next(iter(Validloader))
        pred_val = best_model(X)

    return best_model, X_train.shape[1], bl1, bl2, pred_val, pred_test, best_ev, best_et.item()

In [None]:
# Example run for one position / target. Uses the notebook-level `path` instead of repeating
# the literal. The result is bound to the module-level `best_model`, which save_checkpoint()
# reads when it is called without an explicit model.
best_model, inp, bl1, bl2, pv, pt, ev, et = final('Wingers', 'crosses_y', path)

# Запуск и сохранение

Функция для сохранения модели

In [None]:
def save_checkpoint(path, name, inp, bl1, bl2, model=None):
    """Save a model's weights and layer sizes to ``<path>models/<name>checkpoint.pth``.

    Parameters
    ----------
    path : str
        Project root (must end with a separator).
    name : str
        File-name prefix of the checkpoint.
    inp, bl1, bl2 : int
        Input size and hidden layer sizes; stored so that ``load_checkpoint`` can
        rebuild the network.
    model : torch.nn.Module, optional
        Model whose ``state_dict`` is saved. When omitted, the module-level
        ``best_model`` is used, which is what the function always did before -
        callers must make sure that global really is the model they mean.
    """
    import os

    if model is None:
        model = best_model  # legacy behaviour: fall back to the notebook-level global
    checkpoint = {'input_size': inp,
                  'layer1': bl1,
                  'layer2': bl2,
                  'state_dict': model.state_dict()}

    target_dir = path + 'models/'
    os.makedirs(target_dir, exist_ok=True)  # torch.save does not create missing folders
    torch.save(checkpoint, target_dir + name + 'checkpoint.pth')

Функция для загрузки модели

In [None]:
def load_checkpoint(filepath, map_location='cpu'):
    """Rebuild an ``NN`` from a checkpoint written by ``save_checkpoint``.

    Parameters
    ----------
    filepath : str
        Path of the ``.pth`` file.
    map_location : optional
        Passed to ``torch.load``. Defaults to ``'cpu'`` so that checkpoints saved
        from a GPU model can also be opened on a machine without CUDA; the
        network itself is constructed on the CPU either way.

    Returns
    -------
    NN
        Network with the stored layer sizes and weights.
    """
    checkpoint = torch.load(filepath, map_location=map_location)
    model = NN(checkpoint['input_size'], checkpoint['layer1'], checkpoint['layer2'])
    model.load_state_dict(checkpoint['state_dict'])

    return model

In [None]:
# new_model = load_checkpoint((path + 'checkpoint.pth'))

Построим датафреймы с ошибками моделей для всех признаков

In [None]:
# Targets modelled for each position.
pos_dict = {'Midfielders' :['sca_per90_y', 'gca_per90_y', 'npxg_xa_per90_y','passes_progressive_distance_y',
'passes_into_penalty_area_y', 'passes_completed_y'],
           'Forwards' : ['sca_per90_y', 'gca_per90_y', 'npxg_per90_y','shots_total_per90_y'],
            'Wingers' : ['sca_per90_y', 'gca_per90_y', 'dribbles_completed_pct_y','crosses_y','carry_progressive_distance_y'],
            'Centerbacks': ['aerials_won_y', 'tackles_won_y','interceptions_y'],
            'Fullbacks' : ['tackles_won_y', 'crosses_y', 'interceptions_y']}
pos = ['Centerbacks', 'Midfielders', 'Wingers', 'Forwards', 'Fullbacks']
attrs = ['sca_per90_y', 'gca_per90_y', 'npxg_xa_per90_y', 'passes_progressive_distance_y', 'passes_into_penalty_area_y', 'passes_completed_y',
         'npxg_per90_y', 'shots_total_per90_y',
         'dribbles_completed_pct_y', 'crosses_y', 'carry_progressive_distance_y',
         'aerials_won_y', 'tackles_won_y', 'interceptions_y']


# Error tables: one row per position, one column per target (NaN where not modelled).
val_df = pd.DataFrame(columns = attrs, index = pos)
test_df = pd.DataFrame(columns = attrs, index = pos)
pos_frames = []  # validation predictions, one frame per position in pos_dict order
for p, at_list in pos_dict.items():
    val_predictions = {}
    for at in at_list:
        # The result must be bound to `best_model`: save_checkpoint() serialises the
        # module-level `best_model`. Binding it to another name made every checkpoint
        # file contain the weights of an earlier, unrelated model.
        best_model, inp, bl1, bl2, pv, pt, ev, et = final(p, at, path)
        val_df.at[p, at] = ev
        test_df.at[p, at] = et
        name = p + '_' + at + '_'
        save_checkpoint(path, name, inp, bl1, bl2)
        val_predictions[at] = pd.Series(pv.detach().cpu().numpy().reshape(-1).tolist())
    # Building the frame from Series pads shorter columns with NaN instead of failing
    # when the targets of one position yield different numbers of predictions.
    pos_frames.append(pd.DataFrame(val_predictions, columns = at_list))

In [None]:
# Validation error (7th value returned by final()) per position and target; cells that were never filled stay NaN.
val_df

Unnamed: 0,sca_per90_y,gca_per90_y,npxg_xa_per90_y,passes_progressive_distance_y,passes_into_penalty_area_y,passes_completed_y,npxg_per90_y,shots_total_per90_y,dribbles_completed_pct_y,crosses_y,carry_progressive_distance_y,aerials_won_y,tackles_won_y,interceptions_y
Centerbacks,,,,,,,,,,,,0.444826,0.426283,0.570315
Midfielders,0.365005,0.925911,0.513973,0.388074,0.476056,0.372991,,,,,,,,
Wingers,0.282728,0.49631,,,,,,,0.168855,0.596257,0.621595,,,
Forwards,0.426603,0.689448,,,,,0.497541,0.296662,,,,,,
Fullbacks,,,,,,,,,,0.476164,,,0.454195,0.53385


In [None]:
# Test error (8th value returned by final()) per position and target; cells that were never filled stay NaN.
test_df

Unnamed: 0,sca_per90_y,gca_per90_y,npxg_xa_per90_y,passes_progressive_distance_y,passes_into_penalty_area_y,passes_completed_y,npxg_per90_y,shots_total_per90_y,dribbles_completed_pct_y,crosses_y,carry_progressive_distance_y,aerials_won_y,tackles_won_y,interceptions_y
Centerbacks,,,,,,,,,,,,0.543358,0.519645,0.694901
Midfielders,0.401077,0.582115,0.562376,0.447747,0.65483,0.476808,,,,,,,,
Wingers,0.323212,0.588692,,,,,,,0.196729,0.61889,0.59857,,,
Forwards,0.262983,0.69492,,,,,0.358908,0.317042,,,,,,
Fullbacks,,,,,,,,,,0.640928,,,0.66632,0.651123


In [None]:
# pos_frames follows the insertion order of pos_dict, so index 4 is 'Fullbacks'.
pos_frames[4].to_csv(path + 'pred_FB.csv', index = True)

In [None]:
# Store the validation error table next to the other project files.
val_df.to_csv(path + 'val_DF1.csv', index = True)

In [None]:
# Store the test error table next to the other project files.
test_df.to_csv(path + 'test_DF1.csv', index = True)

In [None]:
# Same experiment on the data of all positions stacked together, without PCA features.
# Note: this rebinds `pos`, `attrs`, `val_df` and `test_df` from the per-position run above.
pos = ['All']
attrs = [ 'sca_per90_y', 'gca_per90_y', 'npxg_xa_per90_y','passes_progressive_distance_y',
'passes_into_penalty_area_y', 'passes_completed_y', 'npxg_per90_y','shots_total_per90_y', 'dribbles_completed_pct_y','crosses_y','carry_progressive_distance_y', 
'aerials_won_y', 'tackles_won_y','interceptions_y']


val_df = pd.DataFrame(columns = attrs, index = pos)
test_df = pd.DataFrame(columns = attrs, index = pos)

for p in pos:
    for at in attrs:
        # Bind the result to `best_model`: save_checkpoint() serialises the module-level
        # `best_model`, so any other name would store the weights of an earlier model.
        best_model, inp, bl1, bl2, _, _, ev, et = final(p, at, path, PCA = False)
        val_df.at[p, at] = ev
        test_df.at[p, at] = et
        name = p + '_' + at + '_'
        save_checkpoint(path, name, inp, bl1, bl2)

# Логи

Просмотр логов

In [None]:
!cat /content/drive/MyDrive/project/models.log

In [None]:
# NOTE(review): in the visible notebook `y_train` is only assigned inside functions
# (create_dataframes, final), never at module level, so this cell presumably relies on a
# variable left over from an earlier session - confirm, or remove the cell.
y_train.mean()

Очистка логов

In [None]:
# !echo -n > /content/drive/MyDrive/project/models.log