<a href="https://colab.research.google.com/github/cappelchi/calcio_notebooks/blob/main/draft/football_w2v_ncvis_catboost_draft_221214.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Project config

In [None]:
try:
    import neptune.new as neptune
except:
    !pip install neptune-client >> None
    import neptune.new as neptune
#from neptune.new.integrations.tensorflow_keras import NeptuneCallback
def get_credential(frmwork = 'neptune_team'):
    """Look up a login/secret pair in ``credential.txt``.

    The file is scanned line by line; the first line that contains
    ``frmwork`` and has at least three whitespace-separated fields is used.

    Args:
        frmwork: Text identifying the wanted line (substring match).

    Returns:
        Tuple ``(login, psw)``: the second and third fields of the matching line.

    Raises:
        FileNotFoundError: ``credential.txt`` is not in the working directory.
        LookupError: No usable line mentions ``frmwork``.
    """
    with open('credential.txt', 'r') as container:
        for line in container:
            if frmwork not in line:
                continue
            # split() without arguments tolerates tabs, repeated spaces and trailing
            # whitespace, which split(' ') turned into empty or shifted fields.
            fields = line.split()
            if len(fields) >= 3:
                return fields[1], fields[2]
    # Fail loudly instead of returning None, which only surfaced later as an
    # unrelated unpacking error in the caller.
    raise LookupError(f"no usable entry for {frmwork!r} in credential.txt")

In [None]:
#@title Set API key for neptune.ai
set_api = True #@param {type:"boolean"}
# Always bind both names so later cells do not hit a NameError when the box is unticked.
# NOTE(review): with api_key = None the Neptune client presumably falls back to the
# NEPTUNE_API_TOKEN environment variable — confirm against the Neptune documentation.
username, api_key = get_credential() if set_api else (None, None)

### Installations

In [None]:
!pip install --upgrade gensim >> None
!pip install catboost >> None

### Downloads

In [None]:
# Local file names for the artifacts fetched from Neptune.
dataset_name = './dataset.npz'
dataframe_name = './data_ready.csv.gz'
#validation_dataset_name = './prem_validation.csv'
# Prefix (with trailing slash) of the data version inside the Neptune project.
data_version = 'data_221212/'
project = neptune.init_project(
    name="scomesse/football",
    api_token = api_key
    )
try:
    project[data_version + 'dataset_npz'].download(dataset_name)
    project[data_version + 'data_ready'].download(dataframe_name)
    word2vec_params = project[data_version+ 'word2vec_params'].fetch()
finally:
    # Stop the project even when a download fails; otherwise it is only stopped
    # when the kernel is terminated.
    project.stop()

https://app.neptune.ai/scomesse/football/
Remember to stop your project once you’ve finished logging your metadata (https://docs.neptune.ai/api/project#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


### Imports

In [None]:
import pandas as pd
import numpy as np
# Widen pandas' display limits for the notebook and report the library versions in use.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 100)
print(pd.__version__, np.__version__, sep="\n")

1.3.5
1.21.6


In [None]:
from tqdm import tqdm

In [None]:
from catboost import CatBoost
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
# Single named seed for NumPy's global random generator (was a bare magic number).
SEED = 147
np.random.seed(SEED)

### Code

#### Load tensors

In [None]:
#@title Target
target_result = "HOME" #@param ["HOME", "DRAW", "AWAY"]
# Column index of the `y_class_*` arrays that is selected for each outcome.
target_dict = {
    'HOME': 0,
    'DRAW': 1,
    'AWAY': 2,
}
clmn = target_dict[target_result]
data_npz = np.load(dataset_name)
# Feature arrays of every split, read in the order train / test / validation / production.
X_train, X_test, X_validation, X_production = (
    data_npz[key]
    for key in ('X_train', 'X_test', 'X_validation', 'X_production')
)
# Matching label arrays, reduced to the column of the selected outcome.
y_train, y_test, y_validation, y_production = (
    data_npz[key][:, clmn]
    for key in ('y_class_train', 'y_class_test', 'y_class_validation', 'y_class_production')
)
Line_production = data_npz['Line_production']
embedding_matrix = data_npz['embedding_matrix']

In [None]:
# Read the table from the path the Downloads cell saved it to (the same
# './data_ready.csv.gz' value), so there is no second hard-coded copy that can drift.
data_df = pd.read_csv(dataframe_name)

In [None]:
# Column names of the 20 inputs: home_input_1..10 followed by away_input_1..10.
names = [
    f'{side}_input_{slot}'
    for side in ('home', 'away')
    for slot in range(1, 11)
]

In [None]:
# Split boundaries, converted from calendar dates to POSIX timestamps (float seconds).
val_date, train_date = (
    pd.Timestamp(day).timestamp()
    for day in ('2022-01-01', '2019-01-01')
)

In [None]:
# Chronological split on `timestamp`, using the val_date / train_date boundaries set in
# the previous cell; they are not re-defined here, so the two copies cannot drift apart.
# NOTE(review): every comparison is strict, so a row whose timestamp equals a boundary
# exactly belongs to no split — confirm this is intended before changing it.
validation_vector = (data_df['timestamp'] > val_date).values
test_vector = ((data_df['timestamp'] < val_date) & (data_df['timestamp'] > train_date)).values
train_vector = (data_df['timestamp'] < train_date).values
# Number of rows per split: train, test, validation.
print(train_vector.sum(), test_vector.sum(), validation_vector.sum())

1923326 582753 197544


In [None]:
# Peek at the first validation row only: print its input values as strings.
for sentence in data_df[names][validation_vector].values[:1]:
    print(sentence.astype(str))

23 149 143 41 143 17 41 12 46 148 485 11 137 35 17 179 137 156 58 16


In [None]:
def _rows_to_sentences(mask):
    """Build a one-column DataFrame of space-joined input values.

    Args:
        mask: Boolean row selector applied to ``data_df[names]``.

    Returns:
        DataFrame with one string per selected row, e.g. ``'23 149 143 ...'``.
    """
    return pd.DataFrame([
        ' '.join(sentence.astype(str))
        for sentence in tqdm(data_df[names][mask].values)
    ])

# Stored under *_text names: X_train / X_test / X_validation must remain the integer
# arrays loaded from the .npz file, because later cells slice and index with them.
X_train_text = _rows_to_sentences(train_vector)
X_test_text = _rows_to_sentences(test_vector)
X_validation_text = _rows_to_sentences(validation_vector)

100%|██████████| 1923326/1923326 [00:46<00:00, 41806.03it/s]
100%|██████████| 582753/582753 [00:13<00:00, 44072.77it/s]
100%|██████████| 197544/197544 [00:04<00:00, 40871.89it/s]


In [None]:
# Our target variable: the `binary_output` column, copied out separately per split.
# NOTE(review): this rebinds the y_* arrays that were read from the .npz file for
# `target_result` — confirm `binary_output` is the label that is meant to be used.
y_train, y_test, y_validation = (
    data_df.loc[mask, 'binary_output'].copy()
    for mask in (train_vector, test_vector, validation_vector)
)

(20015, 18025)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
def _joined_rows(mask):
    """Return one space-joined string per row of ``data_df[names]`` selected by ``mask``."""
    return [
        ' '.join(sentence.astype(str))
        for sentence in tqdm(data_df[names][mask].values)
    ]

# NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens of two or
# more characters, so single-digit values would be ignored — confirm this is acceptable.
vectorizer = TfidfVectorizer(max_features = 1000)
X_train_vect = vectorizer.fit_transform(_joined_rows(train_vector))
# transform() only: the vocabulary and IDF weights learned on the training rows are
# reused, so test columns mean the same as training columns and nothing is fitted on
# test data (the original called fit_transform here and refitted on the test set).
X_test_vect = vectorizer.transform(_joined_rows(test_vector))

100%|██████████| 1923326/1923326 [01:06<00:00, 28902.76it/s]
100%|██████████| 582753/582753 [00:16<00:00, 36017.97it/s]


In [None]:
#train_data = Pool(X_train, y_train, text_features=[0])
#test_data = Pool(X_test, y_test, text_features=[0])
# The TF-IDF matrices are converted to dense arrays first: toarray() was added to
# prevent catboost from failing with a sparse array error.
train_data = Pool(data=X_train_vect.toarray(), label=y_train)
test_data = Pool(data=X_test_vect.toarray(), label=y_test)

In [None]:
# CatBoost classifier limited to 10 boosting iterations; all other settings are defaults.
booster = CatBoostClassifier(iterations=10)

In [None]:
#booster = CatBoostClassifier(iterations=100, learning_rate=0.001, eval_metric='Accuracy',
#                             text_processing=['NaiveBayes+Word|BoW+Word:min_token_occurrence=2'])

In [None]:
# Train on the training pool; the test pool is passed as the evaluation set.
booster.fit(train_data, eval_set=test_data)

In [None]:
# Each label now names the pool that is actually scored; the original printed the
# training-pool score as "Test" and the test-pool score as "Train".
print("\nTrain Accuracy : %.2f"%booster.score(train_data))
print("Test  Accuracy : %.2f"%booster.score(test_data))

CatBoostError: ignored

### NCVIS

In [None]:
!pip install ncvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ncvis
  Downloading ncvis-1.5.9.tar.gz (290 kB)
[K     |████████████████████████████████| 290 kB 32.5 MB/s 
[?25hBuilding wheels for collected packages: ncvis
  Building wheel for ncvis (setup.py) ... [?25l[?25hdone
  Created wheel for ncvis: filename=ncvis-1.5.9-cp38-cp38-linux_x86_64.whl size=829518 sha256=80944eeed643780b05778815b24a10a5a98b4fa29916620001ac31a3a0e0c3c7
  Stored in directory: /root/.cache/pip/wheels/d0/12/05/37c1bf30302aabb78904eed5b3ec8ed67fa01e5a93112a8437
Successfully built ncvis
Installing collected packages: ncvis
Successfully installed ncvis-1.5.9


In [None]:
import ncvis
import plotly.express as px

In [None]:
# Display the shape of the embedding matrix loaded from the .npz file.
embedding_matrix.shape

(793, 16)

In [None]:
# NCVis reducer configured for a 2-D output (d = 2) with a fixed random seed.
vis = ncvis.NCVis(d = 2, n_neighbors = 100, n_epochs =100, random_seed = 47, min_dist = 0.25)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations



In [None]:
%%time
# Fit NCVis on the embedding matrix; Y holds the transformed coordinates.
Y = vis.fit_transform(embedding_matrix)

CPU times: user 4.88 s, sys: 5.85 ms, total: 4.89 s
Wall time: 2.48 s


In [None]:
# Scatter plot of the two NCVis coordinates (columns c1 and c2 of Y).
px.scatter(pd.DataFrame(Y, columns = ['c1', 'c2']), x = 'c1', y = 'c2')

In [None]:
def _drop_empty_halves(ids, n_first = 10):
    """Flag and remove rows in which either half of the columns sums to zero.

    Args:
        ids: 2-D array; the first ``n_first`` columns and the remaining columns
            are checked separately.
        n_first: Number of leading columns that form the first half.

    Returns:
        ``(zeros_first, zeros_second, kept)``: two boolean row masks (the half
        sums to 0) and ``ids`` without the rows flagged by either mask.
    """
    zeros_first = ids[:, :n_first].sum(axis = 1) == 0
    zeros_second = ids[:, n_first:].sum(axis = 1) == 0
    return zeros_first, zeros_second, ids[~(zeros_first | zeros_second)]

# Always start from the arrays stored in data_npz, so the cell can be re-run and does
# not depend on what X_train / X_test / X_validation currently hold.
zeros1_train, zeros2_train, X_train = _drop_empty_halves(data_npz['X_train'])
zeros1_test, zeros2_test, X_test = _drop_empty_halves(data_npz['X_test'])
zeros1_validation, zeros2_validation, X_validation = _drop_empty_halves(data_npz['X_validation'])

In [None]:
# Training rows flagged: first half all zero, second half all zero, either of the two.
tuple(flag.sum() for flag in (zeros1_train, zeros2_train, zeros1_train | zeros2_train))

(20015, 18025, 29103)

In [None]:
# Test rows flagged: first half all zero, second half all zero, either of the two.
tuple(flag.sum() for flag in (zeros1_test, zeros2_test, zeros1_test | zeros2_test))

(5164, 4278, 7910)

In [None]:
# Validation rows flagged: first half all zero, second half all zero, either of the two.
tuple(flag.sum() for flag in (zeros1_validation, zeros2_validation, zeros1_validation | zeros2_validation))

(1454, 1219, 2327)

In [None]:
# Keep only the labels of rows where neither zero-mask is set, i.e. the same rows
# that were kept in the inputs.
y_train = y_train[~zeros1_train & ~zeros2_train]
y_test = y_test[~zeros1_test & ~zeros2_test]
y_validation = y_validation[~zeros1_validation & ~zeros2_validation]

In [None]:
# Shapes of inputs and labels per split; the row counts of each pair should agree.
tuple(part.shape for part in (X_train, y_train, X_test, y_test, X_validation, y_validation))

((1894223, 20), (1894223,), (574843, 20), (574843,), (195217, 20), (195217,))

In [None]:
def _lookup_coordinates(token_ids, coords):
    """Replace every id by its row of ``coords`` and lay the rows out per sample.

    Args:
        token_ids: Integer array of shape (n_samples, n_tokens) indexing ``coords``.
        coords: Array of shape (n_vocab, n_dims).

    Returns:
        Array of shape (n_samples, n_tokens * n_dims) holding, for each sample,
        the coordinates of its ids one after another.
    """
    n_samples, n_tokens = token_ids.shape
    # The row width is derived from the inputs instead of a hard-coded 40 (20 ids x 2
    # coordinates), so another embedding size cannot silently misalign the rows.
    return coords[token_ids.ravel()].reshape(n_samples, n_tokens * coords.shape[1])

X_train_mod = _lookup_coordinates(X_train, Y)
X_test_mod = _lookup_coordinates(X_test, Y)
X_validation_mod = _lookup_coordinates(X_validation, Y)

In [None]:
# Shapes of the coordinate feature matrices for the three splits.
tuple(part.shape for part in (X_train_mod, X_test_mod, X_validation_mod))

((1894223, 40), (574843, 40), (195217, 40))

In [None]:
#train_data = Pool(X_train, y_train, text_features=[0])
#test_data = Pool(X_test, y_test, text_features=[0])
# Pools built from the NCVis-coordinate features and the filtered labels
# (these arrays are passed as they are; no toarray() conversion happens here).
train_data = Pool(X_train_mod, y_train)
test_data = Pool(X_test_mod, y_test)

In [None]:
# CatBoost classifier limited to 100 boosting iterations; all other settings are defaults.
booster = CatBoostClassifier(iterations=100)

In [None]:
# Train on the training pool; the test pool is passed as the evaluation set.
booster.fit(train_data, eval_set=test_data)

Learning rate set to 0.5
0:	learn: 0.6826745	test: 0.6813715	best: 0.6813715 (0)	total: 589ms	remaining: 58.4s
1:	learn: 0.6760055	test: 0.6736729	best: 0.6736729 (1)	total: 1s	remaining: 49.3s
2:	learn: 0.6712705	test: 0.6682682	best: 0.6682682 (2)	total: 1.44s	remaining: 46.5s
3:	learn: 0.6679656	test: 0.6644342	best: 0.6644342 (3)	total: 1.78s	remaining: 42.8s
4:	learn: 0.6657423	test: 0.6617106	best: 0.6617106 (4)	total: 2.13s	remaining: 40.5s
5:	learn: 0.6637888	test: 0.6595078	best: 0.6595078 (5)	total: 2.54s	remaining: 39.8s
6:	learn: 0.6623079	test: 0.6578207	best: 0.6578207 (6)	total: 2.88s	remaining: 38.2s
7:	learn: 0.6608790	test: 0.6562588	best: 0.6562588 (7)	total: 3.29s	remaining: 37.9s
8:	learn: 0.6597525	test: 0.6549824	best: 0.6549824 (8)	total: 3.7s	remaining: 37.4s
9:	learn: 0.6586010	test: 0.6536451	best: 0.6536451 (9)	total: 4.07s	remaining: 36.6s
10:	learn: 0.6577092	test: 0.6527730	best: 0.6527730 (10)	total: 4.43s	remaining: 35.9s
11:	learn: 0.6570138	test: 0.65

<catboost.core.CatBoostClassifier at 0x7f3089c03a60>

In [None]:
# Each label now names the pool that is actually scored; the original printed the
# training-pool score as "Test" and the test-pool score as "Train".
print("\nTrain Accuracy : %.4f"%booster.score(train_data))
print("Test  Accuracy : %.4f"%booster.score(test_data))


Test  Accuracy : 0.6197
Train Accuracy : 0.6290
