In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece
!nvidia-smi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m103.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, https://u

In [3]:
import os

DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [4]:
class CFG:
  debug=False
  ver=3
  seed=42
  model="lightgbm"
  n_folds = 15
  target_col="y"

In [5]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

train["src"]="train"
test["src"]="test"

df = pd.concat([train,test],ignore_index=True)

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 7)


Unnamed: 0,id,title,year,abstract,keywords,y,src
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0,train
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0,train
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0,train


(6393, 6)


Unnamed: 0,id,title,year,abstract,keywords,src
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode...",test
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r...",test
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine...",test


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [6]:
train.isnull().sum()

id            0
title         0
year          0
abstract      0
keywords    480
y             0
src           0
dtype: int64

In [7]:
test.isnull().sum()

id            0
title         0
year          0
abstract      0
keywords    775
src           0
dtype: int64

In [8]:
train["num_title_word"] = train["title"].apply(lambda x: len(x.split()))
train["num_abst_word"] = train["abstract"].apply(lambda x: len(x.split()))

test["num_title_word"] = test["title"].apply(lambda x: len(x.split()))
test["num_abst_word"] = test["abstract"].apply(lambda x: len(x.split()))

In [9]:
#train["full_text"] = train["title"] + "[SEP]" + train["abstract"]
#test["full_text"] = test["title"] + "[SEP]" + test["abstract"]

train["full_text"] = train["title"]
test["full_text"] = test["title"]

In [10]:
if CFG.debug:
  print(train.shape)
  print(test.shape)
  train = train.sample(n=500, random_state=CFG.seed).reset_index(drop=True)
  test = test.sample(n=500, random_state=CFG.seed).reset_index(drop=True)
  print(train.shape)
  print(test.shape)

In [11]:
from transformers import AutoModel,AutoTokenizer
import torch
import torch.nn.functional as F
from tqdm import tqdm
import gc
import random

In [12]:
def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [13]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state.detach().cpu()
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )

In [14]:
BATCH_SIZE = 3

class EmbedDataset(torch.utils.data.Dataset):
    def __init__(self,df):
        self.df = df.reset_index(drop=True)
    def __len__(self):
        return len(self.df)
    def __getitem__(self,idx):
        text = self.df.loc[idx,"full_text"]
        tokens = tokenizer(
                text,
                None,
                add_special_tokens=True,
                padding='max_length',
                truncation=True,
                max_length=MAX_LEN,return_tensors="pt")
        tokens = {k:v.squeeze(0) for k,v in tokens.items()}
        return tokens

ds_tr = EmbedDataset(train)
embed_dataloader_tr = torch.utils.data.DataLoader(ds_tr,\
                        batch_size=BATCH_SIZE,\
                        shuffle=False)
ds_te = EmbedDataset(test)
embed_dataloader_te = torch.utils.data.DataLoader(ds_te,\
                        batch_size=BATCH_SIZE,\
                        shuffle=False)

In [15]:
tokenizer = None
MAX_LEN = 640

def get_embeddings(MODEL_NM='', MAX=640, BATCH_SIZE=3, verbose=True):
    global tokenizer, MAX_LEN
    DEVICE="cuda"
    model = AutoModel.from_pretrained( MODEL_NM )
    tokenizer = AutoTokenizer.from_pretrained( MODEL_NM )
    MAX_LEN = MAX
    
    model = model.to(DEVICE)
    model.eval()
        
    all_train_text_feats = []
    for batch in tqdm(embed_dataloader_tr,total=len(embed_dataloader_tr)):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        with torch.no_grad():
            model_output = model(input_ids=input_ids,attention_mask=attention_mask)
        sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
        # Normalize the embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        sentence_embeddings =  sentence_embeddings.squeeze(0).detach().cpu().numpy()
        all_train_text_feats.extend(sentence_embeddings)
    all_train_text_feats = np.array(all_train_text_feats)
    if verbose:
        print('Train embeddings shape',all_train_text_feats.shape)

    te_text_feats = []
    for batch in tqdm(embed_dataloader_te,total=len(embed_dataloader_te)):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        with torch.no_grad():
            model_output = model(input_ids=input_ids,attention_mask=attention_mask)
        sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
        # Normalize the embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        sentence_embeddings =  sentence_embeddings.squeeze(0).detach().cpu().numpy()
        te_text_feats.extend(sentence_embeddings)
    te_text_feats = np.array(te_text_feats)
    if verbose:
        print('Test embeddings shape',te_text_feats.shape)
        
    return all_train_text_feats, te_text_feats

In [16]:
MODEL_NM = 'microsoft/deberta-base'
all_train_text_feats, te_text_feats = get_embeddings(MODEL_NM)

Downloading (…)lve/main/config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

100%|██████████| 1658/1658 [01:34<00:00, 17.52it/s]


Train embeddings shape (4974, 768)


100%|██████████| 2131/2131 [01:57<00:00, 18.18it/s]

Test embeddings shape (6393, 768)





In [17]:
MODEL_NM = 'microsoft/deberta-v3-large'
all_train_text_feats2, te_text_feats2 = get_embeddings(MODEL_NM)

Downloading (…)lve/main/config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)"spm.model";:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 1658/1658 [04:11<00:00,  6.60it/s]


Train embeddings shape (4974, 1024)


100%|██████████| 2131/2131 [05:21<00:00,  6.62it/s]

Test embeddings shape (6393, 1024)





In [18]:
MODEL_NM = 'microsoft/deberta-large'
all_train_text_feats3, te_text_feats3 = get_embeddings(MODEL_NM)

Downloading (…)lve/main/config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

100%|██████████| 1658/1658 [04:15<00:00,  6.49it/s]


Train embeddings shape (4974, 1024)


100%|██████████| 2131/2131 [05:28<00:00,  6.50it/s]

Test embeddings shape (6393, 1024)





In [19]:
#MODEL_NM = 'microsoft/deberta-large-mnli'
#all_train_text_feats4, te_text_feats4 = get_embeddings(MODEL_NM, MAX=512)

In [20]:
MODEL_NM = 'microsoft/deberta-xlarge'
all_train_text_feats5, te_text_feats5 = get_embeddings(MODEL_NM, MAX=512)

Downloading (…)lve/main/config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

100%|██████████| 1658/1658 [06:31<00:00,  4.23it/s]


Train embeddings shape (4974, 1024)


100%|██████████| 2131/2131 [08:23<00:00,  4.23it/s]

Test embeddings shape (6393, 1024)





In [21]:
#MODEL_NM = 'microsoft/deberta-v2-xlarge'
#all_train_text_feats6, te_text_feats6 = get_embeddings(MODEL_NM, MAX=512)

In [22]:
MODEL_NM = 'microsoft/deberta-v3-base'
all_train_text_feats7, te_text_feats7 = get_embeddings(MODEL_NM)

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)"spm.model";:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 1658/1658 [01:29<00:00, 18.44it/s]


Train embeddings shape (4974, 768)


100%|██████████| 2131/2131 [01:55<00:00, 18.45it/s]

Test embeddings shape (6393, 768)





In [23]:
all_train_text_feats = np.concatenate([all_train_text_feats,all_train_text_feats2,
                                       all_train_text_feats3,#all_train_text_feats4,
                                       all_train_text_feats5, #all_train_text_feats6,
                                     all_train_text_feats7],axis=1)

te_text_feats = np.concatenate([te_text_feats,te_text_feats2,
                                te_text_feats3,#te_text_feats4,
                                te_text_feats5,#te_text_feats6,
                                te_text_feats7],axis=1)

del all_train_text_feats2, te_text_feats2
del all_train_text_feats3, te_text_feats3
#del all_train_text_feats4, te_text_feats4
del all_train_text_feats5, te_text_feats5
#del all_train_text_feats6, te_text_feats6
del all_train_text_feats7, te_text_feats7
gc.collect()

print('Our concatenated embeddings have shape', all_train_text_feats.shape )

Our concatenated embeddings have shape (4974, 4608)


In [24]:
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
for i,(train_index, val_index) in enumerate(skf.split(train,train[CFG.target_col])):
    train.loc[val_index,'fold'] = i

print('Train samples per fold:')
train["fold"] = train["fold"].astype(int)
display(train.groupby("fold").size())

Train samples per fold:


fold
0     332
1     332
2     332
3     332
4     332
5     332
6     332
7     332
8     332
9     331
10    331
11    331
12    331
13    331
14    331
dtype: int64

In [25]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

oof_df = pd.DataFrame()
scores = []
preds = []
for fold in range(CFG.n_folds):
    print('#'*25)
    print('### Fold',fold+1)
    print('#'*25)
    
    tr_ = train[train["fold"]!=fold]
    ev_ = train[train["fold"]==fold]
    
    tr_feats = all_train_text_feats[list(tr_.index)]
    ev_feats = all_train_text_feats[list(ev_.index)]

    clf = SVC(C=1, gamma='auto', probability=True, random_state=CFG.seed)
    clf.fit(tr_feats, tr_[CFG.target_col].values)
    ev_preds = clf.predict_proba(ev_feats)
    ev_["pred_values"] = ev_preds[:,1]
    oof_df = pd.concat([oof_df, ev_])

    test_pred = clf.predict_proba(te_text_feats)
    print()
    score = accuracy_score(ev_[CFG.target_col].values, (ev_["pred_values"]>.5).astype(int))
    scores.append(score)
    print("Fold : {} Accuracy score: {}".format(fold,score))
    preds.append(test_pred[:,1])
    
print('#'*25)
print('Overall CV Accuracy =',np.mean(scores))
print('OOF CV Accuracy = ',accuracy_score(oof_df[CFG.target_col].values, (oof_df["pred_values"]>.5).astype(int)))

#########################
### Fold 1
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 0 Accuracy score: 0.6957831325301205
#########################
### Fold 2
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 1 Accuracy score: 0.6927710843373494
#########################
### Fold 3
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 2 Accuracy score: 0.6927710843373494
#########################
### Fold 4
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 3 Accuracy score: 0.6927710843373494
#########################
### Fold 5
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 4 Accuracy score: 0.6927710843373494
#########################
### Fold 6
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 5 Accuracy score: 0.6927710843373494
#########################
### Fold 7
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 6 Accuracy score: 0.6927710843373494
#########################
### Fold 8
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 7 Accuracy score: 0.6927710843373494
#########################
### Fold 9
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 8 Accuracy score: 0.6927710843373494
#########################
### Fold 10
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 9 Accuracy score: 0.6948640483383686
#########################
### Fold 11
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 10 Accuracy score: 0.6948640483383686
#########################
### Fold 12
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 11 Accuracy score: 0.6948640483383686
#########################
### Fold 13
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 12 Accuracy score: 0.6948640483383686
#########################
### Fold 14
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 13 Accuracy score: 0.6948640483383686
#########################
### Fold 15
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]



Fold : 14 Accuracy score: 0.6948640483383686
#########################
Overall CV Accuracy = 0.6938090731506085
OOF CV Accuracy =  0.6938078005629272


In [26]:
best_score = 0
best_thresh = 0.5
for thresh in np.arange(0.1, 0.70, 0.01):
    thresh = np.round(thresh, 2)
    score = accuracy_score(oof_df[CFG.target_col], (oof_df["pred_values"]>thresh).astype(int))
    print("Accuracy score at threshold {0} is {1}".format(thresh, score))
    if score > best_score:
      best_score = score
      best_thresh = thresh
print()
print("best Accuracy score at threshold {0} is {1}".format(best_thresh, accuracy_score(oof_df[CFG.target_col], (oof_df.pred_values>best_thresh).astype(int))))

Accuracy score at threshold 0.1 is 0.30619219943707277
Accuracy score at threshold 0.11 is 0.30619219943707277
Accuracy score at threshold 0.12 is 0.30619219943707277
Accuracy score at threshold 0.13 is 0.30619219943707277
Accuracy score at threshold 0.14 is 0.30619219943707277
Accuracy score at threshold 0.15 is 0.30619219943707277
Accuracy score at threshold 0.16 is 0.30619219943707277
Accuracy score at threshold 0.17 is 0.30619219943707277
Accuracy score at threshold 0.18 is 0.30619219943707277
Accuracy score at threshold 0.19 is 0.30619219943707277
Accuracy score at threshold 0.2 is 0.30619219943707277
Accuracy score at threshold 0.21 is 0.30619219943707277
Accuracy score at threshold 0.22 is 0.30639324487334135
Accuracy score at threshold 0.23 is 0.30719742661841576
Accuracy score at threshold 0.24 is 0.30840369923602734
Accuracy score at threshold 0.25 is 0.3110172899075191
Accuracy score at threshold 0.26 is 0.3182549256131886
Accuracy score at threshold 0.27 is 0.33514274225975

In [27]:
test_pred = np.mean(preds,axis=0)
sub = pd.DataFrame({'y': test_pred})
sub[CFG.target_col] = (sub[CFG.target_col]>best_thresh).astype(int)
sub.reset_index(drop=True)
sub.index += 1
sub.index.name = 'id'

sub.to_csv(os.path.join(OUTPUT_DIR, f'submit_{CFG.model}_seed{CFG.seed}_ver{CFG.ver}.csv'))
display(sub)
display(sub.y.value_counts())

Unnamed: 0_level_0,y
id,Unnamed: 1_level_1
1,0
2,0
3,0
4,0
5,0
...,...
6389,0
6390,0
6391,0
6392,0


0    6392
1       1
Name: y, dtype: int64