In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn import metrics
import re
import random

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.models import load_model
from keras import initializers, regularizers, constraints, optimizers, layers
import torch


seed = 42
n_folds = 5

random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [3]:
import os

DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [4]:
class CFG:
  debug=False
  ver=5
  seed=42
  model="GRU"
  n_folds = 15
  target_col="y"

In [5]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

train["src"]="train"
test["src"]="test"

df = pd.concat([train,test],ignore_index=True)

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 7)


Unnamed: 0,id,title,year,abstract,keywords,y,src
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0,train
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0,train
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0,train


(6393, 6)


Unnamed: 0,id,title,year,abstract,keywords,src
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode...",test
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r...",test
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine...",test


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [6]:
#train["full_text"] = train["title"] + " " + train["abstract"]
#test["full_text"] = test["title"] + " " + test["abstract"]

train["full_text"] = train["abstract"]
test["full_text"] = test["abstract"]

In [7]:
if CFG.debug:
  print(train.shape)
  print(test.shape)
  train = train.sample(n=500, random_state=CFG.seed).reset_index(drop=True)
  test = test.sample(n=500, random_state=CFG.seed).reset_index(drop=True)
  print(train.shape)
  print(test.shape)

In [8]:
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
for i,(train_index, val_index) in enumerate(skf.split(train,train[CFG.target_col])):
    train.loc[val_index,'fold'] = i

print('Train samples per fold:')
train["fold"] = train["fold"].astype(int)
display(train.groupby("fold").size())

Train samples per fold:


fold
0     332
1     332
2     332
3     332
4     332
5     332
6     332
7     332
8     332
9     331
10    331
11    331
12    331
13    331
14    331
dtype: int64

In [9]:
## some config values 
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

oof_df = pd.DataFrame()
preds = np.zeros((len(test),1))
for fold in range(CFG.n_folds):
  print("="*10+f" fold = {fold} "+"="*10)

  train_df = train.loc[train.fold!=fold]
  val_df = train.loc[train.fold==fold]

  ## fill up the missing values
  train_X = train_df["full_text"].fillna("_na_").values
  val_X = val_df["full_text"].fillna("_na_").values
  test_X = test["full_text"].fillna("_na_").values

  ## Tokenize the sentences
  tokenizer = Tokenizer(num_words=max_features)
  tokenizer.fit_on_texts(list(train_X))
  train_X = tokenizer.texts_to_sequences(train_X)
  val_X = tokenizer.texts_to_sequences(val_X)
  test_X = tokenizer.texts_to_sequences(test_X)

  ## Pad the sentences 
  train_X = pad_sequences(train_X, maxlen=maxlen)
  val_X = pad_sequences(val_X, maxlen=maxlen)
  test_X = pad_sequences(test_X, maxlen=maxlen)

  ## Get the target values
  train_y = train_df['y'].values
  val_y = val_df['y'].values

  inp = Input(shape=(maxlen,))
  x = Embedding(max_features, embed_size)(inp)
  x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
  x = GlobalMaxPool1D()(x)
  x = Dense(16, activation="relu")(x)
  x = Dropout(0.1)(x)
  x = Dense(1, activation="sigmoid")(x)
  model = Model(inputs=inp, outputs=x)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  model.fit(train_X, train_y, batch_size=512, epochs=3, validation_data=(val_X, val_y))

  pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)
  pred_noemb_test_y = model.predict([test_X], batch_size=1024, verbose=1)

  val_df['pred_values'] = pred_noemb_val_y
  preds+=pred_noemb_test_y

  oof_df = pd.concat([oof_df, val_df])

  #model.save(OUTPUT_MODEL_DIR+f'GRU_model_fold{fold}.h5')

  del model,train_df,val_df,train_X,val_X,test_X,pred_noemb_val_y,pred_noemb_test_y

Epoch 1/3
Epoch 2/3
Epoch 3/3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df['pred_values'] = pred_noemb_val_y


Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3



Epoch 2/3
Epoch 3/3
Epoch 1/3



Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [10]:
from sklearn.metrics import accuracy_score
best_score = 0
best_thresh = 0.5
for thresh in np.arange(0.1, 0.70, 0.01):
    thresh = np.round(thresh, 2)
    score = accuracy_score(oof_df[CFG.target_col], (oof_df["pred_values"]>thresh).astype(int))
    print("Accuracy score at threshold {0} is {1}".format(thresh, score))
    if score > best_score:
      best_score = score
      best_thresh = thresh
print()
print("best Accuracy score at threshold {0} is {1}".format(best_thresh, accuracy_score(oof_df[CFG.target_col], (oof_df.pred_values>best_thresh).astype(int))))

Accuracy score at threshold 0.1 is 0.30619219943707277
Accuracy score at threshold 0.11 is 0.30619219943707277
Accuracy score at threshold 0.12 is 0.30619219943707277
Accuracy score at threshold 0.13 is 0.30619219943707277
Accuracy score at threshold 0.14 is 0.30619219943707277
Accuracy score at threshold 0.15 is 0.30619219943707277
Accuracy score at threshold 0.16 is 0.30619219943707277
Accuracy score at threshold 0.17 is 0.30619219943707277
Accuracy score at threshold 0.18 is 0.30619219943707277
Accuracy score at threshold 0.19 is 0.30619219943707277
Accuracy score at threshold 0.2 is 0.30619219943707277
Accuracy score at threshold 0.21 is 0.30619219943707277
Accuracy score at threshold 0.22 is 0.30619219943707277
Accuracy score at threshold 0.23 is 0.30619219943707277
Accuracy score at threshold 0.24 is 0.30619219943707277
Accuracy score at threshold 0.25 is 0.30719742661841576
Accuracy score at threshold 0.26 is 0.30900683554483316
Accuracy score at threshold 0.27 is 0.316043425814

In [11]:
test_pred = preds / CFG.n_folds
sub = test.copy()
sub[CFG.target_col] = test_pred.T[0]
sub[CFG.target_col] = (sub[CFG.target_col]>best_thresh).astype(int)


sub[["id","y"]].to_csv(os.path.join(OUTPUT_DIR, f'submit_{CFG.model}_seed{CFG.seed}_ver{CFG.ver}.csv'),index=False)
display(sub[["id","y"]])
display(sub.y.value_counts())

Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
6388,6389,0
6389,6390,0
6390,6391,0
6391,6392,0


0    6377
1      16
Name: y, dtype: int64