In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [3]:
class CFG:
  ver=2
  seed=42
  model="lightgbm"
  n_folds = 25
  target_col="y"

In [4]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

tfidf_title_svd = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("svd", TruncatedSVD(n_components=8))
])
tfidf_abst_svd = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("svd", TruncatedSVD(n_components=128))
])

title_tr_vec = tfidf_title_svd.fit_transform(train["title"].fillna(''))
title_te_vec = tfidf_title_svd.transform(test["title"].fillna(''))

abst_tr_vec = tfidf_abst_svd.fit_transform(train["abstract"].fillna(''))
abst_te_vec = tfidf_abst_svd.transform(test["abstract"].fillna(''))

In [6]:
#tr_kyewords_count = np.array(train.keywords.str.split(',').apply(pd.Series).count(axis=1).values)
#te_kyewords_count = np.array(test.keywords.str.split(',').apply(pd.Series).count(axis=1).values)

In [7]:
all_train_feats = np.concatenate([title_tr_vec, abst_tr_vec],axis=1)
all_test_feats = np.concatenate([title_te_vec, abst_te_vec],axis=1)

print(all_train_feats.shape)

(4974, 136)


In [8]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
for i,(train_index, val_index) in enumerate(skf.split(train,train[CFG.target_col])):
    train.loc[val_index,'fold'] = i

print('Train samples per fold:')
train["fold"] = train["fold"].astype(int)
display(train.groupby("fold").size())

Train samples per fold:


fold
0     199
1     199
2     199
3     199
4     199
5     199
6     199
7     199
8     199
9     199
10    199
11    199
12    199
13    199
14    199
15    199
16    199
17    199
18    199
19    199
20    199
21    199
22    199
23    199
24    198
dtype: int64

In [9]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score

params = {
        'objective': "binary",
        'metric': 'binary_logloss',
        'learning_rate': 0.1,
        'boosting': "gbdt",
        'seed': CFG.seed,
        "reg_alpha":0.1,
        "n_estimators":1000
    }

oof_df = pd.DataFrame()
scores = []
preds = []
for fold in range(CFG.n_folds):
    print('#'*25)
    print('### Fold',fold+1)
    print('#'*25)
    
    tr_ = train[train["fold"]!=fold]
    ev_ = train[train["fold"]==fold]
    
    tr_feats = all_train_feats[list(tr_.index)]
    ev_feats = all_train_feats[list(ev_.index)]

    clf = lgb.LGBMClassifier(**params)
    clf.fit(tr_feats, tr_[CFG.target_col].values,
              eval_set=[(ev_feats, ev_[CFG.target_col].values)],
              verbose = 200,
              early_stopping_rounds=50)
    ev_preds = clf.predict_proba(ev_feats)
    ev_["pred_values"] = ev_preds[:,1]
    oof_df = pd.concat([oof_df, ev_])

    test_pred = clf.predict_proba(all_test_feats)
    print()
    score = accuracy_score(ev_[CFG.target_col].values, (ev_["pred_values"]>.5).astype(int))
    scores.append(score)
    print("Fold : {} Accuracy score: {}".format(fold,score))
    preds.append(test_pred[:,1])
    
print('#'*25)
print('Overall CV Accuracy =',np.mean(scores))
print('OOF CV Accuracy = ',accuracy_score(oof_df[CFG.target_col].values, (oof_df["pred_values"]>.5).astype(int)))

#########################
### Fold 1
#########################
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[3]	valid_0's binary_logloss: 0.607341

Fold : 0 Accuracy score: 0.6984924623115578
#########################
### Fold 2
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[3]	valid_0's binary_logloss: 0.615024

Fold : 1 Accuracy score: 0.6934673366834171
#########################
### Fold 3
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[36]	valid_0's binary_logloss: 0.599665

Fold : 2 Accuracy score: 0.6984924623115578
#########################
### Fold 4
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[20]	valid_0's binary_logloss: 0.596744

Fold : 3 Accuracy score: 0.6934673366834171
#########################
### Fold 5
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[15]	valid_0's binary_logloss: 0.588194

Fold : 4 Accuracy score: 0.7085427135678392
#########################
### Fold 6
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.61626

Fold : 5 Accuracy score: 0.6934673366834171
#########################
### Fold 7
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[5]	valid_0's binary_logloss: 0.599091

Fold : 6 Accuracy score: 0.6934673366834171
#########################
### Fold 8
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[7]	valid_0's binary_logloss: 0.613022

Fold : 7 Accuracy score: 0.6934673366834171
#########################
### Fold 9
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[3]	valid_0's binary_logloss: 0.612521

Fold : 8 Accuracy score: 0.6934673366834171
#########################
### Fold 10
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[40]	valid_0's binary_logloss: 0.595717

Fold : 9 Accuracy score: 0.6934673366834171
#########################
### Fold 11
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[13]	valid_0's binary_logloss: 0.610684

Fold : 10 Accuracy score: 0.6984924623115578
#########################
### Fold 12
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[36]	valid_0's binary_logloss: 0.598557

Fold : 11 Accuracy score: 0.678391959798995
#########################
### Fold 13
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[31]	valid_0's binary_logloss: 0.596168

Fold : 12 Accuracy score: 0.7185929648241206
#########################
### Fold 14
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.616682

Fold : 13 Accuracy score: 0.6934673366834171
#########################
### Fold 15
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[42]	valid_0's binary_logloss: 0.587396

Fold : 14 Accuracy score: 0.7135678391959799
#########################
### Fold 16
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[26]	valid_0's binary_logloss: 0.598052

Fold : 15 Accuracy score: 0.6934673366834171
#########################
### Fold 17
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[8]	valid_0's binary_logloss: 0.611595

Fold : 16 Accuracy score: 0.6934673366834171
#########################
### Fold 18
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[19]	valid_0's binary_logloss: 0.611641

Fold : 17 Accuracy score: 0.6934673366834171
#########################
### Fold 19
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[13]	valid_0's binary_logloss: 0.607355

Fold : 18 Accuracy score: 0.6934673366834171
#########################
### Fold 20
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[6]	valid_0's binary_logloss: 0.607055

Fold : 19 Accuracy score: 0.6934673366834171
#########################
### Fold 21
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[41]	valid_0's binary_logloss: 0.610448

Fold : 20 Accuracy score: 0.6934673366834171
#########################
### Fold 22
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[22]	valid_0's binary_logloss: 0.598101

Fold : 21 Accuracy score: 0.6834170854271356
#########################
### Fold 23
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[4]	valid_0's binary_logloss: 0.616167

Fold : 22 Accuracy score: 0.6934673366834171
#########################
### Fold 24
#########################
Training until validation scores don't improve for 50 rounds.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Early stopping, best iteration is:
[10]	valid_0's binary_logloss: 0.606283

Fold : 23 Accuracy score: 0.6934673366834171
#########################
### Fold 25
#########################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[16]	valid_0's binary_logloss: 0.607199

Fold : 24 Accuracy score: 0.6868686868686869
#########################
Overall CV Accuracy = 0.6952134409420841
OOF CV Accuracy =  0.6952151186168074


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ev_["pred_values"] = ev_preds[:,1]


In [10]:
best_score = 0
best_thresh = 0.5
for thresh in np.arange(0.1, 0.70, 0.01):
    thresh = np.round(thresh, 2)
    score = accuracy_score(oof_df[CFG.target_col], (oof_df["pred_values"]>thresh).astype(int))
    print("Accuracy score at threshold {0} is {1}".format(thresh, score))
    if score > best_score:
      best_score = score
      best_thresh = thresh
print()
print("best Accuracy score at threshold {0} is {1}".format(best_thresh, accuracy_score(oof_df[CFG.target_col], (oof_df.pred_values>best_thresh).astype(int))))

Accuracy score at threshold 0.1 is 0.30639324487334135
Accuracy score at threshold 0.11 is 0.30699638118214717
Accuracy score at threshold 0.12 is 0.30719742661841576
Accuracy score at threshold 0.13 is 0.30739847205468435
Accuracy score at threshold 0.14 is 0.3086047446722959
Accuracy score at threshold 0.15 is 0.3106151990349819
Accuracy score at threshold 0.16 is 0.31443506232408525
Accuracy score at threshold 0.17 is 0.317048652995577
Accuracy score at threshold 0.18 is 0.3234821069561721
Accuracy score at threshold 0.19 is 0.3281061519903498
Accuracy score at threshold 0.2 is 0.3365500603136309
Accuracy score at threshold 0.21 is 0.3474065138721351
Accuracy score at threshold 0.22 is 0.35605146763168477
Accuracy score at threshold 0.23 is 0.3691194209891435
Accuracy score at threshold 0.24 is 0.3864093285082429
Accuracy score at threshold 0.25 is 0.40450341777241655
Accuracy score at threshold 0.26 is 0.42923200643345394
Accuracy score at threshold 0.27 is 0.4529553679131484
Accur

In [11]:
sub = test.copy()
sub.loc[:,CFG.target_col] = np.mean(preds,axis=0)
sub[CFG.target_col] = (sub[CFG.target_col]>best_thresh).astype(int)
sub_columns = sample_sub.columns
sub = sub[sub_columns]

sub.to_csv(os.path.join(OUTPUT_DIR, f'submit_{CFG.model}_seed{CFG.seed}_ver{CFG.ver}.csv'))
display(sub)
display(sub.y.value_counts())

Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
6388,6389,0
6389,6390,0
6390,6391,0
6391,6392,0


0    6393
Name: y, dtype: int64