In [1]:
#Kailidis Kyrillos AM:4680
#Kerasovitis Ilias AM:4699

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import numpy as np
import random
from tqdm import tqdm 
from joblib import Parallel, delayed
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score,roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from node2vec import Node2Vec
import xgboost as xgb
import torch
import os
from sklearn.ensemble import StackingClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.metrics import AUC
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load Data
print("Loading data...")

# abstract.txt
abstracts = pd.read_csv("abstracts.txt",sep = r"\|--\|" , names=["id", "abstract"] , engine="python")
abstracts = abstracts.dropna(subset=["abstract"])

# authors.txt
authors = pd.read_csv("authors.txt", sep = r"\|--\|" , names = ["id","author"] , engine = "python")
authors["author"] = authors["author"].str.split(",")
authors = authors.explode("author").dropna(subset=["author"])
authors["author"] = authors["author"].str.strip()
# edgelist.txt
edgelist = pd.read_csv("edgelist.txt", sep = "," , names = ["source_id"  , "target_id"] , engine = "python")


print("Data loaded")

Loading data...
Data loaded


In [4]:
#Train graph creation

#List with all the edges 
edges = list(pd.read_csv("edgelist.txt", sep=",", names=["source_id", "target_id"]).itertuples(index=False, name=None))

#Splitting train edges and validation edges
train_edges, val_edges = train_test_split(edges, test_size=0.2, random_state=7)

#Set with all the nodes in the edgelist
nodes = set()
for source_id , target_id in edges:
    nodes.add(source_id)
    nodes.add(target_id)

#Graph creation with all the nodes and the train edges only     
graph = nx.DiGraph()
graph.add_nodes_from(nodes)
graph.add_edges_from(train_edges)
un_graph = graph.to_undirected()
print("Graph created")


Graph created


In [5]:
#Node2vec embeddings
n2v_file = "node2vec_embeddings.kv"
#If file exists load it or else compute the embeddings and save them 
if os.path.exists(n2v_file):
    print("Loading node2vec embeddings...")
    n2v_emb = KeyedVectors.load(n2v_file)
else:
    print("Computing node2vec embeddings...")   

    #Generate random walks 
    node2vec = Node2Vec(un_graph, dimensions=128, walk_length=20, num_walks=20, workers=1)
    walks = node2vec.walks
    
    #Treat walks as words and compute the embeddings 
    model = Word2Vec(   
        sentences=walks,
        vector_size=64,
        window=10,
        min_count=1,
        batch_words=4,
        sg=1  
    )
    model.wv.save(n2v_file)
    n2v_emb = KeyedVectors.load(n2v_file)


Loading node2vec embeddings...


In [6]:
#PCA for node2vec embeddings
paper_ids = list(n2v_emb.index_to_key)
embeddings = np.array([n2v_emb[node] for node in paper_ids])

pca = PCA()
pca.fit(embeddings)

var = np.cumsum(pca.explained_variance_ratio_)
n_comp = np.searchsorted(var, 0.95) + 1

print(f"PCA: reduced embeddings vector to {n_comp} components keeping 95% of variance")

pca = PCA(n_components=n_comp)
pca_n2v_emb = pca.fit_transform(embeddings)

size = pca_n2v_emb.shape[1]
n2v_emb = KeyedVectors(vector_size=size)
n2v_emb.add_vectors(paper_ids, pca_n2v_emb)

PCA: reduced embeddings vector to 59 components keeping 95% of variance


In [7]:
#BERT EMBEDDINGS 

#Check for supported nvidia gpu or else cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bert_file = "bert_embeddings.npy"
#If file exists load it or else compute the embeddings and save them 

if os.path.exists(bert_file):
    print("Loading precomputed BERT embeddings...")
    embeddings = np.load(bert_file, allow_pickle=True)
else:
    print("Computing BERT embeddings...")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased").to(device).eval()

    def get_bert_embeddings(texts, batch_size=8):
        embeddings = []
        for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"): #tqdm for progress bar 
            batch_texts = texts[i:i+batch_size]
            encodings = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
            input_ids = encodings["input_ids"].to(device, non_blocking=True)
            attention_mask = encodings["attention_mask"].to(device, non_blocking=True)
            with torch.no_grad():
                with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
                    outputs = model(input_ids, attention_mask=attention_mask)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            embeddings.append(batch_embeddings)
        return np.vstack(embeddings)

    #Compute embeddings for all abstracts and save to file
    embeddings = get_bert_embeddings(abstracts["abstract"].tolist(), batch_size=8)
    np.save(bert_file, embeddings)



Loading precomputed BERT embeddings...


In [8]:
#PCA for bert embeddings

pca = PCA()
pca.fit(embeddings)

var = np.cumsum(pca.explained_variance_ratio_)
n_comp = np.searchsorted(var, 0.95) + 1

print(f"PCA: reduced embeddings vector to {n_comp} components keeping 95% of variance")

pca = PCA(n_components=n_comp)
pca_bert_emb = pca.fit_transform(embeddings)
abstracts["bert_embedding"] = list(pca_bert_emb)


PCA: reduced embeddings vector to 225 components keeping 95% of variance


In [9]:
#Functions
print("functions cell")
def generate_negative_pairs(article_ids, positive_pairs, num_neg_samples):  #Generates random pairs 
    positive_set = set(positive_pairs)
    article_ids = list(article_ids)
    negative_pairs = set()

    while len(negative_pairs) < num_neg_samples:
        u = random.choice(article_ids)
        v = random.choice(article_ids)

        if u == v:  #If u and v is the same node skip it 

            continue  
        pair = (u, v)

        if pair not in positive_set and pair[::-1] not in positive_set: #If theres no edge between the 2 nodes include them in the negative pairs 

            negative_pairs.add(pair)

    return list(negative_pairs)

def cosine_sim(p1_id, p2_id):   #Cosine similarity of bert embeddins 
    if p1_id in abstracts['id'].values and p2_id in abstracts['id'].values:
        p1 = abstracts.loc[abstracts['id'] == p1_id, 'bert_embedding'].iloc[0]
        p2 = abstracts.loc[abstracts['id'] == p2_id, 'bert_embedding'].iloc[0]
        return cosine_similarity(p1.reshape(1, -1), p2.reshape(1, -1))[0][0]    #Convert it to 2d and compute cosine similarity 
    return 0

def n2v_cosine_sim(p1_id , p2_id):  #Cosine similarity of Node2Vec embeddings 
    if str(p1_id) in n2v_emb and str(p2_id) in n2v_emb:    
        p1 = n2v_emb[str(p1_id)]
        p2 = n2v_emb[str(p2_id)]
        return cosine_similarity([p1], [p2])[0][0]
    return 0

    
pagerank = nx.pagerank(un_graph)

def pagerank_sum(p1_id, p2_id):    #Compute pagerank 
    return pagerank.get(p1_id, 0) + pagerank.get(p2_id, 0)

def clustering_coefficient(p1_id, p2_id):   #Compute clustering coefficient 
    if un_graph.has_node(p1_id) and un_graph.has_node(p2_id):
        return nx.clustering(un_graph, p1_id) + nx.clustering(un_graph, p2_id)
    return 0    

def common_neighbors(p1_id, p2_id):    #Compute common neighbors 
    if un_graph.has_node(p1_id) and un_graph.has_node(p2_id):
        return len(list(nx.common_neighbors(un_graph, p1_id, p2_id)))
    return 0

def jaccard_coef(p1_id, p2_id):    #Compute jaccard coefficient 
    if un_graph.has_node(p1_id) and un_graph.has_node(p2_id):
        preds = list(nx.jaccard_coefficient(un_graph, [(p1_id, p2_id)]))
        if preds:
            return preds[0][2]
    return 0

def adamic_adar_index(p1_id, p2_id):    #Compute adamic adar 
    if un_graph.has_node(p1_id) and un_graph.has_node(p2_id):
        preds = list(nx.adamic_adar_index(un_graph, [(p1_id, p2_id)]))
        if preds:
            return preds[0][2]
    return 0


def authorship_overlap(p1_id, p2_id):   #Compute authorship overlap
    authors1 = set(authors.loc[authors['id'] == p1_id, 'author'].tolist())
    authors2 = set(authors.loc[authors['id'] == p2_id, 'author'].tolist())
    union = authors1 | authors2    #Compute the union to get the percentage of overlap 
    if len(union) == 0:
        return 0
    return len(authors1 & authors2) / len(union)

functions cell


In [10]:
def split_dataframe(df, num_chunks):    #Split the dataframe into num_chunks parts for parallel processing

    return np.array_split(df, num_chunks)

def compute_feature_parallel(feature_name, feature_func, df):
    print(f"Computing {feature_name} with all cores...") 
    chunks = split_dataframe(df, 4)    #Split the dataframe into 4 parts (4 core cpu)
    results = Parallel(n_jobs=4)(   # Assign each chunk to a separate core 
        delayed(lambda chunk: chunk.apply(
            lambda row: feature_func(row["p1"], row["p2"]), axis=1
        ))(chunk) for chunk in chunks
    )
    return pd.concat(results)

feature_tasks = {
    "n2v_cosine_sim": n2v_cosine_sim,
    "cosine_sim": cosine_sim,
    "pagerank_sum": pagerank_sum,
    "clustering_coefficient": clustering_coefficient,
    "common_neighbors": common_neighbors,
    "jaccard_coef": jaccard_coef,
    "adamic_adar_index": adamic_adar_index,
    "authorship_overlap": authorship_overlap,
}    

In [11]:
#Train data positive and negative sampling 
article_ids = list(graph.nodes())
neg_train_edges = generate_negative_pairs(article_ids, train_edges, len(train_edges))
train_pairs = train_edges + neg_train_edges
y_train = [1] * len(train_edges) + [0] * len(neg_train_edges)
y_train = np.array(y_train)

In [12]:
#Features for training 
print("Train feature computation")
train_file = "citation_features.pkl"

if os.path.exists(train_file):
    print("Loading training features from cache...")
    train_features = pd.read_pickle(train_file)
else:
    print("Computing training features...")

    train_features = pd.DataFrame(train_pairs, columns=["p1", "p2"])

    for feature_name, feature_func in feature_tasks.items():
        tqdm.pandas(desc=f"Computing {feature_name}")
        train_features[feature_name] = compute_feature_parallel(feature_name, feature_func, train_features)
        print(f"{feature_name} computation completed!")

    train_features.to_pickle(train_file)
    print("Training features saved to cache.")
   

Train feature computation
Loading training features from cache...


In [13]:
#Validation data positive and negative sampling 
neg_val_edges = generate_negative_pairs(article_ids, val_edges, len(val_edges))
val_pairs = val_edges + neg_val_edges
y_val = [1] * len(val_edges) + [0] * len(neg_val_edges)
y_val = np.array(y_val)


In [14]:
#Features for validation 
print("Validation feature computation")
validation_file = "val_features.pkl"

if os.path.exists(validation_file):
    print("Loading validation features from cache...")
    val_features = pd.read_pickle(validation_file)
else:
    print("Computing validation features...")

    val_features = pd.DataFrame(val_pairs, columns=["p1", "p2"])

    for feature_name, feature_func in feature_tasks.items():
        tqdm.pandas(desc=f"Computing {feature_name}")
        val_features[feature_name] = compute_feature_parallel(feature_name, feature_func, val_features)
        print(f"{feature_name} computation completed!")

    val_features.to_pickle(validation_file)
    print("Validation features saved to cache.")

Validation feature computation
Loading validation features from cache...


In [15]:
#Features for test
test_feature_file = "test_features.pkl"

if os.path.exists(test_feature_file):
    print("Loading test features from cache...")
    test_features = pd.read_pickle(test_feature_file)
else:
    print("Computing test features...")

    test = pd.read_csv("test.txt", sep=",", names=["p1", "p2"], engine="python")
    test_features = test.copy()

    for feature_name, feature_func in feature_tasks.items():
        tqdm.pandas(desc=f"Computing {feature_name}")
        test_features[feature_name] = compute_feature_parallel(feature_name, feature_func, test_features)
        print(f"{feature_name} computation completed!")

    test_features.to_pickle(test_feature_file)
    print("Test features saved to cache.")


Loading test features from cache...


In [16]:
#Train/val scalling 
X_train = train_features.drop(columns=["p1", "p2"])
X_val = val_features.drop(columns=["p1", "p2"])
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

X_val_scaled = scaler.transform(X_val)

In [17]:
#XGB hyperperameters tuning with hyperopt

#Hyperparameters range to work on 
space = {
    'max_depth': hp.quniform("max_depth", 2, 20, 1),
    'gamma': hp.uniform('gamma', 0, 9),
    'reg_alpha': hp.quniform('reg_alpha', 0, 200, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': 3000,
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.05),
    'random_state': 7
}

def objective(space):
    clf = xgb.XGBClassifier(
        n_estimators=int(space['n_estimators']),
        early_stopping_rounds = 100,    #early stopping in case loss doesnt decrease or getting worse 
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=space['reg_alpha'],
        reg_lambda=space['reg_lambda'],
        colsample_bytree=space['colsample_bytree'],
        min_child_weight=int(space['min_child_weight']),
        learning_rate=space['learning_rate'],
        eval_metric='logloss',
        random_state=space['random_state']
    )

    clf.fit(X_train_scaled, y_train,
            eval_set=[(X_val_scaled, y_val)],
            verbose=False)
    
    pred = clf.predict_proba(X_val_scaled)[:, 1]
    loss = log_loss(y_val, pred)
    
    return {'loss': loss, 'status': STATUS_OK}

trials = Trials()

best_hyperparams = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,   #Max 50 itterates for the algorithm
    trials=trials
)

print(best_hyperparams)


100%|██████████| 50/50 [51:45<00:00, 62.11s/trial, best loss: 0.20084655626114145]   
{'colsample_bytree': 0.6188751003181727, 'gamma': 3.5340649831797766, 'learning_rate': 0.001317232670413897, 'max_depth': 17.0, 'min_child_weight': 7.0, 'reg_alpha': 199.0, 'reg_lambda': 0.4462252302461981}


In [18]:
# XGB 
print("XGBoost")
xgb_model = xgb.XGBClassifier(
    n_estimators=3000,
    early_stopping_rounds=100,
    colsample_bytree = 0.502600880974819,
    gamma = 4.953562191448858,
    learning_rate =0.002880387319661854,
    max_depth = 16,
    min_child_weight = 5,
    reg_alpha = 166,
    reg_lambda = 0.7318649905880288,
    eval_metric="logloss",
    random_state=7
    )
eval_set = [(X_val_scaled, y_val)]

xgb_model.fit(
    X_train_scaled, y_train,
    eval_set=[(X_val_scaled, y_val)],
    verbose=True
)
xgb_val = xgb_model.predict_proba(X_val_scaled)[:, 1]

print(f"Log Loss: {log_loss(y_val, xgb_val):.4f}")
print(f"Accuracy: {accuracy_score(y_val, xgb_val > 0.5):.4f}")
print(f"AUC: {roc_auc_score(y_val, xgb_val):.4f}")

XGBoost
[0]	validation_0-logloss:0.69117
[1]	validation_0-logloss:0.68934
[2]	validation_0-logloss:0.68698
[3]	validation_0-logloss:0.68503
[4]	validation_0-logloss:0.68276
[5]	validation_0-logloss:0.68045
[6]	validation_0-logloss:0.67816
[7]	validation_0-logloss:0.67588
[8]	validation_0-logloss:0.67361
[9]	validation_0-logloss:0.67135
[10]	validation_0-logloss:0.66911
[11]	validation_0-logloss:0.66725
[12]	validation_0-logloss:0.66503
[13]	validation_0-logloss:0.66285
[14]	validation_0-logloss:0.66125
[15]	validation_0-logloss:0.65905
[16]	validation_0-logloss:0.65688
[17]	validation_0-logloss:0.65471
[18]	validation_0-logloss:0.65258
[19]	validation_0-logloss:0.65047
[20]	validation_0-logloss:0.64884
[21]	validation_0-logloss:0.64727
[22]	validation_0-logloss:0.64515
[23]	validation_0-logloss:0.64306
[24]	validation_0-logloss:0.64099
[25]	validation_0-logloss:0.63893
[26]	validation_0-logloss:0.63740
[27]	validation_0-logloss:0.63576
[28]	validation_0-logloss:0.63411
[29]	validation_

In [19]:
#Random Forest
print("Random Forest")

rf_model = RandomForestClassifier(n_estimators = 300 ,
                                  max_depth = 5 ,
                                  n_jobs = -1 ,
                                  criterion = 'log_loss',
                                  verbose = 2,
                                  random_state = 7)

rf_model.fit(X_train_scaled, y_train)
rf_val = rf_model.predict_proba(X_val_scaled)[:, 1]

print(f"Log Loss: {log_loss(y_val, rf_val):.4f}")
print(f"Accuracy: {accuracy_score(y_val, rf_val > 0.5):.4f}")
print(f"AUC: {roc_auc_score(y_val, rf_val):.4f}")


Random Forest


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 300building tree 2 of 300
building tree 3 of 300

building tree 4 of 300
building tree 5 of 300
building tree 6 of 300
building tree 7 of 300
building tree 8 of 300
building tree 9 of 300
building tree 10 of 300
building tree 11 of 300
building tree 12 of 300
building tree 13 of 300
building tree 14 of 300
building tree 15 of 300
building tree 16 of 300
building tree 17 of 300
building tree 18 of 300
building tree 19 of 300
building tree 20 of 300
building tree 21 of 300
building tree 22 of 300
building tree 23 of 300
building tree 24 of 300
building tree 25 of 300
building tree 26 of 300
building tree 27 of 300
building tree 28 of 300
building tree 29 of 300
building tree 30 of 300
building tree 31 of 300
building tree 32 of 300
building tree 33 of 300


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   21.3s


building tree 34 of 300
building tree 35 of 300
building tree 36 of 300
building tree 37 of 300
building tree 38 of 300
building tree 39 of 300
building tree 40 of 300
building tree 41 of 300
building tree 42 of 300
building tree 43 of 300
building tree 44 of 300
building tree 45 of 300
building tree 46 of 300
building tree 47 of 300
building tree 48 of 300
building tree 49 of 300
building tree 50 of 300
building tree 51 of 300
building tree 52 of 300
building tree 53 of 300
building tree 54 of 300
building tree 55 of 300
building tree 56 of 300
building tree 57 of 300
building tree 58 of 300
building tree 59 of 300
building tree 60 of 300
building tree 61 of 300
building tree 62 of 300
building tree 63 of 300
building tree 64 of 300
building tree 65 of 300
building tree 66 of 300
building tree 67 of 300
building tree 68 of 300
building tree 69 of 300
building tree 70 of 300
building tree 71 of 300
building tree 72 of 300
building tree 73 of 300
building tree 74 of 300
building tree 75

[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.7min


building tree 155 of 300
building tree 156 of 300
building tree 157 of 300
building tree 158 of 300
building tree 159 of 300
building tree 160 of 300
building tree 161 of 300
building tree 162 of 300
building tree 163 of 300
building tree 164 of 300
building tree 165 of 300
building tree 166 of 300
building tree 167 of 300
building tree 168 of 300
building tree 169 of 300
building tree 170 of 300
building tree 171 of 300
building tree 172 of 300
building tree 173 of 300
building tree 174 of 300
building tree 175 of 300
building tree 176 of 300
building tree 177 of 300
building tree 178 of 300
building tree 179 of 300
building tree 180 of 300
building tree 181 of 300
building tree 182 of 300
building tree 183 of 300
building tree 184 of 300
building tree 185 of 300
building tree 186 of 300
building tree 187 of 300
building tree 188 of 300
building tree 189 of 300
building tree 190 of 300
building tree 191 of 300
building tree 192 of 300
building tree 193 of 300
building tree 194 of 300


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.5min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    1.6s finished


Log Loss: 0.1932
Accuracy: 0.9283
AUC: 0.9803


In [20]:
#Neural Network
model_nn = Sequential([    #3-layer perceptron with L2 regularization, batch normalization, and dropout
    
    #First hidden layer: 16 units with relu
    Dense(16, activation='relu', kernel_regularizer=l2(0.1), input_shape=(X_train_scaled.shape[1],)),
    BatchNormalization(),   #
    Dropout(0.7),  
    
    #Second hidden layer: 8 units with relu 
    Dense(8, activation='relu', kernel_regularizer=l2(0.1)),
    BatchNormalization(),
    Dropout(0.5),
    
    #Third hidden layer: 4 units with relu
    Dense(4, activation='relu', kernel_regularizer=l2(0.1)),
    BatchNormalization(),
    Dropout(0.2),
    #Output layer: 1 unit with sigmoid  
    Dense(1, activation='sigmoid')  # Output for binary classification
])

model_nn.compile(
    optimizer=Adam(learning_rate=0.1), #Starting with 0.1 learning rate 
    loss='binary_crossentropy',
    metrics=[AUC(name='auc')]
)

callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1),   #Early stopping if the validation loss doesnt get better after 10 epochs 
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)    #Reduce learning rate after 3 rounds of validation loss not getting better  
]

model_nn.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=50,
    batch_size=512,  
    callbacks=callbacks,
    verbose=1
)

nn_val_probs = model_nn.predict(X_val_scaled).flatten()
print(f"Log Loss: {log_loss(y_val, nn_val_probs):.4f}")
print(f"Accuracy: {accuracy_score(y_val, nn_val_probs > 0.5):.4f}")
print(f"AUC: {roc_auc_score(y_val, nn_val_probs):.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m3413/3413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - auc: 0.9849 - loss: 0.3797 - val_auc: 0.9800 - val_loss: 0.3777 - learning_rate: 0.1000
Epoch 2/50
[1m3413/3413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - auc: 0.9896 - loss: 0.3510 - val_auc: 0.9796 - val_loss: 0.2874 - learning_rate: 0.1000
Epoch 3/50
[1m3413/3413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - auc: 0.9894 - loss: 0.3569 - val_auc: 0.8673 - val_loss: 2.1984 - learning_rate: 0.1000
Epoch 4/50
[1m3413/3413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - auc: 0.9880 - loss: 0.3417 - val_auc: 0.9817 - val_loss: 0.3026 - learning_rate: 0.1000
Epoch 5/50
[1m3396/3413[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - auc: 0.9867 - loss: 0.3841
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.05000000074505806.
[1m3413/3413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - auc: 0.9867 -

In [21]:
#Ensemble training 

print("Ensemble")

#Get validation predictions from each model
rf_val = rf_model.predict_proba(X_val_scaled)[:, 1]
nn_val_probs = model_nn.predict(X_val_scaled).flatten()
xgb_val = xgb_model.predict_proba(X_val_scaled)[:, 1]

X_stack_val = np.vstack([rf_val, nn_val_probs, xgb_val]).T

#Logistic regression as meta model
stacker = LogisticRegression()  
stacker.fit(X_stack_val, y_val)

stack_val_probs = stacker.predict_proba(X_stack_val)[:, 1]

print(f"Log Loss: {log_loss(y_val, stack_val_probs):.4f}")
print(f"Accuracy: {accuracy_score(y_val, stack_val_probs > 0.5):.4f}")
print(f"AUC: {roc_auc_score(y_val, stack_val_probs):.4f}")


Ensemble


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.7s


[1m  123/13650[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m11s[0m 829us/step

[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    1.6s finished


[1m13650/13650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 812us/step
Log Loss: 0.1505
Accuracy: 0.9481
AUC: 0.9848


In [22]:
#Ensemble test predictions 

print("ensemble test")
#Load test features and drop the paper ids 
test = pd.read_csv("test.txt", sep=",", names=["p1", "p2"], engine="python")    
test_features = pd.read_pickle("test_features.pkl")
test_features = test_features.drop(columns=["p1", "p2"])

#Scale test features
test_scaled = scaler.transform(test_features)

#Get predictions from each model
rf_test = rf_model.predict_proba(test_scaled)[:, 1]
nn_test = model_nn.predict(test_scaled).flatten()
xgb_test = xgb_model.predict_proba(test_scaled)[:, 1]

X_stack_test = np.vstack([rf_test, nn_test, xgb_test]).T

final_probs = stacker.predict_proba(X_stack_test)[:, 1]

test["ID"] = test.index
test["Label"] = final_probs
test[["ID", "Label"]].to_csv("submission.csv", index=False)

print("submission.csv")

ensemble test


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.1s


[1m 120/3335[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 851us/step 

[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.3s finished


[1m3335/3335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 814us/step
submission.csv
