## Import libraries

In [80]:
import csv
import nltk
import numpy as np
import networkx as nx #We used networkx since it is the library we used during the practice session
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Windows\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Windows\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating the graphs

In [81]:
with open("./data/node_information.csv", "r") as f:
    file = csv.reader(f)
    node_info = list(file)

IDs = [int(element[0]) for element in node_info]

with open("./data/training_set.txt", "r") as f:
    file =csv.reader(f, delimiter='\t')
    set_file=list(file)
set= np.array([values[0].split(" ") for values in set_file]).astype(int)


#Define the directed graph and add the IDs
directed_graph=nx.DiGraph()
directed_graph.add_nodes_from(IDs)
for ID_source_train,ID_sink_train,link_train in set:
    if link_train==1:#the link are added if this statement is true
        directed_graph.add_edge(ID_source_train,ID_sink_train)

graph = nx.Graph(directed_graph)

## Set the features

In [82]:
Year = [int(element[1]) for element in node_info]
Title = [element[2] for element in node_info]
Authors=[element[3] for element in node_info]
Journal_name=[element[4] for element in node_info]
Corpus=[element[5] for element in node_info]

page_rank = nx.pagerank_scipy(graph)
hub_score, authority_score = nx.hits(graph)

#TF-IDF cosine similarity
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(Corpus)

#One hot encoding on node information for the cooccurence computation
one_hot_encoder=CountVectorizer()
Authors_encoded=one_hot_encoder.fit_transform(Authors)
Titles_encoded=one_hot_encoder.fit_transform(Title)

one_hot_corpus_encoding=CountVectorizer(stop_words="english")
corpus_encoded = one_hot_corpus_encoding.fit_transform(Corpus)

## Generate the features

In [83]:
def get_features(article_1,article_2):#To generate the features between 2 papers
    article_1_id,article_2_id=IDs.index(article_1),IDs.index(article_2)

    #tfidf cosine similarity
    tf_1 ,tf_2= tfidf_matrix[1], tfidf_matrix[2]
    tfidf_sim = cosine_similarity(tf_1, tf_2)[0][0]

    ## Features from contextual information of the nodes
    co_occurence_corpus=(corpus_encoded[article_1_id]@corpus_encoded[article_2_id].T).toarray()[0][0]
    same_authors=(Authors_encoded[article_1_id]@Authors_encoded[article_2_id].T).toarray()[0][0]
    co_occurence_title=(Titles_encoded[article_1_id]@Titles_encoded[article_2_id].T).toarray()[0][0]

    same_journal = int(Journal_name[article_1_id] == Journal_name[article_2_id])
    years_diff=Year[article_1_id]-Year[article_2_id]

    #Feature from the directed graph

    triad_features = [0.0]*8
    for w in sorted(nx.common_neighbors(graph, article_1, article_2)):
        if graph.has_edge(article_1, w) and graph.has_edge(w, article_2):
            triad_features[0]+=1
        if graph.has_edge(article_1, w) and graph.has_edge(article_2, w):
            triad_features[1]+=1
        if graph.has_edge(w, article_1) and graph.has_edge(w, article_2):
            triad_features[2] += 1
        if graph.has_edge(w, article_1) and graph.has_edge(article_2, w):
            triad_features[3] += 1
    for i in range(4, 8):
        if triad_features[i-4]!=0:
            triad_features[i] = triad_features[i-4]/common_neig

    #Feature from the graph
    adamic_adar=nx.adamic_adar_index(graph, [(article_1, article_2)])
    for _,_,p in adamic_adar:
        adamic_adar_coef= p
    jaccard = nx.jaccard_coefficient(graph, [(article_1, article_2)])
    for _,_,p in jaccard:
        jaccard_coef= p
    pref_attachement = nx.preferential_attachment(graph, [(article_1, article_2)])
    for _,_,p in pref_attachement:
        pref_attachement_coef= p
    common_neig=len(sorted(nx.common_neighbors(graph, article_1, article_2)))


    node_info_features = [co_occurence_corpus, same_authors, co_occurence_title, years_diff, same_journal, tfidf_sim]
    degree_features = [directed_graph.in_degree(article_1), directed_graph.out_degree(article_1), directed_graph.in_degree(article_2), directed_graph.out_degree(article_2)]
    heuristic_graph_features = [jaccard_coef, adamic_adar_coef, pref_attachement_coef, common_neig,page_rank[article_2],hub_score[article_1],authority_score[article_2]]

    #We are returning 25 features
    return node_info_features + heuristic_graph_features + degree_features + triad_features

## Build the train and test set

In [84]:
try:
    X_train= np.load("./saved_data/X_train.npy")
    y_train=np.load("./saved_data/y_train.npy")
except:
    y_train=[]
    X_train= []
    step=0
    for source,sink,link in set:
        step+=1
        if step%1000==0:
            print("Finished ",step,"steps over ",len(set),end='\r')
        X_train.append(get_features(source,sink))
        y_train.append(link)
    X_train=np.array(X_train)
    X_train = preprocessing.scale(X_train)
    y_train=np.array(y_train)
    np.save("./saved_data/X_train.npy", X_train)
    np.save("./saved_data/y_train.npy", y_train)


In [85]:
with open("./data/testing_set.txt", "r") as f:
    file =csv.reader(f, delimiter='\t')
    set_file=list(file)
set_test= np.array([values[0].split(" ") for values in set_file]).astype(int)
try:
    X_test=np.load("./saved_data/X_test.npy")
except:
    X_test=[]
    y_test=[]
    print("Features construction for Testing...")
    step=0
    for source,sink in set_test:
        step+=1
        if step%1000==0:
            print("Finished ",step,"steps over ",len(set_test),end='\r')
        X_test.append(get_features(source,sink))
    X_test=np.array(X_test)
    X_test = preprocessing.scale(X_test)
    np.save("./saved_data/X_test.npy", X_test)


### Predictions

In [86]:
def run_prediction(model,model_name):
    model.fit(X_train, y_train)
    y_pred = list(model.predict(X_test))
    pred= zip(range(len(set_test)), y_pred)
    filename="./submission/submit_"+model_name+".csv"
    with open(filename,"w",newline="") as prediction:
        fieldnames = ['id', 'category']
        csv_out = csv.writer(prediction)
        csv_out.writerow(fieldnames)
        for row in pred:
            csv_out.writerow(row)

SVM Classifier

In [87]:
from sklearn import svm
#run_prediction(svm.LinearSVC(),"LinearSVC") #Score: 0.96634 on kaggle

Random Forest Classifier

In [88]:
from sklearn.ensemble import RandomForestClassifier
#run_prediction(RandomForestClassifier(n_estimators=150),"Random_Forest") #Score: 0.97114 on kaggle

Gradient Boosting Classifier

In [89]:
from sklearn.ensemble import GradientBoostingClassifier
#run_prediction(GradientBoostingClassifier(n_estimators=150, learning_rate=1e-2, random_state=0),"GBoost")#Score: 0.96988 on kaggle

Light GBM Classifier

In [90]:
import lightgbm as lgb
#run_prediction(lgb.LGBMClassifier(n_estimators=150),"LGBM") #Score: 0.96969 on kaggle
#run_prediction(lgb.LGBMClassifier(n_estimators=150,learning_rate=1e-5,max_depth=9,min_child_weight=2,reg_alpha=3,min_child_samples=20)) 
#Score: 0.70594 on kaggle

XGBoost Classifier

In [91]:
from xgboost import XGBClassifier
#run_prediction(XGBClassifier(n_estimators=150),"XGB")#Score: 0.96006

## Fine tuning

We decided to keep working on Light GBM regarding how well it performed on the kaggle submition also teh boosting type can be specified first we will try the classical gradient boosting decision tree and after the dropouts meet Multiple Additive Regression (DART)

In [92]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import lightgbm as lgb

In [93]:
def fine_tune(model,trainx,trainy,valx,valy):
    model.fit(trainx, trainy)
    y_pred = list(model.predict(valx))
    return accuracy_score(valy,y_pred)

In [94]:
(trainx, valx, trainy, valy) = train_test_split(X_train, y_train, test_size=0.3, random_state=26)

In [95]:
num_leaves=[30,40,50,60]
min_child_samples=[20,30,40,50,60]
reg_alpha=[i for i in range(5)]
fine_tuning_lgbm=[]
for nl in num_leaves:
    for mcs in min_child_samples:
        for ra in reg_alpha:
            acc=fine_tune(lgb.LGBMClassifier(n_estimators=100,num_leaves=nl,min_child_samples=mcs,reg_alpha=ra),trainx,trainy,valx,valy)
            fine_tuning_lgbm.append([nl,mcs,ra,acc])

df=pd.DataFrame(fine_tuning_lgbm,columns=["num_leaves","min_child_samples","reg_alpha","accuracy"])

In [96]:
df.loc[df['accuracy'].idxmax()]

num_leaves           60.000000
min_child_samples    50.000000
reg_alpha             3.000000
accuracy              0.975137
Name: 93, dtype: float64

In [100]:
run_prediction(lgb.LGBMClassifier(n_estimators=100,num_leaves=600,min_child_samples=50,reg_alpha=3),"LGBM")
#0.96970 on kaggle

## DART LGBM

In [98]:
run_prediction(lgb.LGBMClassifier(boosting_type='dart',n_estimators=150),"LGBM")#Score 0.97382 on Kaggle

here is the best result we had on kaggle now let's try to fine tune it we saw previously that having a small learning is not useful so we will tune on the other parameters

In [101]:
num_leaves=[55,80,100]
min_child_samples=[50,65,80]
reg_alpha=[i for i in range(5)]
fine_tuning_lgbm_dart=[]
for nl in num_leaves:
    for mcs in min_child_samples:
        for ra in reg_alpha:
            acc=fine_tune(lgb.LGBMClassifier(boosting_type='dart',n_estimators=100,num_leaves=nl,min_child_samples=mcs,reg_alpha=ra),trainx,trainy,valx,valy)
            fine_tuning_lgbm_dart.append([nl,mcs,ra,acc])

df_dart=pd.DataFrame(fine_tuning_lgbm_dart,columns=["num_leaves","min_child_samples","reg_alpha","accuracy"])

In [102]:
df_dart.loc[df_dart['accuracy'].idxmax()]

num_leaves           100.000000
min_child_samples     80.000000
reg_alpha              2.000000
accuracy               0.974146
Name: 42, dtype: float64

In [103]:
fine_tune(lgb.LGBMClassifier(boosting_type='dart',n_estimators=100),trainx,trainy,valx,valy) #acc former dart model

0.9731822760405948

We have a gain of 0.1% in accuracy compared to the former dart model but in submitting on Kaggle the best result were with the default dart model with a number of estimators of 100

In [104]:
run_prediction(lgb.LGBMClassifier(boosting_type='dart',n_estimators=100),"best_LGBM_DART")#Score 0.97386 on Kaggle

In [105]:
run_prediction(lgb.LGBMClassifier(boosting_type='dart',n_estimators=100,num_leaves=100,min_child_samples=80,reg_alpha=2),"LGBM_DART_Tuned")
#Score 0.97257