### INF554 Machine and Deep Learning
# Data Challenge: H-index Prediction

Alexandre Hirsch, Antonin Wattel

In [1]:
import os
import pandas as pd
import re
import numpy as np
import networkx as nx
import json
import gensim.downloader as api
from gensim.parsing.preprocessing import STOPWORDS
import gc
from googletrans import Translator

# Loading Data

In [2]:
#5.9s

# Training table: read with an integer `author` column and a float32 `hindex` column.
df_train = pd.read_csv(
    'train.csv',
    dtype={'author': np.int64, 'hindex': np.float32},
)

# Test table: only the `author` dtype is forced.
df_test = pd.read_csv('test.csv', dtype={'author': np.int64})

# Row counts, reused below to size the feature matrices.
n_train, n_test = len(df_train), len(df_test)

# Text features

In [3]:
#this may be useful for preprocessing
#https://kavita-ganesan.com/text-preprocessing-tutorial/#.XHa4-ZNKhuU

In [48]:
# Pre-trained word-embedding models tried for the abstract features.
# Exactly one `wv = ...` line is active; the others are kept as alternatives.

wv = api.load('word2vec-google-news-300') #best (authors' note)
#wv = api.load('glove-wiki-gigaword-300')
#wv = api.load('conceptnet-numberbatch-17-06-300')
#wv = Word2Vec.load("w2v.model").wv  # NOTE(review): `Word2Vec` is not imported in this notebook

print("Word2Vec model loaded")

Word2Vec model loaded


In [None]:
#version with translator (not used)

# translator = Translator()
# def store_abstracts_translated():
#     paper_IDs = dict()
#     with open('data/abstracts.txt') as f:
#         for l in f:
#             paper_ID, abstract = l.split("----",1)
#             d = json.loads(abstract)
#             InvertedIndex, IndexLength = d["InvertedIndex"], d["IndexLength"]
#             words = InvertedIndex.keys()

#             detected = False
#             while not detected:
#                 try:
#                     language_code = translator.detect(' '.join(list(words)[:10])).lang
#                     detected = True
#                 except:
#                     print(f"can't detect for paper {paper_ID}, waiting two seconds")
#                     time.sleep(2)
#                     continue

#             if language_code != 'en':
#                 ab = [None]*IndexLength
#                 for word, v in InvertedIndex.items():
#                     for i in v:
#                         ab[i] = word
#                 translated = False
#                 while not translated:
#                     try:
#                         words = set(translator.translate(' '.join(list(filter(None, ab))), src = language_code, dest = 'en').text.split())
#                         translated = True
#                     except:
#                         print(f"can't translate for paper {paper_ID}, waiting two seconds")
#                         time.sleep(2)
#                         continue
#             paper_IDs[int(paper_ID)] = list(words - STOPWORDS)
#     # with open('data/stopped_translated_abstracts.json', 'w') as f:
#     #     json.dump(paper_IDs, f)
#     return paper_IDs

# paper_IDs = store_abstracts_translated()
# print('abstracts stored')

In [50]:
#without translation
def store_abstracts():
    """Read ``abstracts.txt`` and return the vocabulary of every abstract.

    Every line is split once on ``----`` into a paper ID and a JSON payload;
    only the keys of the payload's ``"InvertedIndex"`` object are kept, minus
    the gensim stop words.

    Returns:
        dict: ``int(paper_ID)`` -> set of the abstract's non-stop-word tokens.
    """
    abstracts = {}
    with open('abstracts.txt', encoding='utf-8') as handle:
        for line in handle:
            raw_id, payload = line.split("----", 1)
            vocabulary = json.loads(payload)["InvertedIndex"]
            # dict keys view minus a set gives a plain set
            abstracts[int(raw_id)] = vocabulary.keys() - STOPWORDS
    return abstracts

paper_IDs = store_abstracts()
print('abstracts stored')

abstracts stored


In [51]:
#12 s
def store_authors():
    """Load the author -> papers mapping from ``author_papers.txt``.

    Every non-empty line is parsed as ``<author_id>:<paper_id>-<paper_id>-...``.

    Returns:
        dict: ``int(author_id)`` -> list of ``int`` paper IDs, in file order.

    Raises:
        ValueError: if a non-empty line has no ``:`` or contains a
            non-integer ID.
    """
    author_IDs = dict()
    # Explicit encoding (as store_abstracts() does) so the parse does not
    # depend on the platform's default codec.
    with open('author_papers.txt', encoding='utf-8') as f:
        for l in f:
            l = l.strip()
            if not l:
                continue  # tolerate blank / trailing empty lines
            # Split once: everything after the first ':' is the paper list.
            author_ID, papers = l.split(':', 1)
            author_IDs[int(author_ID)] = [int(p) for p in papers.split('-')]
    return author_IDs
    
author_IDs = store_authors()
print('authors stored')

authors stored


In [52]:
n = wv.vector_size

# Characters / patterns removed from a token before the embedding lookup.
# NOTE: the trailing `\'s` alternative can never match because the apostrophe
# is already consumed by the character class tried first; the pattern is kept
# verbatim so previously saved features stay reproducible.
_TOKEN_STRIP_RE = re.compile(r'[.…,:?!;\'‘’"“”()*–]|[0-9]+-|[0-9]|\'s')
# Hyphens and slashes separate words ("state-of-the-art" -> 4 words).
_TOKEN_SPLIT_RE = re.compile(r'[-/]')


def get_paper_value(paper_ID):
    """Sum the word vectors of the distinct words of one abstract.

    Args:
        paper_ID: key into the global ``paper_IDs`` mapping.

    Returns:
        np.ndarray of length ``wv.vector_size``: the sum of ``wv[w]`` over
        every distinct cleaned, non-stop word that has a vector. All zeros
        when the paper has no stored abstract or no known word.
    """
    vec = np.zeros(wv.vector_size)
    words_used = set()
    # A paper without an abstract contributes a zero vector (previously
    # handled by a bare `except: pass` that also hid every other error).
    for token in paper_IDs.get(paper_ID, ()):
        words = _TOKEN_SPLIT_RE.sub(' ', _TOKEN_STRIP_RE.sub('', token))
        for w in words.split():
            if w in STOPWORDS or w in words_used:
                continue
            words_used.add(w)
            try:
                vec += wv[w]
            except KeyError:
                # out-of-vocabulary word: skip it
                continue
    return vec
    
def get_author_value(author_ID):
    """Average the paper vectors of every paper listed for ``author_ID``.

    Papers are looked up in the global ``author_IDs`` mapping; each paper
    vector comes from ``get_paper_value``. The divisor is the number of listed
    papers, whether or not they have an abstract.
    """
    papers = author_IDs[author_ID]
    # built-in sum adds left to right, starting from the zero vector
    total = sum(
        (get_paper_value(paper_ID) for paper_ID in papers),
        np.zeros(wv.vector_size),
    )
    return total / len(papers)
 

In [53]:
#6m

# Build one averaged word-vector row per author (authors' note: ~6 min).
#
# The `author` column is iterated directly instead of via `DataFrame.iterrows()`:
# iterrows() packs each row into a single-dtype Series, so with the float32
# `hindex` column next to it the int64 author id comes back as a float, and
# its first element is the index *label*, not a row position.

X_train = np.zeros((n_train, wv.vector_size))
for i, author in enumerate(df_train['author']):
    X_train[i, :] = get_author_value(author)

print('training data loaded')

X_test = np.zeros((n_test, wv.vector_size))
for i, author in enumerate(df_test['author']):
    X_test[i, :] = get_author_value(author)
print('testing data loaded')

training data loaded
testing data loaded


In [54]:
# Persist the abstract feature matrices. The file suffix names the word-vector
# model loaded into `wv`; switch to the matching pair below when `wv` changes.

np.save('X_train_abstract_google.npy', X_train)
np.save('X_test_abstract_google.npy', X_test)
# np.save('X_train_abstract_gigaword.npy', X_train)
# np.save('X_test_abstract_gigaword.npy', X_test)
# np.save('X_train_abstract_numberbatch.npy', X_train)
# np.save('X_test_abstract_numberbatch.npy', X_test)
# np.save('X_train_abstract_custom.npy', X_train)
# np.save('X_test_abstract_custom.npy', X_test)

# Drop both matrices and collect garbage before the graph section.
del X_train, X_test
gc.collect()

0

# Graph features

## Loading Graph

In [4]:
#5s

# Load the co-authorship graph from a space-delimited edge list; node labels
# are parsed as ints.
G = nx.read_edgelist('coauthorship.edgelist', delimiter=' ', nodetype=int)
# Keep the sizes: n_nodes is reused when building the node -> index mapping.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges() 
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)

Number of nodes: 217801
Number of edges: 1718164


In [5]:
# Relabel the nodes 0..n_nodes-1 (in G's iteration order) so the graph can be
# passed to the karateclub models.
#
# NOTE: this cell rebinds G. Running it a second time would rebuild
# `mapping_to_indices` from the already relabelled graph, i.e. as an identity
# mapping; reload the graph first before re-running.

mapping_to_indices = {node: index for index, node in enumerate(G.nodes())}
inverse_mapping = {index: node for node, index in mapping_to_indices.items()}
G = nx.relabel_nodes(G, mapping_to_indices)
node_indices = list(G.nodes())
print("node = ", node_indices[0:10])

node =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


## Features

### Centrality based features

In [None]:
# fast
# Per-node centrality measures of G; both results are indexed by node label
# in the feature cell below.
core_number = nx.core_number(G)
print('core_number')
page_rank = nx.pagerank(G) # alpha left at the library default (authors were unsure about it)
print('pagerank')
#nb_triangles = nx.triangles(G)
#print('triangles')
#deg_centrality = nx.degree_centrality(G) 
#print('deg_centrality')
# eig_centrality = nx.eigenvector_centrality(G)
# print('eigenvector_centrality')

# slow (authors' note: > 20 min each), disabled
# close_centrality = nx.closeness_centrality(G)
# print('closeness_centrality')
#bet_centrality = nx.betweenness_centrality(G, normalized = True, endpoints = False)
# print('bet_centrality')
# katz_centrality = nx.katz_centrality(G)
# print('katz_centrality')
# current_flow_closeness_centrality = nx.current_flow_closeness_centrality(G)
# print('current_flow')
# current_flow_betweenness_centrality = nx.current_flow_betweenness_centrality(G)
# print('current_flow_betweenness_centrality')
# load_centrality = nx.load_centrality(G)
# print('load_centrality')
# harmonic_centrality = nx.harmonic_centrality(G)
# print('harmonic')



In [None]:
def _centrality_features(authors):
    """Return an ``(len(authors), 3)`` matrix: degree, core number, PageRank.

    G was relabelled to consecutive indices, and ``core_number`` / ``page_rank``
    were computed on the relabelled graph, so every author ID is first
    translated with ``mapping_to_indices``. Looking the raw author ID up
    directly would either raise KeyError or silently read another node's value.

    To add a feature, compute its per-node dict in the centrality cell, widen
    the matrix and fill the extra column here.
    """
    features = np.zeros((len(authors), 3))
    for i, author in enumerate(authors):
        node = mapping_to_indices[author]
        features[i, 0] = G.degree(node)
        features[i, 1] = core_number[node]
        features[i, 2] = page_rank[node]
    return features


X_train = _centrality_features(df_train['author'])
print('loaded training features')

#-----------------------------

X_test = _centrality_features(df_test['author'])
print('loaded testing features')

In [None]:
# Persist the centrality feature matrices, then free them.
np.save('X_train_graph.npy', X_train)
np.save('X_test_graph.npy', X_test)

del X_train, X_test
gc.collect()

### Proximity preserving node embeddings
https://github.com/benedekrozemberczki/karateclub/tree/master/karateclub/node_embedding/neighbourhood#

In [None]:
# Node2Vec embedding of G with the model's default hyperparameters; the
# embedding matrix is saved to disk and then released.
# Runtime noted by the authors: a few hours (~380 min).

from karateclub.node_embedding.neighbourhood import Node2Vec 
model = Node2Vec()
model.fit(G)
embedding = model.get_embedding()
np.save('Node2Vec_embedding.npy', embedding )

del embedding
gc.collect()

In [None]:
# BoostNE embedding of G with default hyperparameters, saved then released.
# Runtime noted by the authors: about 20 min.
from karateclub.node_embedding.neighbourhood import BoostNE

model = BoostNE()
model.fit(G)
embedding = model.get_embedding()
np.save('BoostNE_embedding.npy', embedding )

del embedding
gc.collect()

In [None]:
# NetMF embedding of G with default hyperparameters, saved then released.
# Runtime noted by the authors: ~30 s.
from karateclub.node_embedding.neighbourhood import NetMF

model = NetMF()
model.fit(G)
embedding = model.get_embedding()
np.save('NetMF_embedding.npy', embedding  )

del embedding
gc.collect()

In [None]:
# DeepWalk embedding of G with default hyperparameters, saved then released.
# Runtime noted by the authors: ~16 min.
from karateclub.node_embedding.neighbourhood import DeepWalk

model = DeepWalk()
model.fit(G)
embedding = model.get_embedding()
np.save('DeepWalk_embedding.npy', embedding  )

del embedding
gc.collect()

In [None]:
# RandNE embedding of G with default hyperparameters, saved then released.
# Runtime noted by the authors: ~12 s.

from karateclub.node_embedding.neighbourhood import RandNE

model = RandNE()
model.fit(G)
embedding = model.get_embedding()
np.save('RandNE_embedding.npy', embedding  )

del embedding
gc.collect()

In [None]:
# GraRep embedding of G with default hyperparameters, saved then released.
# Runtime noted by the authors: more than 17 min.
from karateclub.node_embedding.neighbourhood import GraRep

model = GraRep()
model.fit(G)
embedding = model.get_embedding()
np.save('GraRep.npy', embedding  )

del embedding
gc.collect()

In [None]:
# Diff2Vec embedding of G with default hyperparameters, saved then released.
# Runtime noted by the authors: a few hours (~136 min).
from karateclub.node_embedding.neighbourhood import Diff2Vec

model = Diff2Vec()
model.fit(G)
embedding = model.get_embedding()
np.save('Diff2Vec.npy', embedding  )

del embedding
gc.collect()

In [None]:
# Walklets embedding of G with default hyperparameters, saved then released.
# Runtime noted by the authors: ~23 min.
from karateclub.node_embedding.neighbourhood import Walklets

model = Walklets()
model.fit(G)
embedding = model.get_embedding()
np.save('Walklets.npy', embedding  )

del embedding
gc.collect()

In [None]:
# Second Walklets run with more and longer walks and explicit dimensions.
# Relies on `Walklets` having been imported by an earlier cell.
# Runtime noted by the authors: ~81 min.
model = Walklets(walk_number = 15, walk_length = 100, dimensions = 64, workers = 16)
model.fit(G)
embedding = model.get_embedding()
np.save('Walklets_2.npy', embedding  )

del embedding
gc.collect()

In [None]:
# Third, largest Walklets run (20 walks of length 120, dimensions=100).
# Relies on `Walklets` having been imported by an earlier cell.
# Runtime noted by the authors: ~241 min.
model = Walklets(walk_number = 20, walk_length = 120, dimensions = 100, workers = 16)
model.fit(G)
embedding = model.get_embedding()
np.save('Walklets_3.npy', embedding  )

del embedding
gc.collect()

In [None]:
# NMFADMM embedding of G with default hyperparameters, saved then released.
from karateclub.node_embedding.neighbourhood import NMFADMM

model =  NMFADMM()
model.fit(G)
embedding = model.get_embedding()
np.save('NMFADMM.npy', embedding  )

del embedding
gc.collect()

In [None]:
# LaplacianEigenmaps embedding of G with default hyperparameters, saved then released.
from karateclub.node_embedding.neighbourhood import LaplacianEigenmaps

model = LaplacianEigenmaps()
model.fit(G)
embedding = model.get_embedding()
np.save('LaplacianEigenmaps.npy', embedding  )

del embedding
gc.collect()

In [None]:
# HOPE embedding of G with default hyperparameters, saved then released.
from karateclub.node_embedding.neighbourhood import HOPE

model = HOPE()
model.fit(G)
embedding = model.get_embedding()
np.save('HOPE.npy', embedding  )

del embedding
gc.collect()

In [None]:
# NodeSketch embedding of G with default hyperparameters, saved then released.
from karateclub.node_embedding.neighbourhood import NodeSketch

model = NodeSketch()
model.fit(G)
embedding = model.get_embedding()
np.save('NodeSketch.npy', embedding  )

del embedding
gc.collect()

In [None]:
# GLEE embedding of G with default hyperparameters, saved then released.
from karateclub.node_embedding.neighbourhood import GLEE

model = GLEE()
model.fit(G)
embedding = model.get_embedding()
np.save('GLEE.npy', embedding  )

del embedding
gc.collect()

### Structural Node Level embeddings

https://github.com/benedekrozemberczki/karateclub/tree/master/karateclub/node_embedding/structural


In [None]:
# Role2Vec (structural) embedding of G with default hyperparameters, saved
# then released. networkx is already imported at the top of the notebook.
# import networkx as nx
from karateclub.node_embedding.structural import Role2Vec

model = Role2Vec()
model.fit(G)
embedding = model.get_embedding()
np.save('Role2Vec_embedding.npy', embedding )

del embedding
gc.collect()

### Meta embeddings

In [6]:
# NEU meta-embedding: NEU is fitted on G together with a default Walklets
# model; the embedding returned by the meta model is saved and then released.
from karateclub.node_embedding.meta import NEU
from karateclub.node_embedding.neighbourhood import Walklets

model = Walklets()
meta_model = NEU()
meta_model.fit(G, model)
embedding = meta_model.get_embedding()
np.save('NEU_Walklets.npy', embedding )

del embedding
gc.collect()

0

### Stacking node embeddings + dimensionality reduction

In [None]:
from sklearn.preprocessing import MinMaxScaler

def normalize(data):
    """Return ``data`` rescaled column-wise by a MinMaxScaler fitted on ``data`` itself."""
    return MinMaxScaler().fit_transform(data)

In [None]:
def _minmax_from_train(train, test):
    """Scale columns to [0, 1] using the training min/max for both splits."""
    low = train.min(axis=0)
    span = train.max(axis=0) - low
    # Constant training column: divide by 1 so it maps to 0 instead of NaN.
    span[span == 0] = 1.0
    return (train - low) / span, (test - low) / span


def stack_features(features_list):
    """Load, min-max scale and column-stack several saved feature sets.

    For every ``name`` the files ``X_train_<name>.npy`` and
    ``X_test_<name>.npy`` are read from the working directory.

    The scaling statistics come from the training matrix only and are applied
    unchanged to the test matrix. Scaling the test split with its own min/max
    would put the same raw value at different positions in the two splits.
    Test values can therefore fall outside [0, 1].

    Args:
        features_list: iterable of feature-set names.

    Returns:
        (X_train, X_test): the scaled blocks concatenated along axis 1, in the
        order of ``features_list``; two empty arrays if the list is empty.
    """
    train_blocks, test_blocks = [], []
    for name in features_list:
        a = np.load('X_train_' + name + '.npy').astype(float)
        b = np.load('X_test_' + name + '.npy').astype(float)
        a, b = _minmax_from_train(a, b)
        train_blocks.append(a)
        test_blocks.append(b)

    if not train_blocks:
        return np.array([]), np.array([])

    # One concatenation at the end instead of re-copying the stack per feature set.
    return np.concatenate(train_blocks, axis=1), np.concatenate(test_blocks, axis=1)

# On a fresh top-to-bottom run X_train does not exist at this point (it was
# deleted after the graph features were saved and stack_features() has not
# been called yet), so only report the shape when the variable is present.
if 'X_train' in globals():
    print(X_train.shape)

oyea
(174241, 1256)


In [None]:
# Feature sets to stack; each name must have matching 'X_train_<name>.npy' and
# 'X_test_<name>.npy' files in the working directory.
embeddings = ['Node2Vec', 'BoostNE', 'NetMF', 'RandNE', 
        'Deepwalk','Diff2Vec', 'Role2Vec','Walklets', 
        'NMFADMM', 'LaplacianEigenmaps', 'HOPE']

X_train, X_test = stack_features(embeddings)

In [None]:
# Reduce the stacked embeddings to 150 principal components.
# The projection is fitted on the training matrix only and then applied to
# both splits. Runtime noted by the authors: ~30 s.

from sklearn.decomposition import PCA

principal=PCA(n_components=150)
principal.fit(X_train)

# (A redundant, unused extra transform of X_train was dropped here.)
a=principal.transform(X_train)
b=principal.transform(X_test)

np.save('X_train_pca.npy', a)
np.save('X_test_pca.npy', b)


#use another form of dimensionality reduction

## loading features from saved graph embeddings

In [3]:
#6 s

#make sure this is right
def _find_embedding_file(embedding_string):
    """Return the path of the saved embedding for ``embedding_string``, or None.

    The embedding cells save either ``<name>.npy`` or ``<name>_embedding.npy``,
    and the names used in the embedding lists do not always match the saved
    file's capitalisation (``Deepwalk`` vs ``DeepWalk_embedding.npy``). Exact
    names are tried first, then a case-insensitive match in the working
    directory.
    """
    candidates = [embedding_string + '.npy', embedding_string + '_embedding.npy']
    for path in candidates:
        if os.path.exists(path):
            return path
    wanted = {candidate.lower() for candidate in candidates}
    for filename in sorted(os.listdir('.')):
        if filename.lower() in wanted:
            return filename
    return None


def load_and_save_from_embedding(embedding_string):
    """Turn a saved node-embedding matrix into train/test feature files.

    Row ``mapping_to_indices[author]`` of the embedding becomes the feature row
    of ``author``; the results are written to
    ``X_train_<embedding_string>.npy`` and ``X_test_<embedding_string>.npy``
    in the row order of ``df_train`` / ``df_test``.

    Args:
        embedding_string: name of the embedding (see ``_find_embedding_file``
            for how the file is located).

    Returns:
        0 if no embedding file was found (a message is printed), else None.

    Raises:
        KeyError: if an author of either table is missing from
            ``mapping_to_indices``.
    """
    path = _find_embedding_file(embedding_string)
    if path is None:
        print(embedding_string + '.npy' + ' does not exist')
        return 0

    embedding = np.load(path)

    for split, frame in (('train', df_train), ('test', df_test)):
        # Translate author IDs to embedding rows once, then gather all rows
        # with a single fancy-indexing operation.
        rows = np.fromiter(
            (mapping_to_indices[author] for author in frame['author']),
            dtype=np.intp,
            count=len(frame),
        )
        features = embedding[rows].astype(float)
        np.save('X_' + split + '_' + embedding_string + '.npy', features)
        del features

    gc.collect()

    print(embedding_string+' done')


In [None]:
# Convert every listed embedding into per-author train/test feature files.
# Names without a saved embedding file are reported and skipped by
# load_and_save_from_embedding().
embedding_list = ['Node2Vec', 'GraRep', 'BoostNE', 'NetMF', 'RandNE', 
                 'Deepwalk','Diff2Vec', 'Role2Vec', 'Walklets', 'NMFADMM',
                 'LaplacianEigenmaps', 'HOPE', 'SocioDim', 'NodeSketch',
                 'Walklets_2', 'Walklets_3','NEU_Walklets']

for embedding in embedding_list:
    load_and_save_from_embedding(embedding)


# Merging features

Now that we have the features, we can assemble them and fit a regressor to obtain our prediction model.<br/>
Here, we load the pre-computed features from disk.

In [4]:
# Target vector as float64, in the row order of df_train. Reading the column
# directly replaces a Python-level iterrows() loop that used the index label
# as a row position.
y_train = df_train['hindex'].to_numpy(dtype=np.float64)

In [12]:
# Pre-computed feature blocks: a* = training rows, b* = test rows.
#   1: abstract word-vector features, 2: graph centralities, 3: Walklets_2 embedding.
# The abstract features are written as 'X_*_abstract_google.npy' by the text
# section; no cell writes 'X_*_abstract.npy', so the saved name is used here.
a1 = np.load('X_train_abstract_google.npy').astype(float)
a2 = np.load('X_train_graph.npy').astype(float)
a3 = np.load('X_train_Walklets_2.npy').astype(float)

b1 = np.load('X_test_abstract_google.npy').astype(float)
b2 = np.load('X_test_graph.npy').astype(float)
b3 = np.load('X_test_Walklets_2.npy').astype(float)

In [13]:
# Column-stack the three feature blocks of each split (train first, test second).

X_train, X_test = (
    np.concatenate(blocks, axis=1)
    for blocks in ((a1, a2, a3), (b1, b2, b3))
)

print(X_train.shape, X_test.shape, sep='\n')



(174241, 559)
(43560, 559)


In [5]:
#split training set into training_1 and validation

def shuffle_split(X_train, y_train, seed=None):
    """Shuffle the samples and hold out the first fifth for validation.

    The split size is derived from ``X_train`` itself (it used to come from
    the global ``df_train``, which silently mis-sized the split for any matrix
    with a different number of rows).

    Args:
        X_train: array of shape (n_samples, n_features).
        y_train: array of n_samples targets, aligned with ``X_train``.
        seed: optional seed for a reproducible shuffle. When None (default)
            the global NumPy random state is used, as before.

    Returns:
        (X_train_1, y_train_1, X_validation, y_validation), where the
        validation part holds ``n_samples // 5`` samples.

    Raises:
        ValueError: if ``X_train`` and ``y_train`` differ in length.
    """
    n_samples = X_train.shape[0]
    if len(y_train) != n_samples:
        raise ValueError('X_train and y_train must have the same number of samples')

    #shuffle
    s = np.arange(n_samples)
    if seed is None:
        np.random.shuffle(s)
    else:
        np.random.default_rng(seed).shuffle(s)
    X_train = X_train[s]
    y_train = y_train[s]

    #split
    n_validation = n_samples // 5
    X_train_1 = X_train[n_validation:]
    y_train_1 = y_train[n_validation:]
    X_validation = X_train[:n_validation]
    y_validation = y_train[:n_validation]

    return X_train_1, y_train_1, X_validation, y_validation



In [8]:
# Random 80/20 split of the stacked features; the *_1 arrays are the training
# part and X_validation / y_validation the held-out fifth.
X_train_1, y_train_1, X_validation, y_validation = shuffle_split(X_train, y_train)

# Regressor comparison

In [6]:
def compute_mse(y_pred_1, y_validation):
    """Print and return the mean squared error between predictions and targets.

    The mean is taken along axis 0 of the squared differences.
    """
    squared_errors = np.square(y_pred_1 - y_validation)
    mse = squared_errors.mean(axis=0)
    print('MSE =',  mse)
    return mse

In [7]:
import sklearn
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.neural_network import MLPRegressor
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree

### Compare Different node embeddings

In [9]:

def train_test_graph_pipeline(embedding_string, y_train, X_train_1=0, y_train_1=0, X_validation=0, y_validation=0):
    """Fit a fixed set of regressors on one feature set and report validation MSEs.

    Two modes:

    * ``embedding_string != 'No'``: ``X_train_<embedding_string>.npy`` is
      loaded and randomly split 80/20 with ``shuffle_split``; the four split
      arguments are ignored.
    * ``embedding_string == 'No'``: the caller supplies the split through
      ``X_train_1``, ``y_train_1``, ``X_validation`` and ``y_validation``.

    Args:
        embedding_string: feature-set name, or ``'No'`` to use the given split.
        y_train: targets aligned with the rows of the saved training matrix.
        X_train_1, y_train_1, X_validation, y_validation: pre-made split,
            only used in ``'No'`` mode.

    Returns:
        np.ndarray with one validation MSE per regressor (6 values, in the
        order MLP, Lasso, ARD, BayesianRidge, SGD, KNN), or the scalar
        ``1000000000000`` when a feature file is missing.
    """
    print("----------------------------------------------------------------------------")
    print(embedding_string)
    print("----------------------------------------------------------------------------")

    if embedding_string != 'No':
        X_train_path = 'X_train_'+embedding_string+ '.npy'
        X_test_path = 'X_test_'+embedding_string+ '.npy'

        # Both files must exist even though only the training matrix is used
        # here (kept so missing feature sets are reported as before).
        if not os.path.exists(X_train_path) or not os.path.exists(X_test_path):
            print('path does not exist')
            return 1000000000000

        X_train = np.load(X_train_path).astype(float)

        #split training set into training_1 and validation
        X_train_1, y_train_1, X_validation, y_validation = shuffle_split(X_train, y_train)

    #now time to do a regression

    #to do: use this as a parameter
    reg1 = MLPRegressor(hidden_layer_sizes=(150,), activation='relu', solver='adam', alpha=0.001, verbose = 0, max_iter=100, tol = 1e-6, n_iter_no_change=10, early_stopping=True)#use pytorch implementation instead (way faster)
    # `Lasso` is not imported by name in this notebook (only LassoCV is), so
    # it is referenced through the package like the regressors below.
    reg2 = sklearn.linear_model.Lasso()
    reg3 = sklearn.linear_model.ARDRegression()#MSE=86 in 10 iterations
    reg4 = sklearn.linear_model.BayesianRidge() #MSE=86 in 8 iterations
    reg5  = sklearn.linear_model.SGDRegressor(loss='squared_error', early_stopping=True, n_iter_no_change=10)
    #reg6 = svm.SVR()
    reg7 = KNeighborsRegressor(n_neighbors = 20)
    #reg8 = tree.DecisionTreeRegressor()

    regressors = [reg1, reg2, reg3, reg4, reg5, reg7]
    mse = np.zeros(len(regressors))

    for i, reg in enumerate(regressors):
        print(str(reg))
        reg.fit(X_train_1, y_train_1)
        y_pred_1 = reg.predict(X_validation)
        mse[i]  = compute_mse(y_pred_1, y_validation)
        print('mse =', mse[i])

    return mse


In [None]:
# Evaluate every node-embedding feature set with the regressor panel.
# Long-running (authors' note: takes quite some time).

embedding_list = ['Node2Vec', 'BoostNE', 'NetMF', 'RandNE', 
                'Deepwalk','Diff2Vec', 'Role2Vec','Walklets',
                 'Walklets_2', 'Walklets_3', 'NEU_Walklets', 
                 'NMFADMM', 'LaplacianEigenmaps', 'HOPE', 'SocioDim', 'pca']

i=0
# One row per feature set, one column per regressor (the pipeline evaluates 6).
mses  = np.zeros((len(embedding_list), 6))

for embedding_string in embedding_list :
    # When the feature files are missing the pipeline returns a single large
    # sentinel value, which is broadcast over the whole row.
    mse = train_test_graph_pipeline(embedding_string, y_train)
    mses[i] = mse
    i+=1

print(mses)
mses = np.around(mses, decimals=2)
np.savetxt('embedding_results.csv', mses, delimiter=',')

### Compare Different word embeddings 

In [None]:
# Evaluate the abstract features built from each word-embedding model.
wes = ['abstract_gigaword', 'abstract_google', 'abstract_numberbatch',  'abstract_custom']
# One row per word-embedding model, one column per regressor.
mses = np.zeros((len(wes), 6))

for i, we in enumerate(wes):
    # Store the per-regressor MSEs in row i. The result used to be assigned
    # to `mses` itself, which replaced the result matrix and then stored a
    # stale `mse` left over from an earlier cell.
    mse = train_test_graph_pipeline(we, y_train)
    mses[i] = mse

print(mses)
mses = np.around(mses, decimals=2)
np.savetxt('word_embedding_results.csv', mses, delimiter=',')

### Compare Different feature combinations

In [None]:
# All possible combinations of the three feature blocks
# (ab = abstracts, gr = graph centralities, wa = Walklets_2 embedding).
# a_* are training matrices, b_* the matching test matrices.

(a_ab, b_ab) = (a1, b1)
(a_gr, b_gr) = (a2, b2)
(a_wa, b_wa) = (a3, b3)
(a_ab_gr, b_ab_gr) = (np.concatenate((a_ab, a_gr), axis=1), np.concatenate((b_ab, b_gr), axis=1))
(a_ab_wa, b_ab_wa) = (np.concatenate((a_ab, a_wa), axis=1), np.concatenate((b_ab, b_wa), axis=1))
# The test half used to be written to b_ab_wa, clobbering the previous pair.
(a_gr_wa, b_gr_wa) = (np.concatenate((a_gr, a_wa), axis=1), np.concatenate((b_gr, b_wa), axis=1))
# b_all must be built from the test blocks (it used to reuse the training ones).
(a_all, b_all) = (np.concatenate((a_ab, a_gr_wa), axis=1), np.concatenate((b_ab, b_gr_wa), axis=1))

comb = [(a_ab, b_ab), (a_gr, b_gr), (a_wa, b_wa), (a_ab_gr, b_ab_gr), (a_ab_wa, b_ab_wa), (a_gr_wa, b_gr_wa), (a_all, b_all)]

# One row per combination, one column per regressor.
mses  = np.zeros((len(comb), 6))

# Loop-local names are used on purpose: the global X_train / X_test and the
# X_train_1 / X_validation split are needed by later cells and must not be
# overwritten by whichever combination happens to come last.
for i, (X_comb_train, X_comb_test) in enumerate(comb):
    X_tr, y_tr, X_val, y_val = shuffle_split(X_comb_train, y_train)
    mse = train_test_graph_pipeline('No', y_train, X_train_1 = X_tr, y_train_1 = y_tr, X_validation = X_val, y_validation = y_val)
    mses[i] = mse
    print(mses)

In [30]:
# Show the raw MSE table, then save a copy rounded to two decimals as CSV.
print(mses)
table_2 = np.around(mses, decimals=2)
np.savetxt('combinations_results.csv', table_2, delimiter=',')

[[7.72300000e+01 1.15930000e+02 1.01020000e+02 1.00960000e+02
  1.43500000e+02 9.33600000e+01]
 [1.09970000e+02 1.31090000e+02 1.16180000e+02 1.31020000e+02
  2.86816982e+22 1.08430000e+02]
 [9.04200000e+01 1.57710000e+02 1.14740000e+02 1.14730000e+02
  1.17140000e+02 1.13940000e+02]
 [5.83700000e+01 1.07150000e+02 9.61700000e+01 9.61600000e+01
  1.75570283e+26 7.42400000e+01]
 [5.71400000e+01 1.18600000e+02 8.20600000e+01 8.20300000e+01
  9.71700000e+01 8.44900000e+01]
 [8.34000000e+01 1.23910000e+02 1.00950000e+02 1.00940000e+02
  2.87923367e+22 9.27100000e+01]
 [5.67100000e+01 1.07630000e+02 8.09000000e+01 8.08900000e+01
  2.31323571e+26 7.34300000e+01]]


# Regression with MLP

In [11]:
from matplotlib import pyplot as plt

In [9]:
def compute_mse(y_pred_1, y_validation):
    """Print and return the mean squared error between predictions and targets.

    This redefinition shadows the earlier ``compute_mse``; it returns the
    value as well, so code that stores the result (for example
    ``train_test_graph_pipeline``) keeps working after this cell has run.
    """
    mse = (np.square(y_pred_1 - y_validation)).mean(axis=0)
    print('MSE =',  mse)
    return mse

In [None]:
from sklearn.neural_network import MLPRegressor

# hidden_layer_sizes=(200) is the plain integer 200 (no trailing comma).
# Authors' note: a PyTorch implementation would be much faster.
reg = MLPRegressor(hidden_layer_sizes=(200), activation='relu', solver='adam', verbose = 1, max_iter=70, n_iter_no_change=10, early_stopping = True)#use pytorch implementation instead (way faster)

# fit on the training part of the train/validation split
reg.fit(X_train_1, y_train_1)

# alternative: fit on the whole training set
#reg.fit(X_train, y_train)

# training loss per iteration
plt.plot(reg.loss_curve_)
plt.show()

In [None]:
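# MSE obtained for different hidden-layer sizes (number of units passed as hidden_layer_sizes):
#200 units -> MSE = 56.98129538854695
#150 units -> MSE = 56.93900433307325
#100 units -> MSE = 56.1059605875759
#75 units -> MSE = 57.55111207305141
#125 units -> MSE = 56.478676360493374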

In [16]:
# Validation MSE of the fitted model. Only meaningful when `reg` was fitted on
# X_train_1 / y_train_1, i.e. without the validation rows.
y_pred_1 = reg.predict(X_validation)
print('data predicted')
compute_mse(y_pred_1, y_validation)

data predicted
MSE = 57.346453434697146


# Ensemble methods

## Bagging

In [14]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble of 8 MLP regressors, fitted in parallel (n_jobs=-1).
mlp = MLPRegressor(hidden_layer_sizes=(200), activation='relu', solver='adam', verbose = 1, max_iter=100, n_iter_no_change=15, early_stopping = True)
reg = BaggingRegressor(mlp, n_estimators= 8, n_jobs=-1, verbose = 1)

# alternative: fit on the training part of the train/validation split
#reg.fit(X_train_1, y_train_1)

# fit on the whole training set (this is the model used for the submission)
reg.fit(X_train, y_train)


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed: 15.8min remaining: 47.5min


In [None]:
# Validation MSE of the ensemble. X_validation is a subset of X_train, so this
# number is only meaningful when `reg` was fitted on X_train_1 / y_train_1;
# after fitting on the whole training set it is optimistically biased.
y_pred_1 = reg.predict(X_validation)
print('data predicted')
compute_mse(y_pred_1, y_validation)

In [25]:
# Write the predictions to 'submission.csv' (columns: author, hindex).
y_pred = reg.predict(X_test)
y_pred[y_pred < 0] = 0 #post-processing: negative predictions are set to 0
# np.round replaces the np.round_ alias, which was removed in NumPy 2.0.
# The array is assigned directly rather than through pd.Series, so a
# prediction vector whose length does not match df_test raises an error
# instead of being silently aligned on the index.
df_test['hindex'] = np.round(y_pred, decimals=3)
df_test.loc[:,["author","hindex"]].to_csv('submission.csv', index=False)
print('data written')

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    8.0s remaining:   56.4s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:   10.3s finished


data written
