## Predicting Missing links in a citation network

In [None]:
# global imports 
import random 
import numpy as np 
import pandas as pd
import jgraph ## this was previously known as igraph
import csv 
import matplotlib.pyplot as plt

# machine learning imports
from sklearn import svm 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn import preprocessing 

import spacy

ModuleNotFoundError: No module named 'jgraph'

### Import datasets

In [None]:
# Load the raw inputs (node metadata, the sample submission and the
# train/test pair lists) and give every frame explicit column names.
# NOTE(review): the two CSV files are read with pandas' default header
# handling, so their first line is treated as a header before the columns
# are renamed below -- confirm node_information.csv really has a header row.
nodes_info_df = pd.read_csv('./data/node_information.csv')
random_preds_df = pd.read_csv('./data/random_predictions.csv') 
# the pair lists are space separated and have no header line
test_set = pd.read_csv('./data/testing_set.txt', sep = ' ', header = None)
train_set = pd.read_csv('./data/training_set.txt', sep = ' ', header = None)
test_set.columns = ['source_id', 'target_id']
train_set.columns = ['source_id', 'target_id', 'label']
nodes_info_df.columns = ['paper_id', 'publication_year', 'title', 'author', 'journal_name', 'abstract']

## Exploratory Analysis

In [None]:
# Sanity check: how many distinct papers do we have metadata for, and how
# many papers referenced by the test pairs have no metadata at all?
known_papers = set(nodes_info_df['paper_id'])
print('Unique papers: ', len(known_papers))
# pd.concat replaces Series.append, which was removed in pandas 2.0
test_papers = set(pd.concat([test_set['source_id'], test_set['target_id']]))
# Plain set difference: only papers that appear in the test set but not in
# nodes_info_df. (The variable name is kept for any later cell that reads it.)
sym_diff = test_papers - known_papers
print('Unknown papers in test set (with nodes_info):', len(sym_diff))

In [None]:
# # get distribution of journal names 
# nodes_info_df['journal_name'] = nodes_info_df['journal_name'].fillna('unknown')
# nodes_info_df.journal_name.value_counts()[:15]

In [None]:
# nodes_info_df.author

## Feature generation

In [None]:
#Load Spacy
# Load the small English spaCy model with every component listed below
# disabled; spacy_nlp is used by tokenize() for tokenisation, lemmas and
# stop-word / punctuation flags.
# NOTE(review): tokenize() reads token.lemma_; depending on the spaCy version,
# disabling "tagger" may change what the lemmatizer produces -- confirm.
import en_core_web_sm
spacy_nlp = en_core_web_sm.load(disable=["tagger", "parser","ner","entity_linker","textcat","entity_ruler","sentencizer","merge_noun_chunks","merge_entities","merge_subtokens"])

### Text features generation 

In [None]:
import re 
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

def isNaN(string):
    """Return the result of comparing *string* with itself for inequality.

    NaN is the only common value that is not equal to itself, so this is
    True for a float NaN and False for ordinary strings, lists and None.
    """
    differs_from_itself = string != string
    return differs_from_itself

def filter_bad(alphabet):
    """Predicate for ``filter``: keep every token except ',' and None."""
    rejected = (',', None)
    return alphabet not in rejected
    
## possible formats of authors:
# several authors: separation via ','
# sometimes mentions the university eg '(montpellier)'
# sometimes mentions the first name 
# sometimes format is: firstname letter. lastname
def author_normalisation(authors):
    """Normalise a raw author string into a list of ``'f. lastname'`` names.

    The string is lower-cased, parenthesised affiliations such as
    ``(montpellier)`` are removed, and the remainder is split on commas.
    For an author made of two or three words the first word is reduced to
    an initial (kept as is when it already looks like ``'x.'``), a middle
    name is dropped and the last word is kept. For any other word count
    only the last word is kept.

    Parameters
    ----------
    authors : str or NaN
        Raw ``author`` field of one paper.

    Returns
    -------
    list of str, or ``np.nan`` when *authors* is not a string or contains
    no author name.
    """
    # Missing values arrive as float NaN; anything that is not text cannot
    # be parsed, so it is reported as missing too.
    if not isinstance(authors, str):
        return np.nan

    # Remove each "(affiliation)" separately. A greedy ``\(.*\)`` would also
    # delete every author written between the first and the last parenthesis.
    cleaned = re.sub(r'\([^)]*\)', ' ', authors.lower())

    final_authors = []
    # Split on commas with or without surrounding spaces, so that "a,b",
    # "a, b" and "a , b" are all read as two authors.
    for author in re.split(r'\s*,\s*', cleaned):
        names = author.split()
        if not names:
            # empty piece (e.g. trailing comma or removed affiliation)
            continue

        author_names = []
        if len(names) in (2, 3):
            first = names[0]
            # keep an existing initial such as "j.", otherwise build one
            if re.match(r'\w\.', first):
                author_names.append(first)
            else:
                author_names.append(first[0] + '.')
        # the last word is taken as the family name
        author_names.append(names[-1])
        final_authors.append(' '.join(author_names))

    if not final_authors:
        return np.nan
    return final_authors

def common_authors(string1, string2):
    """Return True when the two author collections share at least one item.

    Either argument being NaN yields False. Both arguments are turned into
    sets, so they are presumably lists of normalised author names (a plain
    string would be compared character by character) -- verify against callers.
    """
    if isNaN(string1) or isNaN(string2):
        return False

    shared = set(string1) & set(string2)
    return bool(shared)
    
def number_common_authors(string1, string2):
    """Placeholder that is not implemented yet; it currently returns None."""
    pass


def remove_special_characters(string):
    """Replace every run of non-word characters, digits or underscores by one space.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The text with only letters left, separated by single spaces where
        other characters used to be.
    """
    # Raw string: "\w" and "\d" in a normal literal are invalid escape
    # sequences and trigger warnings on recent Python versions.
    return re.sub(r"([^\w]|[\d_])+", " ", string)

def tokenize(string):
    """Return the lemmas of *string*, skipping punctuation and stop words.

    The text is processed with the module-level ``spacy_nlp`` pipeline.
    """
    kept_lemmas = []
    for token in spacy_nlp(string):
        # drop punctuation first, then stop words
        if token.is_punct or token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)
    return kept_lemmas

def recombining_tokens_into_a_string(list_of_tokens):
    """Join the tokens back into one space-separated string."""
    separator = " "
    return separator.join(list_of_tokens)


def create_tf_idf(column, tf_idf):
    """Vectorise *column* with TF-IDF.

    Parameters
    ----------
    column : iterable of str
        Documents to vectorise.
    tf_idf : fitted vectorizer or None
        When None a new ``TfidfVectorizer`` is created and fitted on
        *column*; otherwise the given vectorizer is reused (transform only),
        so that a second column shares the same vocabulary.

    Returns
    -------
    tuple
        ``(tf_idf_matrix, tf_idf)``: the transformed documents and the
        vectorizer that produced them.
    """
    # identity check: "== None" would invoke the object's own __eq__
    if tf_idf is None:
        tf_idf = TfidfVectorizer()
        X_tfidf_sample = tf_idf.fit_transform(column)
    else:
        X_tfidf_sample = tf_idf.transform(column)
    return X_tfidf_sample, tf_idf

def tf_idf_feature(column, dataset, tf_idf, author_or_not):
    """Clean one text column and turn it into a TF-IDF matrix.

    Parameters
    ----------
    column : str
        Name of the text column in *dataset*.
    dataset : pandas.DataFrame
        Frame holding the column. It is only read: the cleaned text is kept
        in a local Series, so running this again (or computing other
        features from the same frame) still sees the raw text.
    tf_idf : fitted vectorizer or None
        Passed through to ``create_tf_idf`` (None means "fit a new one").
    author_or_not : int
        1 for the author column (keep words of two or more characters),
        anything else for free text (lemmatise and drop stop words).

    Returns
    -------
    tuple
        ``(tf_idf_matrix, tf_idf)`` as returned by ``create_tf_idf``.
    """
    # Remove special characters from the text without touching the caller's frame
    text = dataset[column].apply(remove_special_characters)

    if author_or_not == 1:
        # author names: drop tokens shorter than two characters (initials)
        column_cleaned = text.str.findall(r'\w{2,}').str.join(' ')
    else:
        # free text: tokenize, keep lemmas of non stop words, rebuild a string
        column_cleaned = text.apply(
            lambda raw: recombining_tokens_into_a_string(tokenize(raw)))

    return create_tf_idf(column_cleaned, tf_idf)

# Compute the similarity between a column target and source
def compute_similarity(column, df_source, df_target, author_or_not):
    """Row-wise cosine similarity between a source and a target text column.

    Row ``i`` of the result compares ``df_source[column]`` and
    ``df_target[column]`` at position ``i``. The TF-IDF vocabulary is fitted
    on the source column and reused for the target column.

    Parameters
    ----------
    column : str
        Name of the text column present in both frames.
    df_source, df_target : pandas.DataFrame
        Frames with the same number of rows. Missing values in *column* are
        replaced by ``"unknown"`` in these frames.
    author_or_not : int
        Forwarded to ``tf_idf_feature`` (1 for the author column).

    Returns
    -------
    pandas.DataFrame
        A single column (named 0) holding one similarity per row.
    """
    # local import: only this function needs it
    from sklearn.preprocessing import normalize

    # Fill the NaNs by assignment; the chained
    # ``df[column].fillna(..., inplace=True)`` form does not update the
    # frame when pandas copy-on-write is active.
    df_source[column] = df_source[column].fillna("unknown")
    df_target[column] = df_target[column].fillna("unknown")

    # Fit the vocabulary on the source texts and reuse it for the targets
    tf_idf_source, tf_idf = tf_idf_feature(column, df_source, None, author_or_not)
    tf_idf_target, tf_idf = tf_idf_feature(column, df_target, tf_idf, author_or_not)

    # Cosine similarity of paired rows = dot product of the L2-normalised
    # rows. One sparse element-wise product replaces one cosine_similarity
    # call per row. All-zero rows stay zero and give a similarity of 0.
    source_unit = normalize(tf_idf_source)
    target_unit = normalize(tf_idf_target)
    row_similarity = source_unit.multiply(target_unit).sum(axis=1)

    return pd.DataFrame(np.asarray(row_similarity).reshape(-1, 1))

def reduce_matrix_width(source_df, target_df, n_components):
    """Project both matrices onto *n_components* principal components.

    The same PCA object is fitted first on *source_df* and then re-fitted on
    *target_df*; after each fit the total explained variance ratio is printed.
    NOTE(review): because the PCA is re-fitted, component ``k`` of the source
    and of the target are not the same direction -- confirm this is intended.

    Returns the reduced source matrix and the reduced target matrix.
    """
    pca_train = PCA(n_components=n_components)
    reduced = []
    for frame in (source_df, target_df):
        pca_train.fit(frame)
        reduced.append(pca_train.transform(frame))
        # share of the initial matrix explained by the reduced matrix
        print(sum(pca_train.explained_variance_ratio_))
    matrix_source_reduced, matrix_target_reduced = reduced
    return matrix_source_reduced, matrix_target_reduced

def journal_name_feature():
    """Build PCA-reduced one-hot journal features for train and test pairs.

    Works on the module-level frames ``train_source_info``,
    ``train_target_info``, ``test_source_info`` and ``test_target_info`` and
    adds a ``train_test`` marker column (1 = train, 0 = test) to each of them.
    Train and test are encoded together so both get the same set of columns.

    Returns
    -------
    tuple of four DataFrames
        ``(train_source, test_source, train_target, test_target)``, each
        holding only the 15 reduced journal components, with column names
        suffixed by ``_source`` or ``_target``.
    """
    # remember which rows come from train (1) and which from test (0)
    for frame, marker in ((train_source_info, 1), (train_target_info, 1),
                          (test_source_info, 0), (test_target_info, 0)):
        frame['train_test'] = marker

    # stack train on top of test
    combined_source = pd.concat([train_source_info, test_source_info], ignore_index=True)
    combined_target = pd.concat([train_target_info, test_target_info], ignore_index=True)

    # one-hot encode the journal names, then shrink to 15 components
    source_dummies = pd.get_dummies(combined_source['journal_name'])
    target_dummies = pd.get_dummies(combined_target['journal_name'])
    source_reduced, target_reduced = reduce_matrix_width(source_dummies, target_dummies, 15)

    # put the components next to the original columns
    combined_source = pd.concat([combined_source, pd.DataFrame(source_reduced)], axis=1)
    combined_target = pd.concat([combined_target, pd.DataFrame(target_reduced)], axis=1)

    # everything that is not a journal component gets dropped
    non_journal_columns = ['abstract','author','journal_name','label','paper_id','publication_year','source_id','target_id','title','train_test']

    def _split_train_test(combined, suffix):
        # return [train_part, test_part] with suffixed column names
        parts = []
        for marker in (1, 0):
            part = combined[combined["train_test"] == marker].drop(non_journal_columns, axis=1)
            part.columns = [str(col) + suffix for col in part.columns]
            parts.append(part)
        return parts

    train_source_journal, test_source_journal = _split_train_test(combined_source, '_source')
    train_target_journal, test_target_journal = _split_train_test(combined_target, '_target')
    return train_source_journal,test_source_journal,train_target_journal,test_target_journal
  

In [None]:
# Create the source and target info datasets: every train/test pair is
# left-joined with the metadata of its source paper and, separately, of its
# target paper, so pairs without metadata are kept with missing values.
train_source_info = train_set.merge(nodes_info_df, left_on='source_id', right_on='paper_id',how="left")
train_target_info = train_set.merge(nodes_info_df, left_on='target_id', right_on='paper_id',how="left")

test_source_info = test_set.merge(nodes_info_df, left_on='source_id', right_on='paper_id',how="left")
test_target_info = test_set.merge(nodes_info_df, left_on='target_id', right_on='paper_id',how="left")


In [None]:
## apply the same metadata features to the training set and to the test set
for _pairs, _source_info, _target_info in (
        (train_set, train_source_info, train_target_info),
        (test_set, test_source_info, test_target_info)):
    # normalised author lists of both papers
    _pairs['source_authors'] = _source_info.author.apply(author_normalisation)
    _pairs['target_authors'] = _target_info.author.apply(author_normalisation)

    # how many years after the target paper the source paper was published
    _pairs['publication_year_diff'] = _source_info.publication_year - _target_info.publication_year

    _pairs['source_journal'] = _source_info.journal_name
    _pairs['target_journal'] = _target_info.journal_name

    # 1 when both papers name the same journal. The comparison is vectorised
    # instead of a row-wise apply; a missing journal never equals anything,
    # so pairs with a missing journal get 0 exactly as before.
    _pairs['same_journal'] = (_pairs['source_journal'] == _pairs['target_journal']).astype(int)


In [None]:
# Text similarity features -- this might take some time to run.
# Each entry is (text column, author flag passed to compute_similarity).
_similarity_columns = (("title", 0), ("abstract", 0), ("author", 1))

## apply the features to training set
for _text_column, _author_flag in _similarity_columns:
    train_set['similarity_' + _text_column] = compute_similarity(
        _text_column, train_source_info, train_target_info, _author_flag)

## apply features to test set
for _text_column, _author_flag in _similarity_columns:
    test_set['similarity_' + _text_column] = compute_similarity(
        _text_column, test_source_info, test_target_info, _author_flag)

In [None]:
# journal_name feature: reduced one-hot journal components for the
# source and target paper of every train and test pair
train_source_journal,test_source_journal,train_target_journal,test_target_journal =journal_name_feature()

In [None]:
# Add the journal_name components to the train and test pairs.
for _journal_part in (train_source_journal, train_target_journal):
    train_set = pd.concat([train_set, _journal_part], axis=1)

# The test parts still carry their row labels from the combined train+test
# frame, so they are renumbered from 0 before being placed next to test_set.
for _journal_part in (test_source_journal, test_target_journal):
    test_set = pd.concat([test_set, _journal_part.reset_index(drop=True)], axis=1)

### Graph features generation 

In [None]:
import networkx as nx 
# Helper functions computing graph features for a (source, target) pair.
# shortest_path_info is not used below (its call in get_features is commented out).
def shortest_path_info(some_graph, source, target):
    """Length of the shortest path from *source* to *target*.

    Returns -1 when either node is not in the graph and -2 when both are
    present but no path connects them.
    """
    if source not in some_graph.nodes() or target not in some_graph.nodes():
        return -1  # not known

    if not nx.has_path(some_graph, source, target):
        return -2  # no path

    return nx.dijkstra_path_length(some_graph, source=source, target=target)

def degree_centrality(some_graph):
    """Return a ``{node: degree}`` dictionary for every node of the graph.

    Despite the name this is the raw degree reported by ``some_graph.degree``.
    """
    return {node: degree for node, degree in some_graph.degree(some_graph.nodes())}

def get_in_out_degree(some_graph):
    """Return two ``{node: degree}`` dictionaries: in-degrees, then out-degrees."""
    incoming = {node: degree for node, degree in some_graph.in_degree(some_graph.nodes())}
    outgoing = {node: degree for node, degree in some_graph.out_degree(some_graph.nodes())}
    return incoming, outgoing
    

def common_neighs(some_graph, x, y):
    """Return ``(count, sorted list)`` of the neighbours shared by *x* and *y*.

    ``(0, [])`` is returned when either node is not in the graph.
    """
    if x not in some_graph.nodes() or y not in some_graph.nodes():
        return 0, []  # not known

    shared = sorted(nx.common_neighbors(some_graph, x, y))
    return len(shared), shared

def jac_index(g, x, y):
    """Jaccard coefficient of the pair ``(x, y)`` in graph *g*.

    Returns -1 when either node is not in the graph.
    """
    if x not in g.nodes() or y not in g.nodes():
        return -1  # not known

    # a single pair is queried; keep the last score, 0 if nothing is yielded
    scores = [p for _, _, p in nx.jaccard_coefficient(g, [(x, y)])]
    return scores[-1] if scores else 0

def pref_attachement(g, x, y):
    """Preferential attachment score of the pair ``(x, y)`` in graph *g*.

    Returns -1 when either node is not in the graph.
    """
    if x not in g.nodes() or y not in g.nodes():
        return -1  # not known

    # a single pair is queried; keep the last score, 0 if nothing is yielded
    scores = [p for _, _, p in nx.preferential_attachment(g, [(x, y)])]
    return scores[-1] if scores else 0

def aa_index(g, x, y):
    """Adamic-Adar index of the pair ``(x, y)`` in graph *g*.

    Returns -1 when either node is not in the graph.
    """
    if x not in g.nodes() or y not in g.nodes():
        return -1  # not known

    # a single pair is queried; keep the last score, 0 if nothing is yielded
    scores = [p for _, _, p in nx.adamic_adar_index(g, [(x, y)])]
    return scores[-1] if scores else 0



In [None]:
# Create the citation network from the training pairs.
# Only rows with label == 1 are real citations. Building the graph from every
# row would also add an edge for each pair known NOT to cite each other.
citation_pairs = train_set.loc[train_set.label == 1, ['source_id', 'target_id']]
train_G = nx.from_pandas_edgelist(citation_pairs, source='source_id', target='target_id',
                                  edge_attr=None, create_using=nx.DiGraph())

# Keep every paper that appears in the training pairs as a node, even when it
# has no citation edge, so the per-node dictionaries below cover all of them.
train_G.add_nodes_from(pd.concat([train_set.source_id, train_set.target_id]).unique().tolist())

# make sure you also have an undirected graph
train_G_ud = train_G.to_undirected()

# create some dictionaries ({node: value}) to use later on
clustering_coeff_dict = nx.clustering(train_G_ud)
avg_neigh_degree_dict = nx.average_neighbor_degree(train_G)
out_degree_centrality = nx.out_degree_centrality(train_G)
in_degree_centrality = nx.in_degree_centrality(train_G)
# nx.pagerank is available in every networkx release; pagerank_scipy was
# removed in networkx 3.0
page_rank = nx.pagerank(train_G)
hub_score, authority_score = nx.hits(train_G)

In [None]:
# function to get features for graph of a single element
def get_features(directed_graph, ud_graph, source_id, target_id, label):
    """Compute the graph features of one (source, target) pair.

    Pair-level scores come from the undirected graph; node-level scores are
    read from the module-level dictionaries (``page_rank``, ``hub_score``,
    ``authority_score``, ``clustering_coeff_dict``, ``out_degree_centrality``,
    ``in_degree_centrality``, ``avg_neigh_degree_dict``).

    Parameters
    ----------
    directed_graph : networkx.DiGraph
        Not used by the current features; kept for the existing call sites.
    ud_graph : networkx.Graph
        Undirected view used for the pair-level scores.
    source_id, target_id : hashable
        Node identifiers of the pair.
    label : int
        Copied unchanged into the output row.

    Returns
    -------
    list
        Values in the same order as ``column_names``.
    """
    # A paper missing from the graph dictionaries gets -1, the same
    # "not known" marker the pair-level helpers return, instead of raising
    # KeyError in the middle of the feature loop.
    unknown = -1

    # features for undirected graph
    jaccard_index = jac_index(ud_graph, source_id, target_id)
    preferencial_attachment = pref_attachement(ud_graph, source_id, target_id)
    number_common_neighbours, _ = common_neighs(ud_graph, source_id, target_id)
    adamic_adar_index = aa_index(ud_graph, source_id, target_id)

    source_pr = page_rank.get(source_id, unknown)
    source_hub_score = hub_score.get(source_id, unknown)
    source_authority_score = authority_score.get(source_id, unknown)
    source_cluster_coeff = clustering_coeff_dict.get(source_id, unknown)
    source_out_centrality = out_degree_centrality.get(source_id, unknown)
    source_avg_neigh_degree = avg_neigh_degree_dict.get(source_id, unknown)

    target_pr = page_rank.get(target_id, unknown)
    target_hub_score = hub_score.get(target_id, unknown)
    target_authority_score = authority_score.get(target_id, unknown)
    target_cluster_coeff = clustering_coeff_dict.get(target_id, unknown)
    target_in_centrality = in_degree_centrality.get(target_id, unknown)
    target_avg_neigh_degree = avg_neigh_degree_dict.get(target_id, unknown)

    # no name feature but supposedly important: product of the source
    # out-centrality and the target in-centrality (only when both are known,
    # otherwise two -1 markers would multiply into a misleading +1)
    if source_id in out_degree_centrality and target_id in in_degree_centrality:
        feature_n = source_out_centrality * target_in_centrality
    else:
        feature_n = unknown

    return [source_id, target_id, label, jaccard_index, preferencial_attachment,
            number_common_neighbours, adamic_adar_index, source_pr, target_pr,
            source_hub_score, target_hub_score, source_authority_score,
            target_authority_score, source_cluster_coeff, target_cluster_coeff,
            source_out_centrality, target_in_centrality, source_avg_neigh_degree,
            target_avg_neigh_degree, feature_n]
    

### IMPORTANT: add column names when adding new features to the dataset 

In [None]:
### Extend this list whenever get_features returns a new value
### (same order as the list returned by get_features).
column_names = [
    'source_id', 'target_id', 'label',
    'jaccard_index', 'preferential_attachement',
    'number_common_neighbours', 'adamic_adar_index',
    'source_pr', 'target_pr',
    'source_hub_score', 'target_hub_score',
    'source_authority_score', 'target_authority_score',
    'source_cluster_coeff', 'target_cluster_coeff',
    'source_out_centrality', 'target_in_centrality',
    'source_avg_neigh_degree', 'target_avg_neigh_degree',
    'feature_n',
]

# NaN-filled placeholders with one row per train / test pair
final_train_set = pd.DataFrame(np.nan, index=range(train_set.shape[0]), columns=column_names)
final_test_set = pd.DataFrame(np.nan, index=range(test_set.shape[0]), columns=column_names)

In [None]:
# Create the graph features for the train set.
# All rows are computed first and the frame is built once; assigning row by
# row with .loc inside an iterrows loop is far slower and stores the
# integer ids as floats.
final_train_set = pd.DataFrame(
    [get_features(train_G, train_G_ud, source_id, target_id, label)
     for source_id, target_id, label in zip(train_set.source_id,
                                            train_set.target_id,
                                            train_set.label)],
    columns=column_names,
    index=train_set.index)

In [None]:
# Create the graph features for the test set (the label is unknown, so -1 is
# stored as a placeholder). The frame is built once from all rows instead of
# being filled row by row with .loc inside an iterrows loop.
final_test_set = pd.DataFrame(
    [get_features(train_G, train_G_ud, source_id, target_id, -1)
     for source_id, target_id in zip(test_set.source_id, test_set.target_id)],
    columns=column_names,
    index=test_set.index)

In [None]:
# Merge graph and text features together: left join of the pair frames with
# their graph features on the pair identifiers (plus the label for train).
# NOTE(review): if the same pair occurs more than once, a key-based merge
# repeats its rows -- confirm the pairs are unique.
train_set = pd.merge(train_set, final_train_set,
                     how='left', on=['source_id', 'target_id', 'label'])
test_set = pd.merge(test_set, final_test_set,
                    how='left', on=['source_id', 'target_id'])

In [None]:
from networkx import betweenness_centrality
from networkx import edge_betweenness_centrality
from networkx import load_centrality
from networkx import eigenvector_centrality

def graph_features(directed_graph, dataframe_dataset):
    """Add node centrality columns to a frame of (source, target) pairs.

    Three centralities are computed on *directed_graph* and looked up for the
    ``source_id`` of every row. Papers absent from the graph get NaN.

    NOTE(review): the values used to be aligned on the frame's row index,
    which matched row numbers against paper ids. They are now looked up by
    the row's source paper -- confirm whether the target paper's centralities
    are wanted as well.

    Parameters
    ----------
    directed_graph : networkx.DiGraph
        Graph the centralities are computed on.
    dataframe_dataset : pandas.DataFrame
        Pair frame with a ``source_id`` column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with ``betweeness_centrality``, ``load_centrality``
        and ``eigen_centrality`` columns added.
    """
    # shortest-path betweenness centrality for nodes
    between_centrality = betweenness_centrality(directed_graph)
    # load centrality: fraction of all shortest paths that pass through a node
    ld_centrality = load_centrality(directed_graph)
    # eigenvector centrality
    eig_centrality = eigenvector_centrality(directed_graph)

    source_nodes = dataframe_dataset['source_id']
    # the betweenness column used to be filled from eig_centrality by mistake
    dataframe_dataset['betweeness_centrality'] = source_nodes.map(between_centrality)
    dataframe_dataset['load_centrality'] = source_nodes.map(ld_centrality)
    dataframe_dataset['eigen_centrality'] = source_nodes.map(eig_centrality)

    return dataframe_dataset

In [None]:
# Add the centrality columns to both sets; papers without a value get -1.
# The fill is written as an assignment because
# ``frame.column.fillna(..., inplace=True)`` acts on a temporary object and
# does not update the frame when pandas copy-on-write is active.
centrality_columns = ['betweeness_centrality', 'load_centrality', 'eigen_centrality']

train_set = graph_features(train_G, train_set)
train_set[centrality_columns] = train_set[centrality_columns].fillna(-1)

test_set = graph_features(train_G, test_set)
test_set[centrality_columns] = test_set[centrality_columns].fillna(-1)

In [None]:
# Write the feature frames out so that you do not have to run everything again.
# The row index is not written to the files.
train_set.to_csv('final_train.csv',index=False)
test_set.to_csv('final_test.csv', index=False)

### Can start from here as well when features were saved previously

In [None]:
# Reload the feature frames saved by the cell above
test_set = pd.read_csv('final_test.csv')
train_set = pd.read_csv('final_train.csv')

### Final clean (i.e replacing nans etc)

In [None]:
# Fill the remaining missing values.
for _frame in (train_set, test_set):
    # -24 marks an unknown publication year difference (value chosen by the
    # original authors; its rationale is not documented here).
    # Assignment is used instead of the chained ``fillna(inplace=True)``,
    # which does not update the frame under pandas copy-on-write.
    _frame['publication_year_diff'] = _frame['publication_year_diff'].fillna(-24)

    # Only non-numeric columns receive the 'unknown' placeholder; writing a
    # string into a numeric column would turn the whole column into object
    # dtype and make it unusable as a model feature.
    _text_columns = _frame.select_dtypes(exclude='number').columns
    _frame[_text_columns] = _frame[_text_columns].fillna('unknown')

In [None]:
# preview the first rows of the test features
test_set.head()

In [None]:
# preview the first rows of the train features
train_set.head()

In [None]:
# Check the type of each column: columns used as model features should not
# be of object dtype.
train_set.dtypes

In [None]:
%matplotlib inline
## Most interesting correlation is with label
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(14,12))
# Correlations only exist for numeric data. The text columns (authors,
# journals) are excluded explicitly, because recent pandas versions raise
# instead of silently skipping them.
sns.heatmap(train_set.select_dtypes(include=['number', 'bool']).corr(),
            vmax=0.5,
            square=True,
            annot=True)


## Learning Stuff

In [None]:
# Separate features from labels.
# Columns that are the target itself or raw text are not model features;
# names missing from train_set are ignored.
non_feature_columns = ['label', 'common_authors', 'source_authors',
                       'target_authors', 'source_journal', 'target_journal']
X = train_set.drop(columns=non_feature_columns, errors='ignore')

# Integer class labels. The conversion result is now actually stored, and the
# builtin int replaces np.int, which no longer exists in current NumPy.
y = train_set['label'].astype(int)


In [None]:
# Final feature correlation: the selected features together with the label
ff = X.copy()
ff['label'] = y
plt.figure(figsize=(14,12))
# plot ff (features + label); X alone would leave the label out of the map
sns.heatmap(ff.corr(),
            vmax=0.5,
            square=True,
            annot=True)

In [None]:
## Train different models and compare the performance 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import  f1_score, confusion_matrix
from sklearn.model_selection import cross_validate

# AdaBoost baseline scored with F1 over 5 cross-validation folds
model = AdaBoostClassifier(n_estimators=75, learning_rate=1)
scores = cross_validate(model, X, y, scoring='f1', 
                        cv=5) # n_jobs is not passed here, so the cross_validate default applies
scores

In [None]:
# Summary statistics (count, min/max, mean, variance, ...) of the per-fold
# test scores returned by cross_validate
from scipy import stats 
stats.describe(scores['test_score'])

In [None]:
# Random forest with default settings, scored with F1 over 5 folds
model = RandomForestClassifier()
scores = cross_validate(model, X, y, scoring='f1', 
                        cv=5) # n_jobs is not passed here, so the cross_validate default applies
scores

In [None]:
# Summary statistics of the per-fold test scores of the random forest
from scipy import stats 
stats.describe(scores['test_score'])

### Recursive feature selection 

In [None]:
# ## ONLY RUN AT THE END FOR GRAPHS.. takes a v.long time to execute (been 3hours for now.. only execute on a virtual 
# # machine with GPUs (if possible))
# from sklearn.feature_selection import RFECV

# clf_rf_4 = model
# rfecv = RFECV(estimator=clf_rf_4, step=1, cv=10,scoring='f1')   #10-fold cross-validation
# rfecv = rfecv.fit(X, y)

# print('Optimal number of features :', rfecv.n_features_)
# print('Best features :', X.columns[rfecv.support_])

In [None]:
# Plot number of features VS. cross-validation scores.
# Needs `rfecv` from the recursive feature selection cell above, which is
# currently commented out: run that cell first.
import matplotlib.pyplot as plt

# Newer scikit-learn releases expose the scores as
# cv_results_["mean_test_score"]; grid_scores_ only exists in older ones.
if hasattr(rfecv, "cv_results_"):
    rfecv_scores = rfecv.cv_results_["mean_test_score"]
else:
    rfecv_scores = rfecv.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score of number of selected features")
plt.plot(range(1, len(rfecv_scores) + 1), rfecv_scores)
plt.show()

## prior to authors:
DescribeResult(nobs=10, minmax=(0.7092423428264374, 0.7505859928392963), mean=0.7330286516063008, variance=0.0002449243278408503, skewness=-0.16892931758355367, kurtosis=-1.5003847605685021)

after some basic graphs:
DescribeResult(nobs=10, minmax=(0.9537111539570966, 0.9556853523477206), mean=0.9544708719147975, variance=4.3393884483164826e-07, skewness=0.7947367347642024, kurtosis=-0.6317507457312379)

### Comparing models

## 1. XG Boost

1.1 XGBoost base model

In [None]:
from xgboost.sklearn import XGBClassifier

# Make sure the test and the train data have the same sequence of columns.
# `test` was previously read before being defined; it is now taken from
# test_set.
test = test_set[X.columns]

# defining the base model
xgb_model_base = XGBClassifier(n_estimators = 100)

# printing the cross validation scores for the classifier
# (n_jobs is the number of cpus to use, -1 => all)
scores = cross_validate(xgb_model_base, X, y.values.ravel(), scoring='f1',
                        cv=3, n_jobs = -1)
print(scores)

# fitting on the complete training data
xgb_model_base.fit(X, y.values.ravel())

# predicting the outcome for the test pairs
predictions = xgb_model_base.predict(test)

# write out: one row per test pair, identified by its row index
final_df = pd.DataFrame({'id': list(test_set.index), 'category': predictions})

# 3: write file out
final_df.to_csv('submission.csv',index=False, sep=',')

1.2 XGBoost with random search

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# defining the search grid
random_grid = {
     "n_estimators"     : [int(x) for x in np.linspace(50, 600, num = 20)],
     "learning_rate"    : [0.01, 0.02, 0.05, 0.10 ] ,
     "max_depth"        : [ 6, 8, 10, 12, 15, 20],
     "min_child_weight" : [ 1, 3, 5, 7 ],
     "gamma"            : [ 0.3, 0.4, 0.7, 0.9 ],
     "colsample_bytree" : [ 0.05, 0.1, 0.3, 0.4] }

# Use the random grid to search for best hyperparameters

# First create the base model to tune
xgb_model = XGBClassifier()

# Random search of parameters
xgb_random = RandomizedSearchCV(estimator = xgb_model, param_distributions = random_grid,
                                n_iter = 10, cv = 3, verbose=2, random_state=42,
                                n_jobs = -1, scoring = 'f1_weighted')

# The search has to be fitted before best_estimator_ exists
xgb_random.fit(X, y.values.ravel())
optimised_xgb_random = xgb_random.best_estimator_

# printing the cross validation scores for the tuned classifier
# (n_jobs is the number of cpus to use, -1 => all)
scores = cross_validate(optimised_xgb_random, X, y.values.ravel(), scoring='f1',
                        cv=3, n_jobs = -1)
print(scores)

# best_estimator_ is already refitted on the complete training data by the
# search (default refit behaviour), so it can predict directly.
# Same columns, in the same order, as the training features.
test = test_set[X.columns]
predictions = optimised_xgb_random.predict(test)

# write out: one row per test pair, identified by its row index
final_df = pd.DataFrame({'id': list(test_set.index), 'category': predictions})

# 3: write file out
final_df.to_csv('submission.csv',index=False, sep=',')

## 2. Support Vector Machine

In [None]:
from sklearn.svm import LinearSVC

# SVM has a zero tolerance towards null values, hence replacing them by 0.
# The test features are taken from test_set with the same columns, in the
# same order, as the training features.
XVM = X.fillna(value=0)
test_SVM = test_set[X.columns].fillna(value=0)

clf = LinearSVC( tol=1e-4)

# printing the cross validation scores for the classifier
# (n_jobs is the number of cpus to use, -1 => all)
scores = cross_validate(clf, XVM, y, scoring='f1',
                        cv=10, n_jobs = -1)
print(scores)

# fitting on the complete training data
clf.fit(XVM, y)

# predicting the outcome for the test pairs
prediction_clf = clf.predict(test_SVM)

# write out the SVM predictions (not the `predictions` left over from an
# earlier model)
final_df = pd.DataFrame({'id': list(test_set.index), 'category': prediction_clf})

# 3: write file out
final_df.to_csv('submission.csv',index=False, sep=',')

## 3. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 1: retrain the complete model -> don't forget to change this to optimal one @ end
# (a random forest with default settings, fitted on all training pairs)
final_model = RandomForestClassifier()
final_model.fit(X, y)

In [None]:
# 2: predict on the test set
# Take exactly the training feature columns, in the training order, instead
# of excluding a hand-written list of names: the model must see the same
# features it was fitted on.
final_test_set = test_set[X.columns]
predictions = final_model.predict(final_test_set)

# write out: one row per test pair, identified by its row index
final_df = pd.DataFrame({'id': list(test_set.index), 'category': predictions})

# 3: write file out
final_df.to_csv('submission.csv',index=False, sep=',')

In [None]:
# Plot the feature importances of the fitted random forest: the ten
# largest values, labelled with the training column names, as horizontal bars
feat_importances = pd.Series(final_model.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

## The end