# Data importation

In [1]:
import random
import numpy as np
from sklearn import svm
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import csv
import pandas as pd
from nltk.stem import SnowballStemmer
import matplotlib.pyplot as plt
import networkx as nx
import re

%matplotlib inline

In [2]:
# Paper metadata. The file has no header row, so column names are supplied
# here; the first column ("Id") becomes the index of the frame.
info = pd.read_csv(
    "node_information.csv",
    sep=",",
    header=None,
    index_col=0,
    names=["Id", "year", "title", "authors", "journal", "abstract"],
)

In [3]:
# Space-separated pair files without a header row, so columns are named 0, 1, ...
X_train = pd.read_csv("training_set.txt", header=None, sep=" ")
X_test = pd.read_csv("testing_set.txt", header=None, sep=" ")

# Column 2 of the training file is the label: pop() removes it from the
# feature frame and hands it back in a single step.
y_train = X_train.pop(2)

print("{} {} {}".format(info.shape, X_train.shape, X_test.shape))

(27770, 5) (615512, 2) (32648, 2)


In [4]:
######################
### FOR VALIDATION ###
######################
# Hold out 20% of the labelled pairs to estimate the accuracy locally.
# NOTE: this rebinds X_test (until now the pairs read from testing_set.txt)
# to the held-out part of the training set.

#####################
try:
    # Location of train_test_split in scikit-learn >= 0.18.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only provide the (since removed) cross_validation module.
    from sklearn.cross_validation import train_test_split

# Fixed seed so that the split, and every score computed from it, is the same
# on each run of the notebook.
VALIDATION_SEED = 0
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=VALIDATION_SEED
)
#####################

#####################
# Optional sub-sampling for quick experiments: uncomment the slices below to
# work on the first rows only.
small_portion_to_train = 50000
small_portion_to_test  = 5000
#X_train = X_train[:small_portion_to_train]
#y_train = y_train[:small_portion_to_train]

#X_test  = X_test[:small_portion_to_test]
#y_test = y_test[:small_portion_to_test]
#####################
#####################

# Feature Preprocess

- `list_authors`: for each paper, the list of its authors
- `list_universities`: for each paper, the list of universities (affiliations) its authors are from

In [5]:
def universities_to_keep(authors, universities):
    """Strip parenthesised affiliations out of an author string.

    Every ``(...)`` group found in ``authors`` is removed from the string and
    its content is appended to ``universities``. An opening parenthesis that
    is never closed swallows the rest of the string. A ``)`` that has no ``(``
    before it is left in place.

    Parameters
    ----------
    authors : str
        Raw author string, possibly containing ``(affiliation)`` groups.
    universities : list
        List the affiliations are appended to (modified in place).

    Returns
    -------
    tuple
        ``(authors, universities)``: the string without the affiliation
        groups, and the same list object that was passed in.
    """
    while True:
        start = authors.find('(')
        if start == -1:
            break

        # Look for the closing parenthesis *after* the opening one. Searching
        # from the beginning of the string would pick up a stray ')' located
        # before the '(' and the loop would then never terminate.
        end = authors.find(')', start + 1)
        if end == -1:
            # Unclosed group: everything after '(' is the affiliation.
            universities.append(authors[start + 1:])
            authors = authors[:start]
            break

        universities.append(authors[start + 1:end])
        authors = authors[:start] + authors[end + 1:]

    return authors, universities


def name_to_keep(author):
    """Reduce an author name to the form ``"<first initial>. <last token>"``.

    Surrounding spaces are removed first. A name made of a single token is
    returned as is (trimmed); a blank string yields ``''``. Otherwise a space
    is inserted after every dot so that glued initials ("j.r.r. tolkien") are
    split into separate tokens, and the result is built from the first
    character of the first token and the whole last token.

    Parameters
    ----------
    author : str
        One author name, possibly padded with spaces.

    Returns
    -------
    str
        The shortened name.
    """
    # Trim before testing for a single token: otherwise " doe" counts as two
    # tokens and becomes "d. doe", and an all-space string raises IndexError.
    author = author.strip(' ')
    if ' ' not in author:
        return author

    # Make sure every dot is followed by exactly one space, then trim again
    # because a trailing dot leaves a trailing space behind.
    author = author.replace('.', '. ')
    author = author.replace('.  ', '. ')
    author = author.strip(' ')

    tokens = author.split(' ')
    return tokens[0][0] + '. ' + tokens[-1]

# Transform the concatenated author string of every paper into a list of
# shortened author names, and collect the affiliations written in parentheses.
# NOTE: info['authors'] is overwritten with lists at the end of this cell, so
# running the cell a second time fails on `.lower()`; reload `info` first.
list_authors = []
list_universities = []

info['authors'] = info['authors'].replace(np.nan, 'missing')
for authors in info['authors']:
    if authors != 'missing':
        authors = authors.lower()

        ### Find the universities included in the name
        universities = []
        authors, universities = universities_to_keep(authors, universities)

        ### Split the authors
        authors = re.split(',|&', authors)

        ### Keep just the shortened names (To be improved)
        authors_in_article = []
        for author in authors:
            # Skip empty and whitespace-only fragments (left behind by a
            # trailing separator or a removed affiliation): they are not
            # authors and would otherwise be stored as the name ''.
            if author.strip():
                authors_in_article.append(name_to_keep(author))

        # A string holding nothing but separators/affiliations has no author
        # left: fall back to the same marker as papers without author data.
        if not authors_in_article:
            authors_in_article = ['missing']

        list_universities.append(universities)
        list_authors.append(authors_in_article)
    else:
        list_universities.append(['missing'])
        list_authors.append(['missing'])

info['authors'] = list_authors
info['universities'] = list_universities

# Topologic features

In [6]:
def make_graph(X_train, y_train, X_test):
    """Build the directed graph of the known links.

    Every id found in the first two columns of ``X_train`` and ``X_test``
    becomes a node. An edge (column 0 -> column 1) is added only for the
    training rows whose label in ``y_train`` equals 1.

    Returns
    -------
    networkx.DiGraph
    """
    graph = nx.DiGraph()

    # Put the label next to its pair so one pass over the rows is enough.
    labelled_pairs = pd.concat([X_train, y_train], axis = 1).values
    for row in labelled_pairs:
        source, target = row[0], row[1]
        graph.add_node(source)
        graph.add_node(target)
        if row[-1] == 1:
            graph.add_edge(source, target)

    # Test pairs only contribute their nodes, never an edge.
    for row in X_test.values:
        graph.add_node(row[0])
        graph.add_node(row[1])

    return graph

In [7]:
# Graph shared by all the topologic features: nodes come from the train and
# test pairs, edges only from the training pairs labelled 1.
G = make_graph(X_train, y_train, X_test)  

In [8]:
def create_topologic_features(X, G):
    """Return a copy of ``X`` with the graph-based feature columns added.

    ``X`` itself is not modified. The helper functions called here are not
    defined in the visible part of this notebook and must be in scope; each
    receives the raw value array of ``X`` together with the graph ``G``.
    """
    features = X.copy()
    pairs = X.values

    # Features produced by a helper that returns a single column.
    single_column_features = (
        ('Betweeness centrality', compute_betweeness_array),
        ('Number common neighbours', make_common_neighbors),
        ('Jaccard coefficienf', make_jaccard),
    )
    for column, builder in single_column_features:
        features[column] = builder(pairs, G)

    # This helper returns two columns at once.
    inlink_difference, target_inlinks = compute_diff_inlinks(pairs, G)
    features['Difference in inlinks coefficient'] = inlink_difference
    features["Number of times to cited"] = target_inlinks

    features['Same cluster'] = same_community(pairs, G)
    return features

In [9]:
%%time
# NOTE(review): this import rebinds `create_topologic_features`, so the
# function defined in the cell above is replaced by the one from the local
# module of the same name.
from create_topologic_features import create_topologic_features
# Train and test pairs are both described with the same graph G.
X_train = create_topologic_features(X_train, G)
X_test = create_topologic_features(X_test, G)

CPU times: user 2min 24s, sys: 1.67 s, total: 2min 25s
Wall time: 2min 25s


# Semantic features
- Cosine similarity between the titles (tf-idf representation)
- Cosine similarity between the abstracts (tf-idf representation)
- Cosine similarity between the titles (word2vec representation)
- Cosine similarity between the abstracts (word2vec representation)

### To try
- Difference between the cosine similarities?
- Keep the stopwords or not?
- Stem the words or not?

# Attribute features

- Difference in publication year
- Number of common authors
- Self-citation
- Same journal
- Number of times the "to" paper is cited (attractiveness of the "to" paper)

### To try
- Number of times each author of the "to" paper is cited (sum over its authors)?
- Number of times each journal is cited?
- Number of universities in common?

In [10]:
%%time
# create_attribute_features lives in a local module that is not shown here;
# it receives the pair frame and the paper metadata `info`.
# NOTE(review): presumably it adds the attribute columns listed above
# (publication-year difference, common authors, ...) -- confirm in the module.
from create_attribute_features import create_attribute_features
X_train = create_attribute_features(X_train,info)
X_test = create_attribute_features(X_test,info)

CPU times: user 9min 59s, sys: 3.78 s, total: 10min 3s
Wall time: 10min 2s


# Author Graph features

In [11]:
%%time 
from author_graph import make_graph_authors, create_topologic_features_authors

# The author graph is built from the training pairs and their labels only;
# the same graph is then used to compute the features of both frames.
G_authors = make_graph_authors(X_train, y_train, info)
# All three optional feature groups are enabled for train and test alike.
X_train = create_topologic_features_authors(X_train, G_authors, info, betweeness = True, common_neigh_and_jacc = True, inlinks = True)
X_test = create_topologic_features_authors(X_test, G_authors, info,  betweeness = True, common_neigh_and_jacc = True, inlinks = True)

CPU times: user 35min 37s, sys: 6.5 s, total: 35min 43s
Wall time: 36min 42s


In [12]:
# Preview of the training frame: columns 0 and 1 are the two paper ids of the
# pair, the remaining columns are the features added by the cells above.
X_train.head()

Unnamed: 0,0,1,Betweeness centrality,Number common neighbours,Jaccard coefficienf,Difference in inlinks coefficient,Number of times to cited,Same cluster,Diff publication,Number same authors,Self citation,Same journal,Authors betweeness,Authors common neighbors,Authors jaccard,Authors max difference in inlinks,Authors sum difference in inlinks,Authors max of times to cited,Authors sum of times to cited,Authors of times to cited
109813,7228,9404020,-0.000108,0,0.0,2,3,0,6,0,0,0,-0.074968,153,0.140625,270,315,496,450,248
435577,9802032,9507048,-0.002341,6,0.031414,107,123,1,3,0,0,0,-1.015983,584,0.09347,4902,4460,5170,5170,5170
393549,9707211,9909071,-0.000612,0,0.0,-1,3,0,-2,0,0,0,-0.025411,10,0.030488,60,60,93,93,93
585839,9711149,9801034,-0.000792,0,0.0,10,10,0,-1,1,1,0,0.0,6217,1.0,0,0,5170,5170,5170
573847,4048,9605158,0.0,2,0.066667,7,11,1,4,1,1,0,0.018972,229,0.361769,-48,-60,393,122,113


In [13]:
# Same preview for the evaluation frame; it should expose the same columns
# as X_train.
X_test.head()

Unnamed: 0,0,1,Betweeness centrality,Number common neighbours,Jaccard coefficienf,Difference in inlinks coefficient,Number of times to cited,Same cluster,Diff publication,Number same authors,Self citation,Same journal,Authors betweeness,Authors common neighbors,Authors jaccard,Authors max difference in inlinks,Authors sum difference in inlinks,Authors max of times to cited,Authors sum of times to cited,Authors of times to cited
315892,9610162,9512062,-0.006482,3,0.008746,207,246,0,2,0,0,0,0.898126,1259,0.201182,-3944,-3944,1226,1226,1226.0
98630,9802073,9704165,0.00054,7,0.097222,11,16,0,1,0,0,0,1.022651,622,0.09992,-5010,-4857,313,160,156.5
185113,9208069,9512054,-0.000108,0,0.0,1,4,0,-3,0,0,0,0.018282,110,0.133333,-130,-92,174,136,87.0
600202,9810239,9407087,-0.033815,14,0.014156,945,965,1,4,0,0,0,-0.299414,439,0.148915,2439,4356,4692,2586,2346.0
13082,301150,208148,-0.000252,17,0.293103,18,23,1,1,0,0,0,-0.116017,59,0.045004,618,1676,1689,631,365.5


# Classifier

In [14]:
def score(pred, real):
    """Return the accuracy: the fraction of positions where ``pred`` equals ``real``.

    ``real`` is iterated in order and ``pred`` is indexed by position
    (``pred[i]``). Raises ``ZeroDivisionError`` when ``real`` is empty.
    """
    matches = sum(
        1 for position, expected in enumerate(real) if pred[position] == expected
    )
    return float(matches) / len(real)

In [15]:
# Number of training labels left after the validation split.
y_train.shape

(492409,)

In [16]:
%%time 
from sklearn.ensemble import RandomForestClassifier
# Random forest alternative, currently disabled: only the import above runs.
# Columns 0 and 1 (the paper ids) are dropped before fitting and predicting.
#rfc = RandomForestClassifier(n_estimators = 100,n_jobs=3)
#rfc.fit(X_train.drop([0,1], axis = 1), y_train)
#pred = rfc.predict(X_test.drop([0,1], axis = 1))

CPU times: user 22.2 ms, sys: 0 ns, total: 22.2 ms
Wall time: 1.02 s


In [17]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators = 600)
gbc.fit(X_train.drop([0,1], axis = 1), y_train)
pred = gbc.predict(X_test.drop([0,1], axis = 1))

CPU times: user 38min 8s, sys: 15.9 s, total: 38min 24s
Wall time: 38min 35s


In [18]:
# Accuracy on the held-out pairs first, then on the training pairs (the gap
# between the two gives an idea of the over-fitting).
print(score(pred, y_test))
print(score(gbc.predict(X_train.drop([0, 1], axis=1)), y_train))

0.955045774676
0.968124059471


In [19]:
def make_submission(predicted_label, name = 'submit.csv'):
    """Write the predictions to a CSV submission file and return the frame.

    Parameters
    ----------
    predicted_label : array-like
        One predicted label per row, in submission order. Values are cast to
        int.
    name : str
        Path of the CSV file to write.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``id`` (1, 2, ..., n) and ``category``, in
        that order, as written to ``name``.
    """
    # Work on a plain array: a Series carrying its own index (e.g. a shuffled
    # validation split) would otherwise be aligned on that index instead of
    # on the row position.
    labels = np.asarray(predicted_label).astype(int)

    # The ids are derived from the predictions themselves rather than from
    # the global X_test, so both columns always have the same length.
    submit = pd.DataFrame(
        {'id': np.arange(1, len(labels) + 1).astype(int), 'category': labels},
        columns=['id', 'category'],
    )
    submit.to_csv(name, index=False)
    return submit
