# Loading Data and Libraries

In [1]:
from google.colab import drive 
drive.mount('/content/gdrive/', force_remount=True) 

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive/


In [2]:
cd '/content/gdrive/My Drive/Altegrad' 

/content/gdrive/My Drive/Altegrad


In [3]:
!pip install unidecode 

Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |█▍                              | 10kB 18.4MB/s eta 0:00:01[K     |██▊                             | 20kB 2.1MB/s eta 0:00:01[K     |████▏                           | 30kB 3.1MB/s eta 0:00:01[K     |█████▌                          | 40kB 2.0MB/s eta 0:00:01[K     |██████▉                         | 51kB 2.5MB/s eta 0:00:01[K     |████████▎                       | 61kB 3.0MB/s eta 0:00:01[K     |█████████▋                      | 71kB 3.4MB/s eta 0:00:01[K     |███████████                     | 81kB 3.9MB/s eta 0:00:01[K     |████████████▍                   | 92kB 4.3MB/s eta 0:00:01[K     |█████████████▊                  | 102kB 3.3MB/s eta 0:00:01[K     |███████████████▏                | 112kB 3.3MB/s eta 0:00:01[K     |████████████████▌               | 122kB 3.3MB/

In [42]:
import codecs
import sys
import csv
from unidecode import unidecode
import pandas as pd
import numpy as np
import string
import networkx as nx
import scipy.sparse as sp
import nltk
nltk.download('stopwords')
from sklearn.metrics import log_loss

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('french'))

data_path = "./text/"
edgelist_path = "./data/edgelist.txt"

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Loading

In [0]:
def build_graph():
    '''Function that build a directed weighted graph from the edgelist.txt'''
    G = nx.read_weighted_edgelist(edgelist_path, create_using=nx.DiGraph())
    print("Number of nodes : ", G.number_of_nodes())
    print("Number of edges : ", G.number_of_edges())
    return G

def build_train_test(train_path, test_path):
    """Function that reads the train.csv and returns the train Ids and train labels
        and reads the test.csv and returns the test Ids
    """
    with open(train_path, 'r') as f:
        train_data = f.read().splitlines()
        
    train_hosts = list()
    y_train = list()
    for row in train_data:
        host, label = row.split(",")
        train_hosts.append(host)
        y_train.append(label.lower())
        
    df_train = pd.DataFrame(data= y_train, index = train_hosts, columns= ["class"]).reset_index()
    
    with open(test_path, 'r') as f:
        test_hosts = f.read().splitlines()
    df_test =  pd.DataFrame(data=[] , index = test_hosts, columns= ["class"]).reset_index()
    return df_train, df_test

def write_submission(write_path, test_hosts, model_classes_list, predicted_probas):
    """Function that writes the submission file
  there is a need to be pass it  : 
    - The path of the file to create
    - The test Ids (returned by build_train_test)
    - The classes labels as a list
    - The predicted probas for those class labels (same order)
    """
    with open(write_path, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        model_classes_list.insert(0, "Host")
        writer.writerow(model_classes_list)
        for i,test_host in enumerate(test_hosts):
            lst = predicted_probas[i,:].tolist()
            lst.insert(0, test_host)
            writer.writerow(lst)

def text_from_id(id):
    id = str(id)
    try :
        with codecs.open(data_path+id, 'r', encoding="utf-8") as f:
            text = f.readlines()
    except:
        with codecs.open(data_path+id, 'r', encoding="latin-1") as f:
            text = f.readlines()
    return text

def build_local_test(train_hosts, y_train, size_local_test=.25):
    
    local_train, local_test, local_y_train, local_y_test = train_test_split(train_hosts, y_train,
                                                                            stratify=y_train, 
                                                                            test_size=size_local_test)
    
    return local_train, local_y_train, local_test, local_y_test

def compute_score(predictions, y_true, classes_order):
    dico = {v:k for k, v in enumerate(classes_order)}
    print(dico)
    loss = 0
    for i, cla in enumerate(y_true) :
        loss -= np.log(predictions[i, dico[cla]])
    loss = loss/len(y_true)
    return loss

In [0]:
test_data = pd.read_csv('./embeds/' + 'test.csv', header = None)
test_data.columns = ['File']

### Use Bert Vectors

In [44]:
train_camembert = pd.read_csv('./embeds/embeds_taha.csv') 
test_camembert = pd.read_csv('./embeds/test_camembert.csv') 

train_camembert.head() 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,target
0,0.004979,0.051084,0.011548,0.144239,-0.054896,-0.091976,-0.079737,0.158545,-0.063696,0.029545,-0.028441,0.035867,-0.028134,0.150745,0.162346,-0.035347,-0.001045,-0.116933,0.1371,-0.052911,0.046483,-0.012581,0.034688,-0.209713,0.126438,-0.194954,-0.005686,0.05549,-0.018563,0.043815,0.024498,-0.170535,0.127708,0.183329,-0.058293,0.028888,0.003619,0.127561,0.032685,-0.005017,...,0.112802,0.088039,0.314132,0.163118,-0.062759,-0.075335,0.011383,0.013771,-0.061509,0.046618,-0.003913,-0.064163,0.05393,0.042424,0.023481,-0.163376,-0.105974,0.039036,-0.039118,-0.163902,0.032288,0.022957,0.030852,-0.239209,0.09117,-0.01587,-0.105228,-0.125584,-0.003099,-0.039667,-0.042257,0.001405,0.040467,0.06253,0.063022,-0.122656,-0.116885,0.005459,0.004883,3
1,-0.008539,0.196301,-0.017056,-0.081347,0.046665,-0.032499,0.080598,0.02548,-0.037952,0.034152,0.106114,0.201544,-0.049484,0.05949,0.09946,-0.075301,0.074141,-0.06583,-0.072313,0.040387,0.025943,-0.075173,-0.10821,-0.191474,0.446738,-0.188546,0.064377,-0.072099,0.023127,-0.110718,-0.130323,-0.093393,-0.063916,0.146621,0.045616,-0.069296,-0.204344,0.036896,0.108239,-0.230537,...,0.11354,-0.105509,0.108981,0.152414,-0.000489,-0.108477,0.171643,-0.01805,-0.030431,-0.08014,-0.00785,0.093457,-0.027985,0.041688,-0.066457,-0.049049,0.087364,0.104405,-0.057261,0.004004,-0.134647,-0.03834,0.125589,-0.069518,0.03281,-0.129408,0.00243,-0.167985,-0.183271,0.027825,-0.048238,-0.06793,0.172509,-0.102032,0.005831,0.000533,-0.095894,-0.10714,-0.166406,2
2,0.014586,0.010158,0.019481,0.139028,-0.069928,-0.046199,-0.043685,0.151316,-0.040847,0.036641,0.029062,0.051212,-0.044985,0.119287,0.193056,-0.06069,-0.070176,-0.121432,0.084575,-0.064783,0.040374,-0.04405,0.091162,-0.18224,0.132107,-0.152883,-0.091903,0.03484,0.024752,-0.013717,-0.032962,-0.166242,0.136357,0.15513,-0.029583,0.049545,-0.060612,0.123955,0.027059,-0.036924,...,0.097372,0.096647,0.300638,0.132141,-0.123732,-0.007762,0.035959,0.007439,-0.066337,-0.012342,-0.004053,-0.046152,0.135284,0.022371,0.024677,-0.114882,-0.071554,0.016102,-0.011974,-0.173243,0.043724,0.024285,0.064807,-0.211291,0.103848,-0.010146,-0.137891,-0.148035,-0.007298,-0.061166,0.022263,-0.050842,0.149352,0.073157,0.062471,-0.108098,-0.133127,0.026859,-0.020756,2
3,0.009449,0.057803,-0.032411,0.181052,-0.023205,-0.077158,-0.088522,0.112174,-0.064386,0.011736,-0.007792,0.100436,0.00339,0.168092,0.165106,-0.055701,-0.022107,-0.087624,0.058423,-0.026992,0.035437,-0.050085,0.067647,-0.218709,0.148364,-0.155343,-0.00127,0.080315,-0.040227,-0.052534,0.019631,-0.110544,0.10182,0.177629,-0.008147,0.044479,0.016035,0.097301,0.02391,-0.030328,...,0.066725,0.087909,0.215426,0.13586,-0.078239,-0.063461,0.016133,0.024904,-0.032214,-0.042499,-0.012666,0.030743,0.090259,0.012712,0.034152,-0.061917,-0.04308,0.021741,-0.033634,-0.210305,0.038935,0.029853,0.004078,-0.209901,0.028864,-0.020685,-0.116714,-0.089824,-0.000962,-0.069954,-0.040098,0.052144,0.01468,0.06952,0.018773,-0.12043,-0.096468,0.036334,0.023615,1
4,-0.01832,0.008598,-0.035451,0.050254,-0.067355,-0.03581,-0.083052,0.092934,-0.036804,0.049422,0.022433,0.036013,0.020027,0.106636,0.179138,-0.093071,0.029542,-0.083329,0.082547,-0.071713,0.012202,-0.013264,0.0113,-0.291337,0.254224,-0.155738,-0.058512,0.0577,0.0067,-0.112091,0.014196,-0.116371,0.065155,0.173727,0.02628,-0.047481,-0.016186,0.13153,0.008204,-0.097958,...,0.09671,0.059853,0.171077,0.128387,-0.065607,-0.015473,0.089697,0.011542,-0.057754,0.037398,-0.001983,0.005053,0.060829,0.016896,0.042919,-0.144442,0.004372,0.075883,-0.037403,-0.142559,-0.002616,-0.017627,0.090729,-0.196299,0.055102,-0.073774,-0.033602,-0.132747,-0.059542,0.019749,-0.002796,-0.067205,0.100744,0.075183,0.039977,-0.04893,-0.125256,0.028781,-0.059022,7


In [0]:
idx_exceptions = np.where(train_camembert.mean(axis=1) == 0.)[0]

for idx in idx_exceptions: 
    train_camembert.iloc[idx] = train_camembert.iloc[idx + 5] 

In [0]:
import logging
logging.getLogger("pytorch_transformers.tokenization_utils").setLevel(logging.ERROR)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import make_scorer

In [0]:
def loglikelihood_score(y_true, predictions, classes_order):
    dic = {v:k for k, v in enumerate(classes_order)}
    loss = 0
    for i, cls in enumerate(y_true) :
        loss -= np.log(predictions[i, dic[cls]])
    loss = loss/len(y_true)
    return loss

In [91]:
X_train = train_camembert.iloc[:,:-1] 
y_train = train_camembert.target 
X_test = test_camembert.iloc[:, :]

X_train = X_train.values 
y_train = y_train.values 
X_test = X_test.values

X_train.shape, y_train.shape, X_test.shape

((1994, 768), (1994,), (560, 768))

In [0]:
X_1, X_2, Y_1, Y_2 = train_test_split(X_train, y_train, test_size = 0.2)

In [58]:
grid={"C":np.logspace(-1,3, num = 30)} 

logreg = LogisticRegression(solver='lbfgs',  multi_class='auto', max_iter=25000, n_jobs=-1) 

classes_order = LogisticRegression(solver='lbfgs',  multi_class='auto').fit(X_1[:, :2], Y_1).classes_ 
score_function = make_scorer(loglikelihood_score, greater_is_better=False, classes_order=classes_order, needs_proba=True) 

logreg_cv = GridSearchCV(logreg, grid, cv=3, verbose=3, n_jobs=-1, scoring=score_function) 
logreg_cv.fit(X_1, Y_1) 

print(logreg_cv.best_params_) 
print('Grid Search best score : ', logreg_cv.best_score_) 
print('Score on test', logreg_cv.score(X_2, Y_2) )

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  6.0min finished


{'C': 1.743328822199988}
Grid Search best score :  -1.2459555009497325
Score on test -1.3047290165347236


In [59]:
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestClassifier 

# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [20, 40, 60],
    'max_features': [2, 5, 10, 15, 20],
    'n_estimators': [100, 200, 300, 1000]
} 

# Create a based model
rf = RandomForestClassifier() 

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, scoring=score_function,
                          cv = 3, n_jobs = -1, verbose = 3) 

grid_search.fit(X_train, y_train) 

print(grid_search.best_params_) 
print('Score of Grid Search : ', grid_search.best_score_) 
print('Score on test', grid_search.score(X_2, Y_2) )

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 10.7min finished
  array_means[:, np.newaxis]) ** 2,


{'max_depth': 20, 'max_features': 2, 'n_estimators': 100}
Score of Grid Search :  -inf
Score on test -0.3673568805550964


In [60]:
clf = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=6, min_child_weight=11, missing=-999, n_estimators=1000,
             n_jobs=1, nthread=4, objective='multi:softprob', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1337,
             subsample=0.8, verbosity=1)
clf.fit(X_train, y_train)

print('Score on the Test set:',  clf.score(X_2, Y_2))

Score on the Test set: 0.9674185463659147


In [0]:
from sklearn.metrics import accuracy_score

In [69]:
print('XGBoost score', accuracy_score(Y_2, clf.predict(X_2)))
print('LR score', accuracy_score(Y_2, logreg_cv.best_estimator_.predict(X_2)))
print('RF score', accuracy_score(Y_2, grid_search.best_estimator_.predict(X_2)))

XGBoost score 0.9674185463659147
LR score 0.543859649122807
RF score 0.9699248120300752


In [0]:
print('XGBoost score', log_loss(Y_2, clf.predict_proba(X_2)))
print('Score of RF : ', log_loss(Y_2, logreg_cv.best_estimator_.predict_proba(X_2)))
print('Score of LR :', log_loss(Y_2, grid_search.best_estimator_.predict_proba(X_2)))

In [0]:
# Write predictions to a file
classes = ['business/finance','education/research','entertainment',
  'health/medical','news/press','politics/government/law','sports','tech/science']

write_submission("./embeds/test_submission_xgb_2.csv", 
                 list(test_data["File"]),
                 model_classes_list=classes,
                 predicted_probas= logreg_cv.best_estimator_.predict_proba(X_test))

## Using Other features

In [0]:
import pickle 
from sklearn.preprocessing import OrdinalEncoder 

In [0]:
train_data = pd.read_csv('./embeds/' + 'train_noduplicates.csv', header = None) 
train_data.columns = ['File', 'Type'] 

test_data = pd.read_csv('./embeds/' + 'test.csv', header = None) 
test_data.columns = ['File'] 

enc = OrdinalEncoder() 
X = train_data['Type'] 
labels = enc.fit_transform(np.array(X).reshape(-1,1)) 
train_data['Labels'] = labels 

In [74]:
with open('./embeds/doc_vocab_embed.pickle', 'rb') as handle:
  vocab_embedding_docs = pickle.load(handle)

len(vocab_embedding_docs) 

2555

In [77]:
my_list = [] 
X = [] 
y = [] 
for element in vocab_embedding_docs.keys():
  try:
    if len(vocab_embedding_docs[element]) == 300:
      y_t = train_data[train_data['File'] == int(element)]['Labels'].iloc[0]
      y.append(y_t)
      X.append(vocab_embedding_docs[element])
  except:
    my_list.append(element)

X = np.vstack(X)
y = np.array(y).reshape(-1, 1)

X_train = pd.DataFrame(X)
Y_train = y.ravel()

X_train.shape, Y_train.shape

((1994, 300), (1994,))

In [0]:
X_1, X_2, Y_1, Y_2 = train_test_split(X_train, y_train, test_size = 0.2)

In [0]:
grid={"C":np.logspace(-1,3, num = 30)}

logreg = LogisticRegression(solver='lbfgs',  multi_class='auto', max_iter=25000, n_jobs=-1)

classes_order = LogisticRegression(solver='lbfgs',  multi_class='auto').fit(x_train[:, :2], y_train).classes_
score_function = make_scorer(loglikelihood_score, greater_is_better=False, classes_order=classes_order, needs_proba=True)

logreg_cv = GridSearchCV(logreg,grid,cv=3, verbose=3, n_jobs=-1, scoring=score_function)

logreg_cv.fit(X_1, Y_1)

print(logreg_cv.best_params_)
print('Score on the local test : ', logreg_cv.best_score_)

In [81]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [20, 40, 60, 80, 100, 120]
}

# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, scoring=score_function,
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_1, Y_1)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   20.6s finished
  array_means[:, np.newaxis]) ** 2,


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [0]:
print('Score of RF : ', accuracy_score(Y_2, grid_search.best_estimator_.predict_proba(X_2)))
print('Score of LR :', accuracy_score(Y_2, logreg_cv.best_estimator_.predict_proba(X_2)))

In [89]:
print('Score of RF : ', log_loss(Y_2, grid_search.best_estimator_.predict_proba(X_2)))
print('Score of LR :', log_loss(Y_2, logreg_cv.best_estimator_.predict_proba(X_2)))

Score of RF :  1.821928841396987
Score of LR : 1.821928841396987


In [0]:
# Write predictions to a file
classes = ['business/finance','education/research','entertainment',
  'health/medical','news/press','politics/government/law','sports','tech/science']

write_submission("./Data/test_submission_rf_1.csv", 
                 list(test_data["File"]), 
                 model_classes_list=classes, 
                 predicted_probas=grid_search.best_estimator_.predict_proba(test_camembert))