## SetUp directories

In [58]:
import os
data_directory = '../data/'
if(not os.path.exists((data_directory))):
     os.makedirs(data_directory)

## SetUp Parameters

In [59]:
corpus_file = 'corpus_check_long.csv'
corpus_path = data_directory + corpus_file
# We will create a temporary file with the results of the preprocessing this file will be deleted after 
#the execution of the script
temp_file_eval = "../data/evalFile.txt"

# File Name where we will store the training data 
train_path = data_directory + 'trainFile.txt'

# File name where we will store the evaluation data
eval_file = data_directory + 'eval.csv'

# Name of the column storing the article 
article = 'corpus'

# Preprocessing

In [60]:
import pandas as pd
df = pd.read_csv(corpus_path)

In [61]:
df

Unnamed: 0,siren,legal_name,id,url_article,corpus
0,419838529,IPSEN,8,http://www.boursier.com/actions/actualites/new...,Ipsen lorgne les peptides de PeptiMimesis\n
1,419838529,IPSEN,2894,http://www.boursier.com/actions/actualites/new...,Ipsen : accord important avec Probi\n \n\npubl...
2,419838529,IPSEN,3057,http://www.boursier.com/actions/actualites/new...,La médecine générale d'Ipsen en panne au 1er t...
3,419838529,IPSEN,4208,http://www.cerclefinance.com/default.asp?pub=v...,"Bourse en ligne : Information boursiere, Econo..."
4,419838529,IPSEN,5284,http://www.cerclefinance.com/default.asp?pub=v...,"Bourse en ligne : Information boursiere, Econo..."
...,...,...,...,...,...
64289,803894872,MDP,76494,http://www.lyonpoleimmo.com/2019/10/22/56480/b...,Beynost : Maxon lance la réalisation de son ce...
64290,834289373,LAGARDERE MEDIA NEWS,76497,https://www.lexpress.fr/actualite/medias/herve...,Hervé Gattegno est nommé directeur général des...
64291,813956265,E-VALLEY SERVICES 2,76506,https://www.lavoixdunord.fr/654836/article/201...,\n Vendredi 08 novembre 2019 Consulter le jour...
64292,440088938,SENVION FRANCE,76507,https://www.greenunivers.com/2019/10/siemens-g...,Siemens Gamesa reprend les services de Senvion...


In [62]:
indexNames = []
def get_corrupt_data(df):
    for counter,data in enumerate(df.iterrows()):
        i, row = data
        tmp = df.corpus[i]
        if ("�") in tmp:
            indexNames.append(i)
get_corrupt_data(df)      
df.drop(indexNames , inplace=True)

## Filter : keep only companies that have at least 7 articles, and their 


In [63]:
#Build list of companies that have more then 7 articles in the corpora
top = df["siren"].value_counts()
top = top.where(top>=7).dropna()
topList = list(top.index)
df = df[df["siren"].isin(topList)]

## Filter: discard articles that are longer than 1,000,000 characters

In [64]:
df = df[df[article].astype(str).map(len)<1000000]
df

Unnamed: 0,siren,legal_name,id,url_article,corpus
0,419838529,IPSEN,8,http://www.boursier.com/actions/actualites/new...,Ipsen lorgne les peptides de PeptiMimesis\n
1,419838529,IPSEN,2894,http://www.boursier.com/actions/actualites/new...,Ipsen : accord important avec Probi\n \n\npubl...
2,419838529,IPSEN,3057,http://www.boursier.com/actions/actualites/new...,La médecine générale d'Ipsen en panne au 1er t...
3,419838529,IPSEN,4208,http://www.cerclefinance.com/default.asp?pub=v...,"Bourse en ligne : Information boursiere, Econo..."
4,419838529,IPSEN,5284,http://www.cerclefinance.com/default.asp?pub=v...,"Bourse en ligne : Information boursiere, Econo..."
...,...,...,...,...,...
58371,489682005,ZENIKA,49051,https://emploibreizh.bzh/job/consultant-devops...,Website unavailable\n \n ...
58372,489682005,ZENIKA,53702,https://emploibreizh.bzh/job/developpeur-front...,Website unavailable\n \n ...
58373,489682005,ZENIKA,57506,https://www.fusacq.com/buzz/isatis-capital-ent...,"ISATIS CAPITAL entre au capital de ZENIKA, FUS..."
58374,489682005,ZENIKA,71582,https://uxjobs.fr/job/3428-ux-designer-h-f-lyon,[CDI] UX Designer H/F Lyon - Zenika - UXJobs.f...


## Filter: discard articles that are longer than 100 words


In [65]:
import re
import contractions
import string
from nltk.tokenize import sent_tokenize 

translator = str.maketrans(' ', ' ', string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [66]:
def cleaning(doc):
    doc = doc.replace('\n', ' ')
    doc = doc.replace('\r\n', ' ')
    doc = doc.replace('\r', ' ')
    doc = doc.replace('\t', ' ')
    return doc 
def remove_numbers(doc):
    doc = re.sub("\d+", "", doc)
    doc = doc.replace('m€', '')
    doc = doc.replace('k€', '')   
    return doc

In [67]:
temp_train_name = 'dataTrain'
temp_eval_name = 'dataEval'
# Tokenize text
def preprocessing(doc,train=False):        
        # Remove «»
        doc = doc.replace("«", " ")
        doc = doc.replace("»", " ")

        # To lowercase 
        doc = doc.lower()
        
        # Remove url's
        doc = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', doc, flags=re.MULTILINE)
        
        # Cleaning
        doc = cleaning(doc)
        
        # Remove numbers
        doc = remove_numbers(doc)
        
    
        # Remove multiple wite spaces 
        doc = re.sub(' +', ' ',doc)
        
        # Remove unicode breaking character
        doc = doc.replace(u'\xa0', u' ')
        
        if train: 
            result = []
            sentences = sent_tokenize(doc)
            for sent in sentences: 
                   # Remove punctuation
                sent = sent.translate(translator)
                sent += "\n"
                result.append(sent)
            return "".join(result)
        else:
            doc += "\n"
            return doc 

def preprocess_and_write_to_file(dataframe,train,index=0):
    if(train):
        fileName = temp_train_name
    else:
        fileName = temp_eval_name
    f = codecs.open(fileName + str(index) + '.txt' , 'w', 'utf-8')
    for counter,data in enumerate(dataframe.iterrows()):
        i, row = data
        if(counter%5000==0):
            print("Thread " + str(index) + "processed " + str(counter) + "/" + str(dataframe.count()))
        preprocessed_text = preprocessing((row[article]),train)
        f.write(preprocessed_text)  # python will convert \n to os.linesep
    f.close()  

We want to create two files: one for training which will consist of each sentence of each document per line and 
an eval file which will be in csv format containing the name of the article, the url it originated from and the 
preprocessed article itself.

In [68]:
%%time
train_ = [True,False]
import codecs
import multiprocessing
import numpy as np
chunks = np.array_split(df,3)
manager = multiprocessing.Manager()
threads = []
for train in train_:
    for index,chunk in enumerate(chunks):
        thread = multiprocessing.Process(target=preprocess_and_write_to_file, args=(chunk,train,index))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()

Thread 0processed 0/siren          8510
legal_name     8510
id             8510
url_article    8510
corpus         8510
dtype: int64
Thread 1processed 0/siren          8509
legal_name     8509
id             8509
url_article    8509
corpus         8509
dtype: int64
Thread 2processed 0/siren          8509
legal_name     8509
id             8509
url_article    8509
corpus         8509
dtype: int64
Thread 1processed 5000/siren          8509
legal_name     8509
id             8509
url_article    8509
corpus         8509
dtype: int64
Thread 2processed 5000/siren          8509
legal_name     8509
id             8509
url_article    8509
corpus         8509
dtype: int64
Thread 0processed 5000/siren          8510
legal_name     8510
id             8510
url_article    8510
corpus         8510
dtype: int64
Thread 0processed 0/siren          8510
legal_name     8510
id             8510
url_article    8510
corpus         8510
dtype: int64
Thread 1processed 0/siren          8509
legal_name     8509


In [69]:
import subprocess
subprocess.check_output(["cat " + temp_train_name + "*.txt" + ' > ' + train_path],shell=True)
subprocess.check_output(["cat " + temp_eval_name + "*.txt" + ' > ' + temp_file_eval],shell=True)
subprocess.check_output(["rm data*.txt"],shell=True)

b''

Creating the Eval csv file

In [70]:
def read_file(path):
    with open(path) as f:
        content = f.readlines()
    return content
    

In [73]:
data = read_file(temp_file_eval)

In [74]:
subprocess.run(["rm", temp_file_eval])

CompletedProcess(args=['rm', '../data/evalFile.txt'], returncode=0)

In [75]:
df['preprocessedCorpus'] = data
del df['id']
del df['corpus']
df = df.rename({'preprocessedCorpus': 'corpus'}, axis='columns')
df.to_csv(eval_file)

In [76]:
df

Unnamed: 0,siren,legal_name,url_article,corpus
0,419838529,IPSEN,http://www.boursier.com/actions/actualites/new...,ipsen lorgne les peptides de peptimimesis \n
1,419838529,IPSEN,http://www.boursier.com/actions/actualites/new...,ipsen : accord important avec probi publié le ...
2,419838529,IPSEN,http://www.boursier.com/actions/actualites/new...,la médecine générale d'ipsen en panne au er tr...
3,419838529,IPSEN,http://www.cerclefinance.com/default.asp?pub=v...,"bourse en ligne : information boursiere, econo..."
4,419838529,IPSEN,http://www.cerclefinance.com/default.asp?pub=v...,"bourse en ligne : information boursiere, econo..."
...,...,...,...,...
58371,489682005,ZENIKA,https://emploibreizh.bzh/job/consultant-devops...,website unavailable this site is currently sus...
58372,489682005,ZENIKA,https://emploibreizh.bzh/job/developpeur-front...,website unavailable this site is currently sus...
58373,489682005,ZENIKA,https://www.fusacq.com/buzz/isatis-capital-ent...,"isatis capital entre au capital de zenika, fus..."
58374,489682005,ZENIKA,https://uxjobs.fr/job/3428-ux-designer-h-f-lyon,[cdi] ux designer h/f lyon - zenika - uxjobs.f...
