### Imports

In [34]:
import numpy as np
import pandas as pd

import requests
import json
import time

from ast import literal_eval

import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

# Matomo Logs

### Import data unprocessed

In [35]:
df_test = pd.read_csv("./data/matomo_logs_labelled.csv", dtype=str)

In [36]:
df_test.shape

(425, 8)

In [37]:
df_test.head(3)

Unnamed: 0,terms,siren,url,Status,Commentaire,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,"['stellantis', 'STELLANTIS', 'Stellantis']",879786085,https://annuaire-entreprises.data.gouv.fr/entr...,True,542 065 479 bizarre,,,
1,"['air france', 'Air france', 'AIR FRANCE', 'Ai...",420495178,https://annuaire-entreprises.data.gouv.fr/entr...,True,,,,
2,"['airbus operation', 'AIRBUS OPERATIONS', 'air...",420916918,https://annuaire-entreprises.data.gouv.fr/entr...,True,,,,


### Preprocessing

In [38]:
df_test.drop(columns=["Unnamed: 5", "Unnamed: 6", "Unnamed: 7"], axis=1, inplace=True)

In [39]:
df_test.head(1)

Unnamed: 0,terms,siren,url,Status,Commentaire
0,"['stellantis', 'STELLANTIS', 'Stellantis']",879786085,https://annuaire-entreprises.data.gouv.fr/entr...,True,542 065 479 bizarre


In [40]:
df_test.dtypes

terms          object
siren          object
url            object
Status         object
Commentaire    object
dtype: object

In [41]:
df_test["terms"] = df_test["terms"].apply(
    literal_eval
)  # convert to list type to use explode

In [42]:
df_test = df_test.explode("terms", ignore_index=True)

In [43]:
df_test.head(3)

Unnamed: 0,terms,siren,url,Status,Commentaire
0,stellantis,879786085,https://annuaire-entreprises.data.gouv.fr/entr...,True,542 065 479 bizarre
1,STELLANTIS,879786085,https://annuaire-entreprises.data.gouv.fr/entr...,True,542 065 479 bizarre
2,Stellantis,879786085,https://annuaire-entreprises.data.gouv.fr/entr...,True,542 065 479 bizarre


In [44]:
df_test.shape

(859, 5)

In [49]:
df_test = df_test[df_test["Status"] == "TRUE"]

In [50]:
df_test.shape

(739, 5)

In [51]:
df_test.to_csv("./data/matomo_logs_preprocessed.csv", header=True, index=False)

# NGINX Logs

### Import data unprocessed

In [52]:
df_test = pd.read_csv("./data/nginx_logs_labelled.csv", dtype=str)

In [53]:
df_test.shape

(16091, 8)

In [54]:
df_test.head(3)

Unnamed: 0,terms,url_post,url_elastic,Google,Pappers,siren,siret,degree of condifence
0,agence interim adecco,https://annuaire-entreprises.data.gouv.fr/rech...,https://test.annuaire-entreprises.data.gouv.fr...,https://www.google.com/search?client=firefox-b...,https://www.pappers.fr/recherche?q=agence+inte...,?,,
1,fondation adecco,https://annuaire-entreprises.data.gouv.fr/rech...,https://test.annuaire-entreprises.data.gouv.fr...,https://www.google.com/search?client=firefox-b...,https://www.pappers.fr/recherche?q=fondation+a...,?,,
2,club mediterranee,https://annuaire-entreprises.data.gouv.fr/rech...,https://test.annuaire-entreprises.data.gouv.fr...,https://www.google.com/search?client=firefox-b...,https://www.pappers.fr/recherche?q=club+medite...,572185684,,


In [56]:
df_test["siren"].value_counts()

?              100
343 056 958      5
380 129 866      4
852931450        2
408 192 391      2
              ... 
885 021 113      1
799 543 616      1
797 819 034      1
531 669 315      1
440736965        1
Name: siren, Length: 167, dtype: int64

In [57]:
df_test = df_test[(df_test["siren"] != "?") & df_test["siren"]]

In [58]:
df_test.shape

(189, 8)

In [60]:
df_test.dtypes

terms                   object
url_post                object
url_elastic             object
Google                  object
Pappers                 object
siren                   object
siret                   object
degree of condifence    object
dtype: object

In [61]:
df_test["siren"].value_counts()

343 056 958    5
380 129 866    4
708202759      2
810 414 243    2
130 011 455    2
              ..
799 543 616    1
797 819 034    1
531 669 315    1
403 891 062    1
440736965      1
Name: siren, Length: 166, dtype: int64

In [63]:
for index, row in df_test.iterrows():
    df_test["siren"][index] = df_test["siren"][index].replace(" ", "")

In [64]:
df_test["siren"].value_counts()

343056958    5
380129866    4
130011455    3
213702251    2
750527848    2
            ..
885021113    1
799543616    1
797819034    1
531669315    1
440736965    1
Name: siren, Length: 165, dtype: int64

In [66]:
df_test.to_csv("./data/nginx_logs_preprocessed.csv", header=True, index=False)