In [1]:
import numpy as np
import pandas as pd
import requests
import json
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt 

## Import elastic V0 data

In [2]:
df_test = pd.read_csv("./data/results_matomo_v0.csv")

In [3]:
df_test = df_test.drop(columns=['url', 'Status', 'Commentaire'], axis=1)

In [4]:
df_test.head(3)

Unnamed: 0,terms,siren,results_elastic,pages_elastic,siren_elastic,resp_time_elastic,results_postgres,pages_postgres,siren_postgres,resp_time_postgres,results_postges,pages_postges,rank_elastic,rank_postgres
0,stellantis,879786085,1353,68,"['879786085', '803902485', '879147148', '34219...",0.063221,1,1,['879786085'],0.042676,1,1,0,0
1,STELLANTIS,879786085,1353,68,"['879786085', '803902485', '879147148', '34219...",0.146982,1,1,['879786085'],0.065058,1,1,0,0
2,Stellantis,879786085,1353,68,"['879786085', '803902485', '879147148', '34219...",0.084343,1,1,['879786085'],0.042565,1,1,0,0


In [5]:
df_test_2 = pd.read_csv("./data/results_nginx_v0.csv")

In [6]:
df_test_2 = df_test_2.drop(columns=['url_post', 'url_elastic', 'Google', 'Pappers', 'siret', 'degree of condifence'\
                                    ], axis=1)

In [7]:
df_test_2.head(3)

Unnamed: 0,terms,siren,results_elastic,pages_elastic,siren_elastic,resp_time_elastic,results_postgres,pages_postgres,siren_postgres,resp_time_postgres,results_postges,pages_postges,rank_elastic,rank_postgres
0,club mediterranee,572185684,90,5,"['379834906', '782615843', '482003191', '84191...",0.217592,84,5,"['782615843', '394049233', '514659853', '48200...",0.095049,84,5,-1,-1
1,emmaus communaute,304233505,111,6,"['538548157', '304233505', '311686497', '34015...",0.13071,45,3,"['342642071', '399498690', '340155571', '78590...",0.051059,45,3,1,12
2,emmaus communaute,782901599,111,6,"['538548157', '304233505', '311686497', '34015...",0.141893,45,3,"['342642071', '399498690', '340155571', '78590...",0.063611,45,3,14,17


In [8]:
df_test_2.columns

Index(['terms', 'siren', 'results_elastic', 'pages_elastic', 'siren_elastic',
       'resp_time_elastic', 'results_postgres', 'pages_postgres',
       'siren_postgres', 'resp_time_postgres', 'results_postges',
       'pages_postges', 'rank_elastic', 'rank_postgres'],
      dtype='object')

In [9]:
df_test.columns

Index(['terms', 'siren', 'results_elastic', 'pages_elastic', 'siren_elastic',
       'resp_time_elastic', 'results_postgres', 'pages_postgres',
       'siren_postgres', 'resp_time_postgres', 'results_postges',
       'pages_postges', 'rank_elastic', 'rank_postgres'],
      dtype='object')

In [10]:
df_test.shape

(739, 14)

In [11]:
df_test_2.shape

(189, 14)

In [12]:
df_test = pd.concat([df_test, df_test_2])

In [13]:
df_test.shape

(928, 14)

In [14]:
df_test.dtypes

terms                  object
siren                  object
results_elastic         int64
pages_elastic           int64
siren_elastic          object
resp_time_elastic     float64
results_postgres        int64
pages_postgres          int64
siren_postgres         object
resp_time_postgres    float64
results_postges         int64
pages_postges           int64
rank_elastic            int64
rank_postgres           int64
dtype: object

In [15]:
df_test.head(3)

Unnamed: 0,terms,siren,results_elastic,pages_elastic,siren_elastic,resp_time_elastic,results_postgres,pages_postgres,siren_postgres,resp_time_postgres,results_postges,pages_postges,rank_elastic,rank_postgres
0,stellantis,879786085,1353,68,"['879786085', '803902485', '879147148', '34219...",0.063221,1,1,['879786085'],0.042676,1,1,0,0
1,STELLANTIS,879786085,1353,68,"['879786085', '803902485', '879147148', '34219...",0.146982,1,1,['879786085'],0.065058,1,1,0,0
2,Stellantis,879786085,1353,68,"['879786085', '803902485', '879147148', '34219...",0.084343,1,1,['879786085'],0.042565,1,1,0,0


In [16]:
df_test.shape

(928, 14)

## Add results V1 (avec enseignes et adresses enseignes)

In [17]:
def find(key, dictionary):
    for k, v in dictionary.items():
        if k == key:
            yield v
        elif isinstance(v, dict):
            for result in find(key, v):
                yield result
        elif isinstance(v, list):
            for d in v:
                for result in find(key, d):
                    yield result

In [18]:
def get_response(url, q):
    params['q'] = q
    response = requests.get(url, params=params) 
    time_elapsed = response.elapsed.total_seconds()
    content = json.loads(response.content)
    total_results = content[0]['total_results']
    total_pages = content[0]['total_pages']
    siren_list = list(find('siren', content[0]))
    return total_results, total_pages, siren_list, time_elapsed

In [19]:
url_elastic = "http://api.sirene.dataeng.etalab.studio/search"

In [20]:
params = {'q':'',
         'page' : '1',
         'per_page' : '20'}

In [21]:
df_test['results_elastic_1'], df_test['pages_elastic_1'], df_test['siren_elastic_1'],\
df_test['resp_time_elastic_1']\
= "", "", "", ""

In [22]:
df_test

Unnamed: 0,terms,siren,results_elastic,pages_elastic,siren_elastic,resp_time_elastic,results_postgres,pages_postgres,siren_postgres,resp_time_postgres,results_postges,pages_postges,rank_elastic,rank_postgres,results_elastic_1,pages_elastic_1,siren_elastic_1,resp_time_elastic_1
0,stellantis,879786085,1353,68,"['879786085', '803902485', '879147148', '34219...",0.063221,1,1,['879786085'],0.042676,1,1,0,0,,,,
1,STELLANTIS,879786085,1353,68,"['879786085', '803902485', '879147148', '34219...",0.146982,1,1,['879786085'],0.065058,1,1,0,0,,,,
2,Stellantis,879786085,1353,68,"['879786085', '803902485', '879147148', '34219...",0.084343,1,1,['879786085'],0.042565,1,1,0,0,,,,
3,air france,420495178,706,36,"['420495178', '314119504', '378006027', '77568...",0.271584,788,40,"['883454803', '824718910', '401867015', '84159...",0.255068,788,40,0,6,,,,
4,Air france,420495178,706,36,"['420495178', '314119504', '378006027', '77568...",0.067208,788,40,"['883454803', '824718910', '401867015', '84159...",0.104733,788,40,0,6,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,pontecaille gregoire,818452476,1,1,['818452476'],0.141171,1,1,['818452476'],0.039299,1,1,0,0,,,,
185,sherpa,393825229,2870,144,"['393825229', '801698242', '500913785', '33407...",0.103189,330,17,"['344857321', '484513593', '398939744', '33813...",0.071680,330,17,0,15,,,,
186,sherpa,500913785,2870,144,"['393825229', '801698242', '500913785', '33407...",0.137308,330,17,"['344857321', '484513593', '398939744', '33813...",0.070307,330,17,2,-1,,,,
187,mc animation,820552925,3,1,"['820552925', '440736965', '398015974']",0.047684,4,1,"['820552925', '440736965', '398015974', '50865...",0.047149,4,1,0,0,,,,


In [24]:
for index, row in df_test.iterrows():
    print(get_response(url_elastic, row['terms']))
    

(1353, 68, ['879786085', '803902485', '379118748', '752313940', '500019898', '879147148', '808518013', '342199841', '510173164', '348622044', '840125876', '441308194', '387493638', '800604696', '820525905', '315523308', '498029859', '824000327', '821469491', '818133845'], 0.273817)
(1353, 68, ['879786085', '803902485', '379118748', '752313940', '500019898', '879147148', '808518013', '342199841', '510173164', '348622044', '840125876', '441308194', '387493638', '800604696', '820525905', '315523308', '498029859', '824000327', '821469491', '818133845'], 0.083464)
(1353, 68, ['879786085', '803902485', '379118748', '752313940', '500019898', '879147148', '808518013', '342199841', '510173164', '348622044', '840125876', '441308194', '387493638', '800604696', '820525905', '315523308', '498029859', '824000327', '821469491', '818133845'], 0.08166)
(1129, 57, ['420495178', '632041042', '314119504', '378006027', '775685183', '379369465', '813041381', '813029949', '782741631', '785742685', '552043002

(83, 5, ['341737062', '410241657', '438912149', '788667913', '794697706', '813401148', '824441240', '392383683', '823893706', '521919514', '429009699', '477762686', '433857190', '421008822', '503189441', '504916966', '439908864', '494943582', '533263109', '841645419'], 0.088902)
(83, 5, ['341737062', '410241657', '438912149', '788667913', '794697706', '813401148', '824441240', '392383683', '823893706', '521919514', '429009699', '477762686', '433857190', '421008822', '503189441', '504916966', '439908864', '494943582', '533263109', '841645419'], 0.117075)
(83, 5, ['341737062', '410241657', '438912149', '788667913', '794697706', '813401148', '824441240', '392383683', '823893706', '521919514', '429009699', '477762686', '433857190', '421008822', '503189441', '504916966', '439908864', '494943582', '533263109', '841645419'], 0.085366)
(83, 5, ['341737062', '410241657', '438912149', '788667913', '794697706', '813401148', '824441240', '392383683', '823893706', '521919514', '429009699', '4777626

KeyboardInterrupt: 

In [None]:
df_test['results_elastic_1'] = df_test['results_elastic_1'].astype('int32')
df_test['pages_elastic_1'] = df_test['pages_elastic_1'].astype('int32')
df_test['resp_time_elastic_1'] = df_test['resp_time_elastic_1'].astype('float64')

In [None]:
df_test.describe()

## Ranks

In [None]:
df_test['rank_elastic_1'] =""

In [None]:
for ind, row in df_test.iterrows():
    if str(row['siren']) in row['siren_elastic_1']:
        df_test['rank_elastic_1'][ind] = row['siren_elastic_1'].index(str(row['siren']))
    else:
        df_test['rank_elastic_1'][ind] = -1

In [None]:
df_test['rank_elastic_1'] = df_test['rank_elastic_1'].astype('float64')

## KPIs

In [None]:
fig = px.histogram(df_test.sort_values(by=['rank_elastic_1']),\
                    x="rank_elastic_1",color_discrete_sequence=['indianred'],\
                    title="Distribution elastic des rangs du bon résultat")
fig.update_layout(bargap=0.5)
fig.update_xaxes(type='category')
fig.show()

In [None]:
x_elastic = df_test.sort_values(by=['rank_elastic'])['rank_elastic']
x_postgres = df_test.sort_values(by=['rank_postgres'])['rank_postgres']
x_elastic_1 = df_test.sort_values(by=['rank_elastic_1'])['rank_elastic_1']
fig = go.Figure()
fig.add_trace(go.Histogram(histfunc="count",
                          x=x_elastic_1,
                          name="Elasticsearch_1",
                          marker_color='#eb0e3e',))
fig.add_trace(go.Histogram(histfunc="count",
                         x=x_elastic,
                         marker_color='#EB89B5',
                         name="Elasticsearch"))
fig.add_trace(go.Histogram(histfunc="count",
                          x=x_postgres,
                          name="Postgres",
                          marker_color='#330C73',))

fig.update_layout(
    title_text='Fréquence des rangs des résultats de la recherche', # title of plot
    xaxis_title_text='Rang du résulat dans la page', # xaxis label
    yaxis_title_text='Nombre de requêtes', # yaxis label
    bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinates
)
fig.update_xaxes(type='category')
fig.show()