### The online store from which the EClog data was taken is Polish. Part of the EDA was translating Polish phrases to discover useful signifiers of a customer action. The cells below demonstrate the use of the Google Translate API (via `googletrans`) for this purpose.

In [1]:
import pandas as pd
from datetime import datetime
from googletrans import Translator
import re
import httpx
import csv

In [2]:
# Columns to read from the log CSV; any other column in the file is not loaded.
usecols = [
    'IpId', 'TimeStamp', 'HttpMethod', 'Uri',
    'ResponseCode', 'Bytes', 'Referrer', 'UserAgent',
]
eclog_1d = pd.read_csv('eclog_1day.csv', usecols=usecols)

#### The following methods could omit many possibly useful phrases. We will return to this issue at a later date if deemed necessary.

In [3]:
def phrase_extract(y, ext='php'):
    """Return the run of letters, underscores and hyphens found directly before ``.<ext>`` in ``y``.

    Parameters
    ----------
    y : str
        Text to search (the notebook passes request URIs).
    ext : str, default 'php'
        File extension to look for. It is inserted into the regular
        expression verbatim, so any regex metacharacters in it keep their
        special meaning.

    Returns
    -------
    str or None
        The captured phrase from the first match reported by ``re.search``,
        or ``None`` when ``y`` contains no match.
    """
    # Raw string: '\.' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    match = re.search(r'([a-zA-Z_-]+)\.' + ext, y)
    if match is None:
        return None
    return match.group(1)

#### A dictionary created using an ad hoc method. 
`{'search': 'szukaj',
 'sale': 'promocje',
 'recommended': 'polecane',
 'contact': 'kontakt',
 'Webserwis': 'webserwis',
 'log': 'zaloguj',
 'file': 'plik',
 'presence': 'kodprod',
 'Yes': 'tak',
 'description': 'opis'}`

#### First, get words and phrases from the Uri with method POST.

In [4]:
# Distinct URIs requested with the POST method, re-wrapped as a Series so the
# index restarts at 0.
is_post = eclog_1d['HttpMethod'] == 'POST'
post_uri = pd.Series(eclog_1d.loc[is_post, 'Uri'].unique())

In [5]:
# Split the POST URIs by whether they contain the literal text '.php'.
# `mask` is True for URIs WITHOUT '.php'; those form the "html" group, which
# therefore holds every non-PHP URI, not only ones ending in '.html'.
# The pattern is a raw string because '\.' in a plain literal is an invalid
# escape sequence; the None check uses identity rather than equality.
mask = post_uri.apply(lambda uri: re.search(r'\.php', uri) is None)
post_uri_html = pd.Series(post_uri[mask].unique())
post_uri_php = pd.Series(post_uri[~mask].unique())

In [6]:
# Extract the phrase preceding '.html' from every non-PHP POST URI and keep
# each distinct phrase once. URIs without a match yield None; those are
# dropped with dropna() instead of an elementwise `!= None` array comparison.
post_phrases_html = (
    post_uri_html
    .apply(lambda y: phrase_extract(y, ext='html'))
    .dropna()
    .unique()
)

In [7]:
# Translator whose HTTP client is configured with httpx.Timeout(10.0).
timeout = httpx.Timeout(10.0)
translator = Translator(timeout=timeout)

# Maps translated (English) text -> original phrase taken from the URI.
# NOTE(review): the translation is the key, so two phrases that translate to
# the same text overwrite each other (the last one wins).
post_phrase_html_dict = {}
for phrase in post_phrases_html:
    # Replace underscores and hyphens with spaces before translating.
    phrase_ = re.sub(r'[_-]', ' ', phrase)
    translated = translator.translate(phrase_, src='pl', dest='en')
    post_phrase_html_dict[translated.text] = phrase

In [8]:
# Persist the mapping as two-column rows: translated text, original phrase.
# The encoding is pinned to UTF-8 so the file does not depend on the platform
# default; newline='' follows the csv module's guidance for file objects.
with open('post_phrase_html_dict.csv', 'w', newline='', encoding='utf-8') as csvfile:
    phrasemaker = csv.writer(csvfile)
    phrasemaker.writerows(post_phrase_html_dict.items())

In [9]:
# Extract the phrase preceding '.php' (phrase_extract's default extension)
# from every PHP POST URI and keep each distinct phrase once. URIs without a
# match yield None; those are dropped with dropna() instead of an elementwise
# `!= None` array comparison.
post_phrases_php = (
    post_uri_php
    .apply(phrase_extract)
    .dropna()
    .unique()
)

In [10]:
# Translator whose HTTP client is configured with httpx.Timeout(10.0).
timeout = httpx.Timeout(10.0)
translator = Translator(timeout=timeout)

# Maps translated (English) text -> original phrase taken from the URI.
# NOTE(review): the translation is the key, so two phrases that translate to
# the same text overwrite each other (the last one wins).
post_phrase_php_dict = {}
for phrase in post_phrases_php:
    # Replace underscores and hyphens with spaces before translating.
    phrase_ = re.sub(r'[_-]', ' ', phrase)
    translated = translator.translate(phrase_, src='pl', dest='en')
    post_phrase_php_dict[translated.text] = phrase

In [11]:
# Persist the mapping as two-column rows: translated text, original phrase.
# The encoding is pinned to UTF-8 so the file does not depend on the platform
# default; newline='' follows the csv module's guidance for file objects.
with open('post_phrase_php_dict.csv', 'w', newline='', encoding='utf-8') as csvfile:
    phrasemaker = csv.writer(csvfile)
    phrasemaker.writerows(post_phrase_php_dict.items())

#### Second, get words and phrases from the Uri with method GET.

In [12]:
# Distinct URIs requested with the GET method, re-wrapped as a Series so the
# index restarts at 0.
is_get = eclog_1d['HttpMethod'] == 'GET'
get_uri = pd.Series(eclog_1d.loc[is_get, 'Uri'].unique())

In [13]:
# Split the GET URIs by whether they contain the literal text '.php'.
# `mask` is True for URIs WITHOUT '.php'; those form the "html" group, which
# therefore holds every non-PHP URI, not only ones ending in '.html'.
# The pattern is a raw string because '\.' in a plain literal is an invalid
# escape sequence; the None check uses identity rather than equality.
mask = get_uri.apply(lambda uri: re.search(r'\.php', uri) is None)
get_uri_html = pd.Series(get_uri[mask].unique())
get_uri_php = pd.Series(get_uri[~mask].unique())

In [14]:
# Extract the phrase preceding '.html' from every non-PHP GET URI and keep
# each distinct phrase once. URIs without a match yield None; those are
# dropped with dropna() instead of an elementwise `!= None` array comparison.
get_phrases_html = (
    get_uri_html
    .apply(lambda y: phrase_extract(y, ext='html'))
    .dropna()
    .unique()
)

In [15]:
# Translator whose HTTP client is configured with httpx.Timeout(10.0).
timeout = httpx.Timeout(10.0)
translator = Translator(timeout=timeout)

# Maps translated (English) text -> original phrase taken from the URI.
# NOTE(review): the translation is the key, so two phrases that translate to
# the same text overwrite each other (the last one wins).
get_phrase_html_dict = {}
for phrase in get_phrases_html:
    # Replace underscores and hyphens with spaces before translating.
    phrase_ = re.sub(r'[_-]', ' ', phrase)
    translated = translator.translate(phrase_, src='pl', dest='en')
    get_phrase_html_dict[translated.text] = phrase

In [16]:
# Persist the mapping as two-column rows: translated text, original phrase.
# The encoding is pinned to UTF-8 so the file does not depend on the platform
# default; newline='' follows the csv module's guidance for file objects.
with open('get_phrase_html_dict.csv', 'w', newline='', encoding='utf-8') as csvfile:
    phrasemaker = csv.writer(csvfile)
    phrasemaker.writerows(get_phrase_html_dict.items())

In [17]:
# Extract the phrase preceding '.php' (phrase_extract's default extension)
# from every PHP GET URI and keep each distinct phrase once. URIs without a
# match yield None; those are dropped with dropna() instead of an elementwise
# `!= None` array comparison.
get_phrases_php = (
    get_uri_php
    .apply(phrase_extract)
    .dropna()
    .unique()
)

In [19]:
# Translator whose HTTP client is configured with httpx.Timeout(10.0).
timeout = httpx.Timeout(10.0)
translator = Translator(timeout=timeout)

# Maps translated (English) text -> original phrase taken from the URI.
# NOTE(review): the translation is the key, so two phrases that translate to
# the same text overwrite each other (the last one wins).
get_phrase_php_dict = {}
for phrase in get_phrases_php:
    # Replace underscores and hyphens with spaces before translating.
    phrase_ = re.sub(r'[_-]', ' ', phrase)
    translated = translator.translate(phrase_, src='pl', dest='en')
    get_phrase_php_dict[translated.text] = phrase

In [20]:
# Persist the mapping as two-column rows: translated text, original phrase.
# The encoding is pinned to UTF-8 so the file does not depend on the platform
# default; newline='' follows the csv module's guidance for file objects.
with open('get_phrase_php_dict.csv', 'w', newline='', encoding='utf-8') as csvfile:
    phrasemaker = csv.writer(csvfile)
    phrasemaker.writerows(get_phrase_php_dict.items())