In [None]:
import pandas as pd
import numpy as np
import time
import re
import urllib3

import requests
from requests.adapters import HTTPAdapter
from requests import Session

import arango
from arango import ArangoClient
from arango.response import Response
from arango.http import HTTPClient

In [None]:
# Show full column contents. None means "no truncation"; the former -1 sentinel
# has been deprecated since pandas 1.0 and is not accepted by newer releases.
pd.set_option('display.max_colwidth', None)
# Silence the InsecureRequestWarning urllib3 emits for unverified HTTPS requests.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
# NOTE(review): this hides *every* warning in the notebook (including pandas
# SettingWithCopyWarning); consider narrowing the filter.
warnings.filterwarnings("ignore")

## Funciones auxiliares

In [None]:
from functools import wraps
from time import time

def timing(f):
    """
    Decorator that prints how long each call to ``f`` takes.

    The wrapped function's return value is passed through unchanged and its
    metadata (name, docstring) is preserved via ``functools.wraps``.
    The elapsed time is printed in seconds as 'Elapsed time: <seconds>'.
    """
    # Imported locally so the decorator does not depend on what the notebook's
    # global name `time` is currently bound to (the module or the function,
    # depending on which import cell ran last). perf_counter is also monotonic,
    # which makes it the appropriate clock for measuring durations.
    from time import perf_counter

    @wraps(f)
    def wrapper(*args, **kwargs):
        started = perf_counter()
        result = f(*args, **kwargs)
        print('Elapsed time: {}'.format(perf_counter() - started))
        return result

    return wrapper

In [None]:
import logging

from requests.adapters import HTTPAdapter
from requests import Session

from arango.response import Response
from arango.http import HTTPClient


class CustomHTTPClient(HTTPClient):
    """
    HTTP client for python-arango built on ``requests``.

    Each session it creates carries a custom request header and retries
    failed HTTPS requests. Every request is logged at DEBUG level on the
    'my_logger' logger and is sent with TLS verification disabled.
    """

    def __init__(self):
        # Kept for backward compatibility; requests are sent through the
        # sessions returned by create_session, not through this one.
        self._session = Session()
        # Initialize your logger.
        self._logger = logging.getLogger('my_logger')

    def create_session(self, host):
        """
        Build the ``requests`` session used to talk to a host.

        :param host: host URL the session is created for (not used here).
        :return: a new ``requests.Session`` with the custom header set and a
            retrying adapter mounted for HTTPS.
        """
        session = Session()

        # Add request header.
        session.headers.update({'x-my-header': 'true'})

        # Enable retries. The adapter must be mounted on the session that is
        # returned, otherwise the retry policy is never applied.
        adapter = HTTPAdapter(max_retries=5)
        session.mount('https://', adapter)

        return session

    def send_request(self,
                     session,
                     method,
                     url,
                     params=None,
                     data=None,
                     headers=None,
                     auth=None):
        """
        Send one HTTP request and wrap the reply for python-arango.

        :param session: ``requests`` session to send the request with.
        :param method: HTTP method name.
        :param url: request URL.
        :param params: optional URL query parameters.
        :param data: optional request payload.
        :param headers: optional request headers.
        :param auth: optional authentication passed through to ``requests``.
        :return: an ``arango.response.Response`` built from the HTTP reply.
        """
        # Add your own debug statement.
        self._logger.debug('Sending request to {}'.format(url))

        # Send a request.
        # NOTE(review): verify=False turns off TLS certificate verification,
        # which exposes the connection to man-in-the-middle attacks. Prefer
        # pointing `verify` at the server's CA bundle.
        response = session.request(
            method=method,
            url=url,
            params=params,
            data=data,
            headers=headers,
            auth=auth,
            verify=False  # Disable SSL verification
        )
        self._logger.debug('Got {}'.format(response.status_code))

        # Return an instance of arango.response.Response.
        return Response(
            method=response.request.method,
            url=response.url,
            headers=response.headers,
            status_code=response.status_code,
            status_text=response.reason,
            raw_body=response.text,
        )

In [None]:
@timing
def execute(query):
    """
    Run an AQL query through the notebook-level ``aql`` handle.

    :param query: AQL query string.
    :return: list with every document yielded by the resulting cursor.
    """
    results = []
    # Iterate explicitly: the cursor is consumed item by item.
    for document in aql.execute(query):
        results.append(document)
    return results

## Carga `peopleMaster`

In [None]:
import os
from getpass import getpass

# Credentials are never stored in the notebook: they are read from the
# environment, falling back to an interactive prompt for the password.
ARANGO_USER = os.environ.get('ARANGO_USER', 'root')
ARANGO_PASSWORD = os.environ.get('ARANGO_PASSWORD') or getpass('ArangoDB password: ')

# NOTE(review): the port in `hosts` is a placeholder ('XXXX') and must be
# replaced with the real one before running this cell.
client = ArangoClient(hosts='https://localhost:XXXX/', http_client=CustomHTTPClient())
db = client.db('gp', username=ARANGO_USER, password=ARANGO_PASSWORD)
aql = db.aql
pregel = db.pregel

# Connection smoke test: number of documents in the collection
collection = 'peopleMaster'
query=r'''RETURN LENGTH(''' + collection + ''')'''
print(query)
execute(query)

In [None]:
# One-off export of the ArangoDB collection to CSV, kept for reference:
#col = db.collection('peopleMaster')
#pd_peopleMaster = pd.DataFrame(list(col))
#pd_peopleMaster.to_csv('output/pd_peopleMaster.csv', index=False)
# NOTE(review): the export above writes under 'output/' while the load below
# reads from 'collections/' - confirm which location is the current one.
pd_peopleMaster = pd.read_csv('collections/pd_peopleMaster.csv')

In [None]:
# Peek at the first row of the loaded frame.
pd_peopleMaster.head(1)

In [None]:
# Show the row(s) whose `_key` equals this specific value.
pd_peopleMaster[pd_peopleMaster['_key'] == '1571818679-29140509']

In [None]:
# Work on an independent copy: new columns are added to this frame later on,
# and assigning into a slice of `pd_peopleMaster` triggers pandas'
# SettingWithCopyWarning.
pd_people = pd_peopleMaster[['_key', 'name', 'principal_name']].copy()

In [None]:
# Summary statistics of the selected columns.
pd_people.describe()

In [None]:
# Honorifics stripped by titleRemoval. Upper-case, because the input is
# upper-cased before matching.
_COMMON_TITLES = ("MR", "MRS", "MS", "MISS",
                  "SIR", "SISTER", "LADY", "LORD",
                  "DR", "DR ING", "DRA", "DRS",
                  "HERR", "HR",
                  "MONSIEUR", "MADAME", "MADEMOISELLE", "MLLE", "MME",
                  "CHIEF", "HRA", "ING",
                  "PROF", "PROF DR", "PROF SIR", "PROFESSOR",
                  "SR", "SR D", "SRA", "EXMA SRA", "EXMO SR", "SRTA")

# A title only counts when it is a whole word followed by a dot or by
# whitespace; otherwise names that merely start with a title ("Srinivas",
# "Drake", "Ingrid") would be truncated. Longest titles are tried first so
# multi-word titles ("PROF DR") win over their prefixes ("PROF").
# The second alternative drops a comma followed by a space and one word.
_TITLE_RE = re.compile(
    r'\b(?:' + '|'.join(sorted(_COMMON_TITLES, key=len, reverse=True)) + r')(?:\.\s*|\s+)'
    r'|, \w+'
)


def titleRemoval(text):
    """
    Remove common title prefixes from a name.

    The name is upper-cased for matching; titles followed by a dot or by
    whitespace are removed, as is any ", word" fragment. Remaining dots are
    replaced by spaces.

    :param text: name string.
    :return: the cleaned name, lower-cased and stripped.
    """
    text = _TITLE_RE.sub('', text.upper())
    text = text.replace('.', ' ')
    return text.lower().strip()

In [None]:
# Phrases removed by bannedRemoval. They must be lower-case because the input
# is lower-cased before matching.
_BANNED_PHRASES = ("personal assistant", "{external}", 'unquote', 'equity',
                   'europe', 'risk', 'summit', 'invest in bavaria', 'corporate',
                   'notification', 'reporting', 'fax', 'message', 'bgf quarterly newsletter',
                   'bgf tech track 1', '- premium cars')

# Phrases are matched literally (escaped) and as plain substrings.
# NOTE(review): substring matching also hits these words inside longer words;
# confirm whether whole-word matching is wanted.
_BANNED_RE = re.compile('|'.join(re.escape(phrase) for phrase in _BANNED_PHRASES))


def bannedRemoval(text):
    """
    Remove phrases that are not part of a person's name.

    :param text: name string.
    :return: the lower-cased name with every banned phrase removed, stripped.
    """
    return _BANNED_RE.sub('', text.lower()).strip()

In [None]:
# Characters that mark a value as "not a person's name" (web pages, etc.).
_RARE_CHARS_RE = re.compile(r"[=:/()]")


def rareRemoval(text):
    """
    Discard rare names such as web pages.

    :param text: name string.
    :return: an empty string when the name contains any of the characters
        ``= : / ( )``; otherwise the name unchanged.
    """
    return '' if _RARE_CHARS_RE.search(text) else text

In [None]:
def manualSub(text):
    """
    Apply the hand-picked string replacements.

    Any text containing "with exclusive site" (case-insensitive) is replaced
    by "biorefinery visit"; everything else is returned untouched.
    """
    needs_replacement = "with exclusive site" in text.lower()
    return "biorefinery visit" if needs_replacement else text

## Extracción del nombre:

- Paso 1: limpieza con `cleanName`. En este paso:
    - Aplica `titleRemoval` - elimina los títulos del nombre
    - Aplica `bannedRemoval` - elimina nombres baneados
    - Elimina apellidos o nombres con el carácter / en medio
    - Elimina los corchetes []
    - Aplica `rareRemoval` para eliminar nombres raros
    - Aplica `manualSub` para aplicar las sustituciones manuales
    - Sustituye los espacios dobles por espacios simples

- Paso 2: elige estrategia de extracción de nombre
    - `extractLongestName`: extrae el nombre más largo
    - `extractMostFrequentName`: extrae como nombre el conjunto de palabras más repetidas de forma consecutiva

In [None]:
from collections import Counter
import itertools

def cleanName(name):
    """
    Run the full cleaning pipeline on a single name.

    Steps, in order: remove titles, remove banned phrases, drop tokens that
    contain a slash or backslash, drop square brackets, blank out rare names,
    apply the manual substitutions and collapse repeated spaces.

    :param name: raw name string.
    :return: the cleaned name (possibly an empty string).
    """
    name = titleRemoval(name)
    name = bannedRemoval(name)
    # Remove whole tokens that contain '/' or '\' together with the
    # whitespace that precedes them.
    name = re.sub(r'\s*(?:[\w_]*[/\\](?:[\w_]*[/\\])*[\w_]*)', '', name)
    name = re.sub(r'[\[\]]', '', name)
    name = rareRemoval(name)
    name = manualSub(name)
    # Collapse any run of spaces: replacing only pairs would leave a double
    # space behind whenever three or more spaces are adjacent.
    name = re.sub(r' {2,}', ' ', name)

    return name


def extractLongestName(x):
    """
    From a string of quoted names, extract the longest one.

    The input is split on single quotes; fragments of at most one
    non-blank character are ignored and the rest are cleaned with
    ``cleanName``.

    :param x: string holding the names, or a missing value (e.g. NaN).
    :return: 'NaN' when ``x`` is not a string, an empty string when no name
        survives, otherwise the longest cleaned name in title case (the first
        one wins on ties).
    """
    # Missing values arrive as non-string objects (NaN is a float).
    if not isinstance(x, str):
        return 'NaN'

    names = [cleanName(fragment.lower())
             for fragment in x.split('\'')
             if len(fragment.strip()) > 1]

    # Always return a string, never an implicit None.
    if not names:
        return ''

    return max(names, key=len).title()

def word_fequency(name_list):
    """
    Count word frequencies across a list of names.

    :param name_list: iterable of name strings.
    :return: tuple ``(words, counts)`` with the (at most) two most common
        words and their respective counts, most common first.
    """
    # split() without arguments drops the empty strings that blank names or
    # repeated spaces would otherwise contribute, so '' is never a "word".
    counter = Counter(itertools.chain.from_iterable(name.split() for name in name_list))
    top_two = counter.most_common(2)
    words = [word for word, _ in top_two]
    counts = [count for _, count in top_two]

    return words, counts

def most_frequent_name(name_list):
    """
    Build a two-word name from the two most frequent words.

    Both orderings of the two words are counted as substrings across
    ``name_list``; the ordering that occurs more often is returned.

    :param name_list: list of cleaned name strings.
    :return: the two-word name; the single word when only one distinct word
        exists; an empty string when there are no words at all.
    """
    words, _ = word_fequency(name_list)

    # Fewer than two distinct words: return a plain string, not the list.
    if not words:
        return ''
    if len(words) == 1:
        return words[0]

    first_then_second = words[0] + ' ' + words[1]
    second_then_first = words[1] + ' ' + words[0]

    first_count = sum(name.count(first_then_second) for name in name_list)
    second_count = sum(name.count(second_then_first) for name in name_list)

    # On a tie the second ordering is kept.
    if first_count > second_count:
        return first_then_second
    return second_then_first


def extractMostFrequentName(x):
    """
    From a string of quoted names, extract the most frequent two-word name.

    The input is split on single quotes; fragments of at most one
    non-blank character are ignored and the rest are cleaned with
    ``cleanName``.

    :param x: string holding the names, or a missing value (e.g. NaN).
    :return: 'NaN' when ``x`` is not a string, an empty string when no name
        survives, the single name when there is only one, otherwise the
        result of ``most_frequent_name``; always in title case.
    """
    # Missing values arrive as non-string objects (NaN is a float).
    if not isinstance(x, str):
        return 'NaN'

    names = [cleanName(fragment.lower())
             for fragment in x.split('\'')
             if len(fragment.strip()) > 1]

    if not names:
        return ''

    if len(names) == 1:
        name = names[0]
    else:
        # Several candidates: work out the most frequent form.
        name = most_frequent_name(names)
        # Guard against a list of words coming back, which would otherwise
        # be rendered as its repr, e.g. "['John']".
        if isinstance(name, (list, tuple)):
            name = ' '.join(name)

    return str(name).title()

In [None]:
# assign() returns a new frame, so the columns are not written into a slice
# of `pd_peopleMaster` and the cell can be re-run safely.
pd_people = pd_people.assign(
    longest_name=pd_people['name'].apply(extractLongestName),
    most_frequent_name=pd_people['name'].apply(extractMostFrequentName),
)

In [None]:
# Export the result as a pipe-separated file.
# NOTE(review): the DataFrame index is written as the first column (to_csv
# default) - confirm that is intended.
pd_people.to_csv('output/renames.csv', sep='|')