In [2]:
import pandas as pd
import numpy as np
import time
import matplotlib.pylab as plt
from datetime import timezone
import json

pd.set_option('display.max_colwidth', -1)

import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn import preprocessing

In [None]:
import arango
from arango import ArangoClient
from arango.response import Response
from arango.http import HTTPClient

### Funciones auxiliares

In [None]:
def read_jsonl_to_pandas(path_to_file):
    """
    create a pandas DataFrame from a jsonline file
    """
    with open(path_to_file, 'r') as json_file:
        json_list = list(json_file)

    result = []    
    for json_str in json_list:
        result.append(json.loads(json_str))

    return pd.DataFrame(result)

In [None]:
def read_arango_to_pandas(db, collection):
    """
    read an ArangoDB collection and pandas
    
    :param: db - Arango collection    
    :param: collection - Calection to retrieve
    """

    col = db.collection(collection)
    return pd.DataFrame(list(col))

In [None]:
from functools import wraps
import time

def timing(f):
    @wraps(f)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = f(*args, **kwargs)
        end = time.time()
        print('Elapsed time: {}'.format(end-start))
        return result
    return wrapper

In [None]:
import logging

from requests.adapters import HTTPAdapter
from requests import Session

from arango.response import Response
from arango.http import HTTPClient


class CustomHTTPClient(HTTPClient):
    """My custom HTTP client with cool features."""

    def __init__(self):
        self._session = Session()
        # Initialize your logger.
        self._logger = logging.getLogger('my_logger')

    def create_session(self, host):
        session = Session()

        # Add request header.
        session.headers.update({'x-my-header': 'true'})

        # Enable retries.
        adapter = HTTPAdapter(max_retries=5)
        self._session.mount('https://', adapter)

        return session

    def send_request(self,
                     session,
                     method,
                     url,
                     params=None,
                     data=None,
                     headers=None,
                     auth=None):
        # Add your own debug statement.
        self._logger.debug('Sending request to {}'.format(url))

        # Send a request.
        response = session.request(
            method=method,
            url=url,
            params=params,
            data=data,
            headers=headers,
            auth=auth,
            verify=False  # Disable SSL verification
        )
        self._logger.debug('Got {}'.format(response.status_code))

        # Return an instance of arango.response.Response.
        return Response(
            method=response.request.method,
            url=response.url,
            headers=response.headers,
            status_code=response.status_code,
            status_text=response.reason,
            raw_body=response.text,
        )

In [None]:
@timing
def execute(query):
    cursor = aql.execute(query)
    item_keys = [doc for doc in cursor]
    return item_keys

## Data: Grafo

In [None]:
file = 'master_edges_v3.jsonl'
pd_grafo = read_jsonl_to_pandas(file)

print(pd_grafo.head(1))
print(pd_grafo.columns)

In [None]:
pd_grafo['from'] = pd_grafo['_from'].apply(lambda x: x.split('/')[-1])
pd_grafo['to'] = pd_grafo['_to'].apply(lambda x: x.split('/')[-1])

## Data: Relaciones A $\to$ B

In [4]:
# datos tabla master:
pd_master = pd.read_csv('master_source_table.csv')

In [5]:
pd_master.head(1)

Unnamed: 0,_key,_id,_from,_to,_rev,id,created_at,type
0,-4Xr2h7tKSUIh8hBmJKQmlsjtPRthFwbOx4JEHYJhu8P-3A7vg-Alessandro.Casartelli@gpbullhound.com_sent_1572271009-137259152,master_source_table/-4Xr2h7tKSUIh8hBmJKQmlsjtPRthFwbOx4JEHYJhu8P-3A7vg-Alessandro.Casartelli@gpbullhound.com_sent_1572271009-137259152,peopleMaster/1571814086-48906921,peopleMaster/1572271009-137259152,_ZgHrgNG--K,mails_processed/-4Xr2h7tKSUIh8hBmJKQmlsjtPRthFwbOx4JEHYJhu8P-3A7vg-Alessandro.Casartelli@gpbullhound.com,2019-04-30T16:07:11.000Z,sent


In [6]:
# limpia el dataframe
pd_master = pd_master[['_from', '_to', 'type', 'id', 'created_at']]
pd_master.columns = ['from', 'to', 'type', 'id', 'date']

pd_master['date'] = pd_master.date.apply(lambda x: str(x)[0:10])
pd_master['from'] = pd_master['from'].apply(lambda x: str(x).replace('peopleMaster/',''))
pd_master['to'] = pd_master['to'].apply(lambda x: str(x).replace('peopleMaster/',''))

## Función que calcula número de personas en interacciones

In [81]:
def people_activity_relation(pd_relations):
    """
    identify people involved in the same activity    
    """
    
    pd1 = pd_relations[['from', 'to', 'type', 'id']]
    aux = pd.DataFrame({'from':pd1['to'], 'to':pd1['from'], 'id':pd1['id'], 'type':pd1['type']})
    pd2 = pd.concat([pd1, aux], axis=0).sort_index().drop_duplicates(keep='first')
    pd2 = pd2[~pd2.index.duplicated(keep='first')].reset_index(drop=True)
    pd2 = pd2[['from', 'to', 'type', 'id']]

    people = []
    for name, group in pd2.groupby('id'):
    
        s = set(group['from'].values).union(set(group['to'].values))    
        people.append([name, group.type.unique()[0], s, len(s)])

    return pd.DataFrame(people, columns=["id", "type", "participants", "number"])

In [82]:
pd_people = people_activity_relation(pd_master)
pd_people.head(2)

Unnamed: 0,id,type,participants,number
0,activityPipedrive/11575,activity,"{1571760048-129650078, 1571814086-48906921, 1571824562-83745457}",3
1,activityPipedrive/12812,activity,"{1571817791-82992245, 1571827131-30658636, 1571812576-51196481}",3


Comprobamos que en ambos dataframes tenemos el mismo numero de identificadores de actividad


In [115]:
[len(pd_people.id.unique()), len(pd_master.id.unique())]

[66516, 66516]

In [85]:
[pd_people.number.min(), pd_people.number.max()]

[2, 202]

In [117]:
pd_people[['id', 'number']].head()

Unnamed: 0,id,number
0,activityPipedrive/11575,3
1,activityPipedrive/12812,3
2,activityPipedrive/13695,2
3,activityPipedrive/2829,2
4,activityPipedrive/5830,2


Tenemos dos DataFrames, `pd_master` contiene las relaciones A $\to$ B, y `pd_people` que contiene el número de personas involucrados en la interacción. Ambos DataFrames tienen un único identificador `id`. Tenemos que hacer un `pd.merge` de los DataFrames por `id`

In [124]:
#add new column to pd_master with number of people involved
pd_master['number'] = pd_master[['id']].merge(pd_people[['id', 'number']], on=['id'])['number']

In [125]:
pd_master.shape

(99192, 6)

## Función que genera los pesos

Esta función hace uso de la información obtenida de la tabla master así como del número de personas involucradas en la interacción. El número de personas en la interacción se incorpora como una potencia negativa $1/(\text{personas})^k$ de manera que a mayor número, menor importancia se le da.

In [183]:
def create_weights(pd_data, parameters):
    """
    create the weights of interactions

    :param: pd_data - pandas.DataFrame for the analysis
    :param: pd_people - pandas.DataFrame with number of people involved in activity
    :param: parameters - model parameters {'alpha','p_sent','p_cc','p_event','p_job','p_activity'}
    
            sum(p) = 1
    
    :output: dic_weighted - dictionary with df per type with damping factor p*exp[alpha(t-t0)]
    """
    
    #reduce timestamp by this factor
    factor = 10**7

    #min-max scaler
    scaler = preprocessing.MinMaxScaler()

    #current utc timestamp
    t0 = time.time()/factor
    
    pd_out = []        
    names = []
    
    for name, group in pd_data.groupby('type'):
    
        p_group = parameters['p_'+name]
        q_group = parameters['q_'+name]        
        alpha_group = parameters['alpha_'+name]
        
        #set timezone of datetime object to utc and calculates the timestamp
        group['t'] = pd.to_datetime(group.date).apply(lambda x: x.replace(tzinfo=timezone.utc).timestamp()/factor)
        #weight is decreasing with time and number of people
        group['t-t0'] = group['t'] - t0
        group['weight'] = np.exp( alpha_group * group['t-t0'] ) * q_group#/group['number']**3
        #sum over interaction pairs
        final = group[['from', 'to', 'weight']].groupby(['from', 'to']).sum().reset_index()
        #apply log-transformation to compensate for highly interecting people
        final['weight'] = np.log1p( final['weight'] )
        #weight by group importance
        final['weight'] = final['weight'] * p_group

        pd_out.append(final)
        names.append(name)

    #store weight dataframes por group
    dic_groups = dict(zip(names, pd_out))

    #group by and sum weights by activities
    pd_weight = pd.concat([value for key, value in dic_groups.items()]).groupby(['from', 'to']).sum().reset_index()      
    
    #scale to be a number in range [0,1] and return the complement 
    pd_weight['weight'] = (1 - scaler.fit_transform(pd_weight['weight'].values.reshape(-1,1)))

    #format output
    pd_weight.columns = ['_from','_to', 'weight']

    pd_weight['_key'] = pd_weight.apply(lambda row: row['_from']+'_edge_'+row['_to'], axis=1)
    pd_weight['_from'] = pd_weight['_from'].apply(lambda x: 'peopleMaster/'+str(x))
    pd_weight['_to'] = pd_weight['_to'].apply(lambda x: 'peopleMaster/'+str(x))

    pd_weight['interactions'] = '1'
    pd_weight['first'] = "2015-07-08T15:00:00.000Z"
    pd_weight['last'] = "2015-07-08T20:00:00.000Z"
    pd_weight['nSent'] = '0'
    pd_weight['nCc'] = '0'
    pd_weight['nActivities'] = '0'
    pd_weight['nDeals'] = '0'
    pd_weight['nJob'] = '0'
    pd_weight['nEvents'] = '1'    
    
    return pd_weight

In [208]:
params = {
    
    #decaimiento temporal
    'alpha_sent':0.1, 
    'alpha_cc':1.5, 
    'alpha_event':0.5, 
    'alpha_job':0.01, 
    'alpha_activity':0.3,
    
    #pesos de importancia de cada actividad
    'p_sent':0.7, 
    'p_cc':0.05, 
    'p_event':0.1, 
    'p_job':0.05, 
    'p_activity':0.1,

    #pesos 2
    'q_sent':6,
    'q_cc':0.15, 
    'q_event':1., 
    'q_job':0.01, 
    'q_activity':2.

}

pd_weight = create_weights(pd_merged, params)

In [209]:
pd_weight.head(1)

Unnamed: 0,_from,_to,weight,_key,interactions,first,last,nSent,nCc,nActivities,nDeals,nJob,nEvents
0,peopleMaster/1571760048-101557862,peopleMaster/1571814086-48906921,0.999982,1571760048-101557862_edge_1571814086-48906921,1,2015-07-08T15:00:00.000Z,2015-07-08T20:00:00.000Z,0,0,0,0,0,1


Algunas comprobaciones

In [216]:
#Sendagorta -> Casartelli, 3

wAB = pd_weight.loc[(pd_weight["_from"] == 'peopleMaster/'+key_jaime_sendagorta) & (pd_weight["_to"] == 'peopleMaster/'+key_casartelli), 'weight'].values[0]
wBA = pd_weight.loc[(pd_weight["_to"] == 'peopleMaster/'+key_jaime_sendagorta) & (pd_weight["_from"] == 'peopleMaster/'+key_casartelli), 'weight'].values[0]
w = (wAB + wBA)/2
print(round(1-w,1)*10)

#Sendagorta -> Eduadrdo F. 1
#wAB = pd_weight.loc[(pd_weight["_from"] == 'peopleMaster/'+key_jaime_sendagorta) & (pd_weight["_to"] == 'peopleMaster/'+key_eduardo_fernandez), 'weight'].values[0]
#wBA = pd_weight.loc[(pd_weight["_to"] == 'peopleMaster/'+key_jaime_sendagorta) & (pd_weight["_from"] == 'peopleMaster/'+key_eduardo_fernandez), 'weight'].values[0]
#w = (wAB + wBA)/2
#print(round(1-w,3)*10)

#Sendagorta -> Ernesto 2
wAB = pd_weight.loc[(pd_weight["_from"] == 'peopleMaster/'+key_jaime_sendagorta) & (pd_weight["_to"] == 'peopleMaster/'+key_ernesto_funes), 'weight'].values[0]
wBA = pd_weight.loc[(pd_weight["_to"] == 'peopleMaster/'+key_jaime_sendagorta) & (pd_weight["_from"] == 'peopleMaster/'+key_ernesto_funes), 'weight'].values[0]
w = (wAB + wBA)/2
print(round(1-w,1)*10)

#Sendagorta -> Gioia , 9
wAB = pd_weight.loc[(pd_weight["_from"] == 'peopleMaster/'+key_jaime_sendagorta) & (pd_weight["_to"] == 'peopleMaster/'+key_gioia_cerbelli), 'weight'].values[0]
wBA = pd_weight.loc[(pd_weight["_to"] == 'peopleMaster/'+key_jaime_sendagorta) & (pd_weight["_from"] == 'peopleMaster/'+key_gioia_cerbelli), 'weight'].values[0]
w = (wAB + wBA)/2
print(round(1-w,1)*10)

#Sendagorta -> Viktor Fritzen
wAB = pd_weight.loc[(pd_weight["_from"] == 'peopleMaster/'+key_jaime_sendagorta) & (pd_weight["_to"] == 'peopleMaster/'+key_viktor_fritzen), 'weight'].values[0]
wBA = pd_weight.loc[(pd_weight["_to"] == 'peopleMaster/'+key_jaime_sendagorta) & (pd_weight["_from"] == 'peopleMaster/'+key_viktor_fritzen), 'weight'].values[0]
w = (wAB + wBA)/2
print(round(1-w,1)*10)



3.0
0.0
7.0


IndexError: index 0 is out of bounds for axis 0 with size 0