# Third step SMG data analysis

#### A few notes:

1. All markdown titles were kept as comments within each code cell.
2. At the beginning and at the end of each cell, a copy of the dataframe being 
modified is made so that the cell can be re-executed on its own without 
re-running the earlier cells of the notebook. 
3. Throughout the code, some summary data is stored separately so that it can 
be used in Batch 5 to plot the graphs.

In [54]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt 
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer accept the old names, so use whichever name this
# installation provides.
plt.style.use('seaborn-v0_8-colorblind'
              if 'seaborn-v0_8-colorblind' in plt.style.available
              else 'seaborn-colorblind')
%matplotlib inline
import seaborn as sns
import regex as re
from Levenshtein import distance as lev
from concurrent.futures import ThreadPoolExecutor
from dns import resolver
import requests
import threading
from datetime import datetime
import pickle

# Suppress warnings: every warning category is ignored from this point on,
# including deprecation notices raised by the libraries imported above.
import warnings
warnings.filterwarnings('ignore')

In [55]:
# Wall-clock time at this point of the run, shown as the cell output
# ('YYYY-MM-DD HH:MM:SS.ffffff').
datetime.now().isoformat(sep=' ')

'2022-09-11 14:34:55.206525'

In [56]:
# Load the graph statistics dictionary stored in 'dict_graph2.dat'.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were written by this project's own notebooks.
with open('dict_graph2.dat', 'rb') as handle:
    # Deserialize straight from the file handle instead of reading the bytes
    # into a temporary variable first.
    dict_temp_graph = pickle.load(handle)
dict_temp_graph

{'domains_total': 5309,
 'domains_MX_false': 1146,
 'domains_MX_true': 4163,
 'domains_A_false': 992,
 'domains_A_true': 4317,
 'domains_spam_sink': 91,
 'domains_known_sink': 11,
 'domains_medium_trust': 595,
 'domains_client_trust': 316,
 'users_total': 137513,
 'users_unique': 99917,
 'users_trust_eq1': 82681,
 'users_null': 456}

In [57]:
# Data read

# Every column is read as text (dtype='object').
# 'ansi' is a Windows-only codec alias (the active ANSI code page); Python on
# other platforms rejects it with LookupError. Fall back to cp1252 there --
# presumably the code page of the machine that exported the file (TODO confirm).
try:
    df_data = pd.read_csv('set_smg.csv', encoding='ansi', dtype='object')
except LookupError:
    df_data = pd.read_csv('set_smg.csv', encoding='cp1252', dtype='object')

df_data.columns

Index(['ID', 'ID_Mail', 'Email', 'Origen_mail', 'ID_Telefono',
       'Numero_Completo', 'Origen_telefono'],
      dtype='object')

In [None]:
# Data sort: order the rows by client, then phone id, then phone number.
# All columns were read as text, so the ordering is lexicographic.

sort_keys = ['ID', 'ID_Telefono', 'Numero_Completo']
df_data = df_data.sort_values(by=sort_keys)

# df_data

In [59]:
# Phone numbers extraction: keep only the client id and the phone columns.

phone_columns = ['ID', 'ID_Telefono', 'Numero_Completo', 'Origen_telefono']
df_phone = df_data.loc[:, phone_columns]

# Snapshot used by the next cell as its starting point.
df_temp_phone = df_phone.copy()

In [None]:
# Duplicates removal: rows that repeat in every column are dropped (the first
# occurrence is kept) and the index is renumbered from 0. The snapshot itself
# is not modified, so the cell can be re-run on its own.

df_phone = (
    df_temp_phone
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)

# Snapshot used by the next cell as its starting point.
df_temp_phone = df_phone.copy()

# df_phone

In [None]:
df_phone = df_temp_phone.copy()

# Classification in cellular and office numbers.
# A number is tagged 'celular' when it is 12 characters long and consists of a
# 2-4 digit prefix starting with 1, 2 or 3, followed by "15" and the remaining
# digits. Everything else is tagged 'oficina/celular', because a number
# without the "15" mark may still be a cellular one.

pattern = re.compile(r'(?i)(?P<prefix>^[1-3]\d{1,3})15(?P<number>\d+)')

# Apply this transformation only to numbers whose length is 12
matches = df_phone['Numero_Completo'].apply(
    lambda x: pattern.search(x) if len(x) == 12 else None)
# Number to dial: the matched number with the "15" removed (prefix + rest).
phone_to_call = matches.apply(
    lambda x: np.nan if x is None else x.group('prefix') + x.group('number'))

mask = matches.notnull()
df_phone['cellular'] = np.where(mask, 'celular', 'oficina/celular')

# Numbers that did not match keep their original value. The filled Series is
# assigned back to the column: calling fillna(inplace=True) on
# df_phone['phone_to_call'] would only modify a temporary object when pandas
# runs with Copy-on-Write, leaving the NaN values in the frame.
df_phone['phone_to_call'] = phone_to_call.fillna(df_phone['Numero_Completo'])

df_temp_phone = df_phone.copy()

# df_phone

In [62]:
df_phone = df_temp_phone.copy()

# Numbers shorter than 10 characters are treated as incorrect: they are
# replaced by the placeholder '---'; every other number is kept unchanged.

df_phone['phone_to_call'] = [
    number if len(number) >= 10 else '---'
    for number in df_phone['phone_to_call']
]

# Snapshot used by the next cell as its starting point.
df_temp_phone = df_phone.copy()

In [None]:
# Start again from the snapshot left by the previous cell.
df_phone = df_temp_phone.copy()

# Ranking of phone numbers within each client.
#    contactability = number of times a phone number appears within a client
# 1. A number with a higher contactability is assumed to be more likely
#    correct than a similar number of the same client.
# 2. A number that appears only once for a client gets a trust of 1.
#    That does not mean the number is incorrect, although it is more likely
#    to be so when the same client has other numbers with a higher
#    contactability.
# 3. Two entries at a Levenshtein distance of 1 from each other can both be
#    valid, so contactability = 1 alone cannot be taken to mean an
#    incorrect phone number.
# 4. Several phone numbers of one client may differ by one or more digits
#    and still all be correct.

# Every row starts at 0; the cell code below fills in the real values.
df_phone = df_phone.assign(contactability=0)

# Distinct client ids, in order of first appearance.
clients_unique = list(df_phone['ID'].unique())

def hard_work(client, lock):
    """Set the contactability of one client's repeated phone numbers.

    Every row of the global ``df_phone`` whose ``ID`` equals ``client`` and
    whose ``phone_to_call`` occurs k > 1 times among that client's rows gets
    ``contactability = k``. Rows whose number is unique within the client
    are left untouched.

    The count is assigned in one step instead of being incremented pair by
    pair, so the shared frame is locked once per client and calling the
    function again for the same client yields the same values.

    Parameters
    ----------
    client
        Value of the ``ID`` column identifying the client to process.
    lock
        Lock held while writing to the shared ``df_phone``.
    """
    phones = df_phone.loc[df_phone['ID'] == client, 'phone_to_call']
    # Occurrences of each number within this client, aligned to its rows.
    occurrences = phones.map(phones.value_counts())
    # Only numbers seen more than once are written back.
    repeated = occurrences[occurrences > 1].astype('int64')
    if repeated.empty:
        return
    with lock:
        df_phone.loc[repeated.index, 'contactability'] = repeated

# Contactability = number of rows of the same client that share the same
# phone_to_call. One grouped pass over the frame yields the values the
# per-client hard_work threads produced pair by pair (k equal numbers within
# a client end up with contactability k, a unique number with 1), without
# starting one OS thread per client and without rescanning the whole frame
# for each of them.
print('Wait...')
same_number = df_phone.groupby(['ID', 'phone_to_call'])['phone_to_call']
group_size = same_number.transform('size')
# Rows with a missing ID or number belong to no group (NaN size); like unique
# numbers they get the minimum contactability of 1.
df_phone['contactability'] = group_size.fillna(1).astype('int64')
print('Done!')

# Discarded numbers ('---') get contactability 0. The value is assigned
# through .loc on the frame itself: Series.mask(..., inplace=True) on
# df_phone['contactability'] would only change a temporary object when pandas
# runs with Copy-on-Write.
mask = (df_phone['phone_to_call'] == '---')
df_phone.loc[mask, 'contactability'] = 0

df_temp_phone = df_phone.copy()

# df_phone

In [64]:
dict_graph = dict_temp_graph.copy()

# Store graph data

dict_graph['phones_total'] = len(df_phone['phone_to_call'])
dict_graph['phones_unique'] = len(df_phone['phone_to_call'].unique())

# Matching rows are counted with mask.sum(). The previous
# mask.value_counts()[1] depended on how pandas resolves an integer key on a
# boolean index (label vs. position, which differs between versions) and
# raised when no row matched.
mask = (df_phone['cellular'] == 'celular')
dict_graph['phones_cellular'] = int(mask.sum())

mask = (df_phone['contactability'] == 1)
dict_graph['phones_trust_eq1'] = int(mask.sum())

mask = (df_phone['phone_to_call'] == '---')
dict_graph['phones_null'] = int(mask.sum())

dict_temp_graph = dict_graph.copy()

dict_graph

{'domains_total': 5309,
 'domains_MX_false': 1146,
 'domains_MX_true': 4163,
 'domains_A_false': 992,
 'domains_A_true': 4317,
 'domains_spam_sink': 91,
 'domains_known_sink': 11,
 'domains_medium_trust': 595,
 'domains_client_trust': 316,
 'users_total': 137513,
 'users_unique': 99917,
 'users_trust_eq1': 82681,
 'users_null': 456,
 'phones_total': 278763,
 'phones_unique': 164085,
 'phones_cellular': 72115,
 'phones_trust_eq1': 132106,
 'phones_null': 90}

In [65]:
# Persist the results: the phone table as CSV (index included) and the graph
# statistics dictionary as a pickle file.
df_phone.to_csv('df_phones_final.csv')

serialized_graph = pickle.dumps(dict_graph)
with open("dict_graph_final.dat", "wb") as outfile:
    outfile.write(serialized_graph)

In [66]:
# Wall-clock time at the end of the run, shown as the cell output
# ('YYYY-MM-DD HH:MM:SS.ffffff').
datetime.now().isoformat(sep=' ')

'2022-09-11 15:04:34.551528'