# Second step SMG data analysis

**Note:** Due to an unexpected behavior of Jupyter notebooks, cells with heavy 
calculations are executed much more slowly as the size of the notebook increases. 
Thus, the complete analysis is split into different batches in order to reduce 
the total execution time.

#### A few notes:

1. All markdown titles were kept as comments within each code cell.
2. At the beginning and at the end of each cell a copy of the dataframe being 
modified is made in order to allow for several executions of that cell without 
running the previous part of the ipynb. 
3. Throughout the code, some data is stored separately so that it can be used 
in Batch 5 to plot the graphs.

In [75]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt 
plt.style.use('seaborn-colorblind')
%matplotlib inline
import seaborn as sns
import regex as re
from Levenshtein import distance as lev
from concurrent.futures import ThreadPoolExecutor
from dns import resolver
import requests
import threading
from datetime import datetime
import pickle

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [76]:
# Show the current local date and time as a string (displayed as the cell output).
str(datetime.now())

'2022-09-11 14:30:02.178502'

In [77]:
# Load the graph counters stored in 'dict_graph1.dat'
# (presumably written by the previous batch -- confirm).
# NOTE(review): pickle can execute arbitrary code while loading, so this
# file must come from a trusted source.
with open('dict_graph1.dat', 'rb') as graph_file:
    dict_temp_graph = pickle.load(graph_file)

dict_temp_graph

{'domains_total': 5309,
 'domains_MX_false': 1146,
 'domains_MX_true': 4163,
 'domains_A_false': 992,
 'domains_A_true': 4317,
 'domains_spam_sink': 91,
 'domains_known_sink': 11,
 'domains_medium_trust': 595}

In [None]:
# Read 'df_mails_batch1.csv' into a dataframe, using the first CSV column as
# the index (presumably the output of the previous batch -- confirm).
df_temp_mails = pd.read_csv('df_mails_batch1.csv', index_col=[0])

# df_temp_mails

In [79]:
# Allocation of domains by client and (within client) by same user
#
# For every row, the other rows of the same client are scanned for entries
# flagged as valid that share the same user but carry a different domain.
# Such a domain becomes the row's 'valid_domain', with a trust of
# 71 - Levenshtein distance, whenever that value is higher than the trust
# the row currently has.

# Work on a copy so this cell can be re-executed on its own.
df_mails = df_temp_mails.copy()

for client_id in list(df_mails.ID.unique()):
    client_rows = df_mails[df_mails['ID'] == client_id].copy(deep=False)
    if client_rows.shape[0] < 2:
        # A single row has nothing to be compared against.
        continue
    for target_idx in client_rows.index:
        target_domain = client_rows.at[target_idx, 'domain']
        target_user = client_rows.at[target_idx, 'user']
        for source_idx in client_rows.index:
            # Only rows flagged as valid may donate their domain. The explicit
            # comparison with True is kept because the dtype of the 'valid'
            # column is not visible here.
            if not (client_rows.at[source_idx, 'valid'] == True):
                continue
            if client_rows.at[source_idx, 'user'] != target_user:
                continue
            source_domain = client_rows.at[source_idx, 'domain']
            if source_domain == target_domain:
                continue
            candidate_trust = 71 - lev(source_domain, target_domain)
            # The current trust is read from df_mails (not from the per-client
            # copy) so that earlier updates of this row are taken into account.
            if df_mails.at[target_idx, 'trust'] < candidate_trust:
                df_mails.at[target_idx, 'valid_domain'] = source_domain
                df_mails.at[target_idx, 'trust'] = candidate_trust
                print('+-', end='')

# Keep the result for the following cells.
df_temp_mails = df_mails.copy()

+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-

In [80]:
# Store graph data: number of rows whose trust lies between 41 and 70
# (both inclusive), saved under the key 'domains_client_trust'.

# Work on a copy so this cell can be re-executed on its own.
dict_graph = dict_temp_graph.copy()

mask = (df_mails.trust <= 70) & (df_mails.trust >= 41)

# Count the True entries directly. `mask.value_counts()[1]` depended on the
# integer key 1 being resolved to the label True, which is ambiguous on a
# boolean-labelled Series (it may be taken as a position instead) and raises
# when no row matches the mask.
dict_graph.update({'domains_client_trust': int(mask.sum())})

dict_temp_graph = dict_graph.copy()
dict_graph

{'domains_total': 5309,
 'domains_MX_false': 1146,
 'domains_MX_true': 4163,
 'domains_A_false': 992,
 'domains_A_true': 4317,
 'domains_spam_sink': 91,
 'domains_known_sink': 11,
 'domains_medium_trust': 595,
 'domains_client_trust': 316}

In [None]:
# Ranking of users by client and (within client)
#    user_trust = user count within client
# 1. It assumes that a user with a higher user_trust is more likely
#    to be correct when it is compared to a similar user (within a client).
# 2. If there is only one user of a kind for one client, its trust is 1.
#    This does not imply that the user is incorrect, though it is likely if
#    there are similar users for that client with a higher user_trust.
# 3. There are users that are at Levenshtein distance of 1 which are
#    both valid, so it cannot be assumed that a user_trust = 1 means
#    an incorrect user.

# Work on a copy so this cell can be re-executed on its own.
df_mails = df_temp_mails.copy()

print('Wait...')

# The number of rows sharing the same (ID, user) pair is computed with a
# single groupby instead of comparing every pair of rows of each client,
# which was quadratic in the number of rows per client.
# Rows with a missing ID or user never match any other row, so they are left
# out of the grouping and receive the default value of 1 below.
keyed_rows = df_mails[['ID', 'user']].dropna()
group_sizes = keyed_rows.groupby(['ID', 'user'])['user'].transform('size')

# Plain column assignment: an in-place `mask` on a selected column is a
# chained update that is not guaranteed to reach the dataframe.
df_mails['user_trust'] = (
    group_sizes.reindex(df_mails.index, fill_value=1).astype('int64')
)
print('Done!')

# Keep the result for the following cells.
df_temp_mails = df_mails.copy()

# df_mails

In [None]:
# Render the recommended email to use for each provided mail
#
# Rows whose trust is 0 get the placeholder '---'; every other row gets
# '<user>@<valid_domain>'.

# Work on a copy so this cell can be re-executed on its own.
df_mails = df_temp_mails.copy()

has_trust = df_mails.trust != 0
full_address = df_mails['user'] + '@' + df_mails['valid_domain']
df_mails['email_to_use'] = full_address.where(has_trust, '---')

# Keep the result for the following cells.
df_temp_mails = df_mails.copy()

# df_mails

In [84]:
# Store graph data: total and unique number of users, number of rows with
# user_trust equal to 1 and number of rows without a recommended email.

# Work on a copy so this cell can be re-executed on its own.
dict_graph = dict_temp_graph.copy()

dict_graph.update({'users_total': len(df_mails.user)})
dict_graph.update({'users_unique': len(df_mails.user.unique())})

# The True entries of each mask are counted with `sum()`. The previous
# `mask.value_counts()[1]` depended on the integer key 1 being resolved to the
# label True, which is ambiguous on a boolean-labelled Series and raises when
# no row matches the mask.
mask = (df_mails.user_trust == 1)
dict_graph.update({'users_trust_eq1': int(mask.sum())})

# NOTE(review): the output recorded below this cell shows the key
# 'users_null', while the code stores 'emails_null' -- confirm which key the
# plotting batch expects.
mask = (df_mails.email_to_use == '---')
dict_graph.update({'emails_null': int(mask.sum())})

dict_temp_graph = dict_graph.copy()

dict_graph

{'domains_total': 5309,
 'domains_MX_false': 1146,
 'domains_MX_true': 4163,
 'domains_A_false': 992,
 'domains_A_true': 4317,
 'domains_spam_sink': 91,
 'domains_known_sink': 11,
 'domains_medium_trust': 595,
 'domains_client_trust': 316,
 'users_total': 137513,
 'users_unique': 99917,
 'users_trust_eq1': 82681,
 'users_null': 456}

In [None]:
# Render the contactability to evaluate for each provided mail
#
# The value is the text '<user_trust> @ <trust>' built from both columns.

# Work on a copy so this cell can be re-executed on its own.
df_mails = df_temp_mails.copy()

user_part = df_mails['user_trust'].astype(str)
domain_part = df_mails['trust'].astype(str)
df_mails['contactability'] = user_part.str.cat(domain_part, sep=' @ ')

# Keep the result for the following cells.
df_temp_mails = df_mails.copy()

# df_mails

In [86]:
# Save data for the next batch process:
# the dataframe goes to 'df_mails_final.csv' and the graph counters are
# pickled into 'dict_graph2.dat'.

df_mails.to_csv('df_mails_final.csv')

with open("dict_graph2.dat", "wb") as graph_file:
    graph_file.write(pickle.dumps(dict_graph))

In [87]:
# Show the current local date and time as a string (displayed as the cell output).
str(datetime.now())

'2022-09-11 14:32:45.262170'