In [36]:
import pandas as pd
# import numpy as np
import datetime
import plotly.express as px

from utils import helper_funtions

In [37]:
pd.options.display.max_columns = None  # Remove "dots" from display when printing dataframes

In [38]:
PATH = helper_funtions.get_path()

# Read data

In [39]:
df = pd.read_excel(PATH + 'data\Sales_outbound.xlsx')
df.sample(n=5).head()

Unnamed: 0,Client_ID,Name,Age,Location,Income,TAX,previous sales_#,Type_of_Products,Contact_Channel,Contact_hour,Num_Contacts,Satisfaction_Score,Sales
323,324,,27,NY,35782.0,5367.3,2,B,Online Chat,12:30:00,6,1,0
200,201,,28,TX,66378.0,9956.7,0,C,Email,06:00:00,6,1,0
474,475,,50,FL,77684.0,11652.6,0,B,Phone,05:30:00,5,2,0
65,66,,54,FL,63397.0,9509.55,0,C,Phone,11:30:00,2,2,0
951,952,,39,CA,99624.0,14943.6,2,B,Phone,07:45:00,1,5,1


# EDA

## Describe data

In [40]:
print(f'Filas: {df.shape[0]}\nColumnas: {df.shape[1]}')
df.dtypes

Filas: 1000
Columnas: 13


Client_ID               int64
Name                  float64
Age                     int64
Location               object
Income                float64
TAX                   float64
previous sales_#        int64
Type_of_Products       object
Contact_Channel        object
Contact_hour           object
Num_Contacts            int64
Satisfaction_Score      int64
Sales                   int64
dtype: object

In [41]:
df.describe()

Unnamed: 0,Client_ID,Name,Age,Income,TAX,previous sales_#,Num_Contacts,Satisfaction_Score,Sales
count,1000.0,0.0,1000.0,996.0,996.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,,41.191,63100.329317,9465.049398,2.052,3.492,2.558,0.269
std,288.819436,,12.259234,21638.692537,3245.803881,2.635952,2.399102,1.331318,0.443662
min,1.0,,18.0,5000.0,750.0,-7.0,1.0,1.0,0.0
25%,250.75,,31.0,46277.75,6941.6625,0.0,2.0,1.0,0.0
50%,500.5,,40.5,62770.5,9415.575,1.0,3.0,2.0,0.0
75%,750.25,,51.0,80618.0,12092.7,4.0,5.0,3.0,1.0
max,1000.0,,125.0,165355.0,24803.25,25.0,33.0,5.0,1.0


In [42]:
print('Suma de valores faltantes en el conjunto de datos:')
df.isnull().sum()

Suma de valores faltantes en el conjunto de datos:


Client_ID                0
Name                  1000
Age                      0
Location                 0
Income                   4
TAX                      4
previous sales_#         0
Type_of_Products         0
Contact_Channel          0
Contact_hour             0
Num_Contacts             0
Satisfaction_Score       0
Sales                    0
dtype: int64

## Visualization

# Otro

In [47]:
# Determine peak times by rounding the time in groups of one hour 
df['Contact_hour'] = pd.to_datetime(df['Contact_hour'], format='%H:%M:%S')
df['Contact_hour_round'] = df['Contact_hour'].apply(lambda x: x.round('60 min'))

fig = px.histogram(df, x='Contact_hour_round')
fig.show()

df_peak_time = df['Contact_hour_round'].value_counts().head()
df_peak_time = df_peak_time.rename_axis('Hour')
df_peak_time = df_peak_time.reset_index(name='Count')
peak_hours = []
for i in range(len(df_peak_time)):
    peak_hours.append(df_peak_time['Hour'].iloc[i])

for i in range(len(df)):
    if df.loc[i, 'Contact_hour_round'] in peak_hours:
        df.at[i, 'Is_peak_hour'] = 1
    else:
        df.at[i, 'Is_peak_hour'] = 0

In [18]:
# Codificar
# df['Contact_channel']
# df['Type_products']

In [19]:
# imputaciones
# df.query("Age > 70")
# df.query("Previous_sales < 0")


In [20]:
# Homogenise column names in the data set
df = df.rename({
    'previous sales_# ':'Previous_sales',
    'Type_of_Products':'Type_products',
    'Contact_Channel':'Contact_channel',
    'Num_Contacts':'Num_contacts',
    'Satisfaction_Score':'Satisfaction_score'
    }, axis=1)

In [21]:
df['Previous_sales'].value_counts()

 0     425
 1     131
 2     104
 4      95
 3      76
 6      52
 5      49
 7      23
 8      21
 9      14
 10      6
 20      1
 25      1
-5       1
-7       1
Name: Previous_sales, dtype: int64

# EDA

# Preprocesing

## Cibersecurity

In [22]:
df.drop(['Name'], axis=1)
df.insert(
    0, 'UUID_client2', 
    df['Client_ID'].apply(
    lambda x: helper_funtions.uuid_generator(x)
    )
)

In [23]:
df.sample(n=5).head()

Unnamed: 0,UUID_client2,Client_ID,Name,Age,Location,Income,TAX,Previous_sales,Type_products,Contact_channel,Contact_hour,Num_contacts,Satisfaction_score,Sales,Contact_hour_round,is_peak_hour,is hora pico
686,ac7ced99-db82-5e1c-86b9-8821ae18b83c,687,,46,CA,55279.0,8291.85,0,B,Email,1900-01-01 06:30:00,1,5,0,1900-01-01 06:00:00,1.0,Hora valle
172,4880bf5f-79a6-5c6d-b720-2bc74613dbaf,173,,27,FL,62779.0,9416.85,1,C,Email,1900-01-01 06:00:00,8,2,0,1900-01-01 06:00:00,1.0,Hora valle
743,241cc4bf-277d-5692-95db-a4079e389819,744,,22,FL,64521.0,9678.15,0,C,Email,1900-01-01 18:00:00,1,2,0,1900-01-01 18:00:00,0.0,Hora valle
353,cf529db4-e5f7-5dff-9f6d-c238d548b76b,354,,60,FL,78895.0,11834.25,2,C,Phone,1900-01-01 09:15:00,1,5,1,1900-01-01 09:00:00,0.0,Hora valle
143,a7d7e5e1-98c8-502d-bbfd-4c7ab64e1bc4,144,,31,IL,69882.0,10482.3,0,C,Email,1900-01-01 16:45:00,4,3,0,1900-01-01 17:00:00,0.0,Hora valle
