In [11]:
import pandas as pd
# import numpy as np
# import datetime
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from utils import helper_funtions


In [12]:
pd.options.display.max_columns = None  # Remove "dots" from display when printing dataframes

In [13]:
PATH = helper_funtions.get_path()

# Read data

In [14]:
df = pd.read_csv(PATH + 'data/data_cleaning.csv')
df.sample(n=5).head()

Unnamed: 0,UUID_client,Age,Location,Income,TAX,Previous_sales,Type_products,Contact_channel,Contact_hour,Num_contacts,Satisfaction_score,Sales
522,2e72f33a-4c6c-5b08-bb32-1d32d0697f1b,40,CA,87866.0,13179.9,0,C,Phone,11:45:00,2,2,0
974,91fa5a4f-687e-5192-a829-dbbce54c41d1,25,TX,36816.0,5522.4,0,C,Email,06:15:00,2,2,0
158,1ddb8ad9-67a1-5fdf-a5c4-bf5937d983a1,56,FL,65643.0,9846.45,0,B,Phone,16:00:00,2,2,0
412,f4351e78-1aec-5c1f-b1aa-a0945bf60594,60,NY,62371.0,9355.65,0,A,Phone,05:30:00,4,1,0
240,972f4934-3e84-5229-90be-83a905fd8d99,40,FL,94174.0,14126.1,1,B,Email,15:00:00,3,2,0


# EDA

## Describe data

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   UUID_client         1000 non-null   object 
 1   Age                 1000 non-null   int64  
 2   Location            1000 non-null   object 
 3   Income              996 non-null    float64
 4   TAX                 996 non-null    float64
 5   Previous_sales      1000 non-null   int64  
 6   Type_products       1000 non-null   object 
 7   Contact_channel     1000 non-null   object 
 8   Contact_hour        1000 non-null   object 
 9   Num_contacts        1000 non-null   int64  
 10  Satisfaction_score  1000 non-null   int64  
 11  Sales               1000 non-null   int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 93.9+ KB


In [16]:
df.describe()

Unnamed: 0,Age,Income,TAX,Previous_sales,Num_contacts,Satisfaction_score,Sales
count,1000.0,996.0,996.0,1000.0,1000.0,1000.0,1000.0
mean,41.191,63100.329317,9465.049398,2.052,3.492,2.558,0.269
std,12.259234,21638.692537,3245.803881,2.635952,2.399102,1.331318,0.443662
min,18.0,5000.0,750.0,-7.0,1.0,1.0,0.0
25%,31.0,46277.75,6941.6625,0.0,2.0,1.0,0.0
50%,40.5,62770.5,9415.575,1.0,3.0,2.0,0.0
75%,51.0,80618.0,12092.7,4.0,5.0,3.0,1.0
max,125.0,165355.0,24803.25,25.0,33.0,5.0,1.0


In [17]:
print('Suma de valores faltantes en el conjunto de datos:')
df.isnull().sum()

Suma de valores faltantes en el conjunto de datos:


UUID_client           0
Age                   0
Location              0
Income                4
TAX                   4
Previous_sales        0
Type_products         0
Contact_channel       0
Contact_hour          0
Num_contacts          0
Satisfaction_score    0
Sales                 0
dtype: int64

Observaciones:
1. La cantidad de compras que el cliente ha realizado en el pasado cuenta con valores negativos lo que no corresponde con un registro lógico.
2. Existen usuarios con edades mayores a 70 años que parecen ser datos incorrectos
3. Existen 181 usuarios con interacciones superiores a cinco, de las cuales, una alcanza las 33 interacciones.
4. La mitad de la muestra califica la satisfacción entre un nivel uno y dos (muy bajo).
5. Al menos la mitad de los clientes no tuvieron una venta efectiva.

## Visualization

### Categorical data

In [18]:
categorical_columns = [
    'Location', 'Type_products', 'Contact_channel',
    'Satisfaction_score', 'Sales'
]
fig = make_subplots(rows=5, cols=1, subplot_titles=categorical_columns)
row, col = 1, 1
for column in categorical_columns:
    helper_funtions.dataviz_structure_categorical(column, fig, df, row, col)
    row += 1
fig.update_layout(height=800, width=750, title_text="Variables categóricas")
fig.show()

Observaciones:
1. La distribución de ubicaciones es homogenea.
2. No parece existir un tipo de producto preferido por los clientes.
3. El canal de preferencia de contacto del cliente es el teléfono.
4. Los niveles de satisfacción son bajos a nivel de muestra.
5. la efectividad de ventas es baja (variable desvalanceada).

### Numerical data

In [19]:
numerical_columns_histogram = [
    'Previous_sales', 'Contact_hour', 'Num_contacts'
]

fig = make_subplots(rows=3, cols=1)
row, col = 1, 1
for column in numerical_columns_histogram:
    helper_funtions.dataviz_structure_numerical_hist(column, fig, df, row, col)
    row += 1
fig.show()

ValueError: 
        If specified, the cols parameter must be a list or tuple of integers
        of length 1 (The number of traces being added)

        Received: ['Previous_sales']
        

In [None]:
box = ['Age', 'Income', 'TAX', 'Previous_sales_#']

Observaciones:
1. fsd
2. ghj

# Otro

In [None]:
# Determine peak times by rounding the time in groups of one hour 
df['Contact_hour'] = pd.to_datetime(df['Contact_hour'], format='%H:%M:%S')
df['Contact_hour_round'] = df['Contact_hour'].apply(lambda x: x.round('60 min'))

fig = px.histogram(df, x='Contact_hour_round')
fig.show()

df_peak_time = df['Contact_hour_round'].value_counts().head()
df_peak_time = df_peak_time.rename_axis('Hour')
df_peak_time = df_peak_time.reset_index(name='Count')
peak_hours = []
for i in range(len(df_peak_time)):
    peak_hours.append(df_peak_time['Hour'].iloc[i])

for i in range(len(df)):
    if df.loc[i, 'Contact_hour_round'] in peak_hours:
        df.at[i, 'Is_peak_hour'] = 1
    else:
        df.at[i, 'Is_peak_hour'] = 0

del df_peak_time

In [None]:
# Codificar
# df['Contact_channel']
# df['Type_products']

In [None]:
# imputaciones
# df.query("Age > 70")
# df.query("Previous_sales < 0")
len(df.query("Num_contacts > 5"))


In [None]:
df['Previous_sales'].value_counts()