In [1]:
import pandas as pd

from features import analytical_tools
from utils import helper_funtions

In [2]:
# Show every column when a DataFrame is rendered instead of eliding the middle ones with "..."
pd.set_option('display.max_columns', None)

In [3]:
# Base location of the project files, as returned by the project helper.
# NOTE(review): later cells build file names with `PATH + 'data/...'`, so this
# presumably ends with a path separator — confirm against the helper.
PATH = helper_funtions.get_path()

# Read data

In [4]:
# Load the cleaned dataset produced by the previous stage
cleaned_csv = PATH + 'data/data_cleaning.csv'
df = pd.read_csv(cleaned_csv)

# Preview five random rows (sample(n=5) already returns exactly five)
df.sample(n=5)

Unnamed: 0,UUID_client,Age,Location,Income,TAX,Previous_sales,Type_products,Contact_channel,Contact_hour,Num_contacts,Satisfaction_score,Sales
140,c26dd0f0-5088-5a0c-8d7c-2cd7b0aba927,50,TX,90576.0,13586.4,1,B,Phone,15:00:00,3,4,1
814,761e9fe7-330a-5116-b563-a821a974e330,28,IL,56390.0,8458.5,0,C,Online Chat,12:00:00,2,5,0
181,2bc29c52-441e-5305-9e90-f167c773a86a,32,TX,56987.0,8548.05,2,C,Online Chat,16:00:00,4,3,1
793,27b23f36-43cb-5b24-a4b3-ec516e45e8f7,53,TX,40983.0,6147.45,6,B,Phone,12:00:00,7,2,0
571,9d5234b1-e7fe-5b55-8721-f232ffcaa30b,54,FL,48037.0,7205.55,1,C,Phone,14:00:00,4,3,1


# Preprocessing

In [5]:
# Flag the contacts that happened during the busiest hours of the day.
N_PEAK_HOURS = 5  # number of most frequent (rounded) hours treated as "peak"

# Parse the time-of-day strings and round each one to the nearest full hour.
df['Contact_hour'] = pd.to_datetime(df['Contact_hour'], format='%H:%M:%S')
df['Contact_hour_round'] = df['Contact_hour'].dt.round('60 min')

# value_counts() orders the hours by frequency (descending), so the first
# N_PEAK_HOURS index entries are the most common contact hours.
peak_hours = (
    df['Contact_hour_round']
    .value_counts()
    .head(N_PEAK_HOURS)
    .index
    .tolist()
)

# Vectorised membership test: it does not depend on the frame's index labels
# and produces an integer 0/1 flag for every row in one pass.
df['Is_peak_hour'] = df['Contact_hour_round'].isin(peak_hours).astype(int)

In [6]:
# Compute the tax as a percentage of income, rounded to the nearest whole percent
df['TAX_percent'] = (df['TAX'] * 100 / df['Income']).round()

# Distribution of the resulting tax rates
df['TAX_percent'].value_counts()

15.0    996
Name: TAX_percent, dtype: int64

# Simple imputation

In [7]:
# Simple mean imputation for implausible ages (Age > 70).
# NOTE(review): the reference mean is taken over Age < 70 while the outlier
# test is Age > 70, so rows with Age == 70 are neither averaged nor replaced —
# confirm which side of the boundary is intended.
age_is_outlier = df['Age'] > 70
mean_age = df.loc[df['Age'] < 70, 'Age'].mean()

# A boolean mask replaces every outlier (not just the first one) and is a no-op
# when there are none, which keeps the cell safe to re-run.
if age_is_outlier.any():
    # Round the fill value for integer columns so the result does not depend on
    # how the installed pandas version treats a float written into an int column.
    if pd.api.types.is_integer_dtype(df['Age']):
        age_fill = int(round(mean_age))
    else:
        age_fill = mean_age
    df.loc[age_is_outlier, 'Age'] = age_fill

In [8]:
# Simple mean imputation for invalid (negative) Previous_sales values.
# NOTE(review): the reference mean is taken over Previous_sales > 0, which
# leaves out clients with zero previous sales — confirm that this is intended.
sales_is_invalid = df['Previous_sales'] < 0
mean_previous_sales = df.loc[df['Previous_sales'] > 0, 'Previous_sales'].mean()

# One masked assignment replaces all invalid rows at once; the mask is computed
# a single time instead of re-filtering the frame for every row.
if sales_is_invalid.any():
    # Round the fill value for integer columns so the result does not depend on
    # how the installed pandas version treats a float written into an int column.
    if pd.api.types.is_integer_dtype(df['Previous_sales']):
        sales_fill = int(round(mean_previous_sales))
    else:
        sales_fill = mean_previous_sales
    df.loc[sales_is_invalid, 'Previous_sales'] = sales_fill

In [9]:
# Number of clients that were contacted more than five times
df.query("Num_contacts > 5").shape[0]

181

# Label encoders

In [None]:
# Categorical source column -> name of the column that will hold its encoded values
labels_to_encoders_name = {
    'Contact_channel': 'Contact_channel_ENC',
    'Type_products': 'Type_products_ENC',
}

# Encode each categorical column with the project helper.
# NOTE(review): the helper's return value is ignored, so it presumably adds the
# encoded column to `df` in place — confirm against analytical_tools.
for source_column in labels_to_encoders_name:
    encoded_column = labels_to_encoders_name[source_column]
    analytical_tools.label_encoder(source_column, encoded_column, df)

In [13]:
# Show each categorical column next to its encoded counterpart
encoded_preview_columns = [
    'Contact_channel', 'Contact_channel_ENC',
    'Type_products', 'Type_products_ENC',
]
df[encoded_preview_columns].head()

Unnamed: 0,Contact_channel_ENC,Contact_channel
0,0,Email
1,2,Phone
2,1,Online Chat
3,0,Email
4,0,Email


# Save

In [None]:
# Saving is currently disabled: uncomment the line below to write the
# preprocessed frame to disk (without the index, UTF-8 encoded).
# df.to_csv(PATH + 'data/data_preprocesing.csv', index=False, encoding='utf-8')

In [None]:
# Preview five random rows of the preprocessed frame
df.sample(n=5)