In [343]:
import numpy as np
import pandas as pd
from datetime import datetime

In [344]:
rd_station_profile = pd.read_csv("../data/rd_station_profile")
info_student_unimestre = pd.read_csv("../data/info_student_unimestre.csv")
first_cd_turma_student = pd.read_csv("../data/first_cd_turma_for_student_unimestre.csv")
dt_cadastro_student = pd.read_csv("../data/dt_cadastro_student_unimestre.csv")

Trying to get less null values on the min_dt_log

In [345]:
info_student_unimestre.loc[:, 'min_dt_log'] = (
    info_student_unimestre['min_dt_log']
    .fillna(first_cd_turma_student['min_date'])
    .fillna(dt_cadastro_student['dt_cadastro'])
)

With this I was able to reduce 
- *min_dt_log* had 9163 null values now 209

Creating a df to be only one row per ds_email (to avoid the duplicates values checked in the exploring file)

In [346]:
info_student_unimestre_unique = info_student_unimestre.groupby('ds_email').first().reset_index()

### Doing the join again with the new df

In [347]:
merge_left = rd_station_profile.merge(info_student_unimestre_unique, left_on='email', right_on='ds_email', how='left')

In [348]:
merge_left = merge_left.drop(columns='Unnamed: 0')

Creating a new column saying it's a student when a cd_pessoa is found

In [349]:
merge_left['aluno'] = (merge_left['cd_pessoa'].notnull()).astype(int)
merge_left['aluno'].value_counts()


aluno
0    37278
1     4062
Name: count, dtype: int64

## Got different columns with representing the same thing, compiling all in one new columns

Represents city of the person: 'ds_cidade', 'Cidade Final'

In [350]:
merge_left.loc[:, 'cidade'] = (
    merge_left['ds_cidade']
    .fillna(merge_left['cidade_final'])
)

Represents city of the person: 'ds_cidade', 'Cidade Final'

In [351]:
merge_left.loc[:, 'data_nascimento'] = (
    merge_left['dt_nascimento']
    .fillna(merge_left['data_de_nascimento'])
)

With this I was able to reduce 
- *Cidade* had 35856 null values now 33986
- *Data Nascimento* had 38346 null values now 36117

Dropping columns with replicate values

In [352]:
merge_left = merge_left.drop(columns=['ds_cidade', 'cidade_final', 'dt_nascimento', 'data_de_nascimento'])

All not null

In [353]:
merge_left[merge_left.notna().all(axis=1)]

Unnamed: 0,email,lead_scoring_-_perfil,url_pública,estágio_no_funil,total_de_conversões,lead_scoring_-_interesse,status_para_comunicação_por_email,data_da_primeira_conversão,eventos_(últimos_100),origem_da_última_conversão,...,qual_o_curso_de_interesse?,cargo_final,area_atuacao,interesse_final,ds_email,cd_pessoa,min_dt_log,aluno,cidade,data_nascimento
5431,carolineklock@hotmail.com,c,http://app.rdstation.com.br/leads/public/8717b...,Lead Qualificado,16,120,True,2015-09-01 19:17:37 -0300,jornada-assessment-instrumentos-para-lideranca...,Busca Orgânica | Bing,...,Educação empresarial / Remuneração,Outros Cargos,RH,formulario-de-pre-inscricao,carolineklock@hotmail.com,128609.0,2018-04-05 19:53:30,1,Joinville,1985-09-21 00:00:00
15881,stella.bousfield@gmail.com,c,http://app.rdstation.com.br/leads/public/1b271...,Cliente,27,170,True,2015-12-08 10:18:55 -0200,webinar-storytelling / indicado-sarau-de-negoc...,Desconhecido,...,Responsabilidade social,Outros Cargos,Marketing,webinar-storytelling,stella.bousfield@gmail.com,160000017.0,2020-02-27 17:28:20,1,Joinville,1977-06-02 00:00:00


### Change dtype for columns that are date

Just changing - Work but can't use after to do one - other


# Final of these

In [354]:
def convert_columns_to_datetime(df):
    """
    Convert specified columns in a DataFrame to datetime format.

    Parameters:
    - df: DataFrame
        The input DataFrame.
    - date_columns: list
        List of column names to convert to datetime.

    Returns:
    - DataFrame
        The DataFrame with specified columns converted to datetime.
    """

    date_columns = ['data_da_primeira_conversão', 'data_da_última_conversão', 'data_da_última_oportunidade', 'data_nascimento', 'min_dt_log']
    
    df[date_columns] = df[date_columns].apply(lambda col: pd.to_datetime(col, errors='coerce'))
    for i in date_columns:
        df[i] = pd.to_datetime(df[i], utc=True).dt.tz_convert(None)
    return df

merge_left = convert_columns_to_datetime(merge_left)
merge_left.dtypes


  df[date_columns] = df[date_columns].apply(lambda col: pd.to_datetime(col, errors='coerce'))
  df[date_columns] = df[date_columns].apply(lambda col: pd.to_datetime(col, errors='coerce'))
  df[date_columns] = df[date_columns].apply(lambda col: pd.to_datetime(col, errors='coerce'))


email                                                                        object
lead_scoring_-_perfil                                                        object
url_pública                                                                  object
estágio_no_funil                                                             object
total_de_conversões                                                           int64
lead_scoring_-_interesse                                                      int64
status_para_comunicação_por_email                                              bool
data_da_primeira_conversão                                           datetime64[ns]
eventos_(últimos_100)                                                        object
origem_da_última_conversão                                                   object
data_da_última_conversão                                             datetime64[ns]
origem_da_primeira_conversão                                                

In [356]:
def calculate_months_since_conversion(df):
    """
    Calculates the number of months since the first conversion based on the 'aluno' column.
    Also drop the rows where 'aluno' = 1 and min_dt_log is null

    Parameters:
    - The input DataFrame containing the necessary columns.

    Returns:
    - A new column representing the number of months since the first conversion.
    """
    # Drop rows based on conditions
    df = df.drop(df[(df['aluno'] == 1) & df['min_dt_log'].isnull()].index)  # Before have 765 rows that it's student and has the min_dt_log nul
    # As it's crucial for the rest of the analysis need to drop these rows 

    # Calculate months since conversion
    df['months_since_conversion'] = np.where(
        df['aluno'] == 1,
        (df['min_dt_log'] - df['data_da_primeira_conversão']).dt.days / 30,
        (datetime.now() - df['data_da_primeira_conversão']).dt.days / 30
    )

    return df

# Apply the combined function to your DataFrame
merge_left = calculate_months_since_conversion(merge_left)

# Verify the changes
merge_left.loc[merge_left['aluno'] == 1, 'min_dt_log'].isnull().sum()


0