# Importação de depedências

In [2]:
import pandas as pd
from typing import Optional

In [4]:
import hashlib
import uuid


# Préprocessamento

In [8]:
ds =  pd.read_csv("datsetPeqsuisaFinal.csv", encoding='utf-8',sep=",")

FileNotFoundError: [Errno 2] No such file or directory: 'datsetPeqsuisaFinal.csv'

Anonimização dos dados

In [None]:

class GerenciadorAnonimizacao:
    def __init__(self, email: Optional[str] = None):
        """
        Inicializa o gerenciador de anonimização com um e-mail opcional.
        """
        self.email = email

    def set_email(self, email: str):
        """
        Define o e-mail a ser anonimizado.
        """
        self.email = email

    def anonimizar_uuid(self) -> Optional[str]:
        """
        Anonimiza o e-mail usando UUID (namespace DNS).
        """
        if self.email and pd.notnull(self.email):
            return str(uuid.uuid5(uuid.NAMESPACE_DNS, self.email))
        return None

    def anonimizar_sha(self) -> Optional[str]:
        """
        Anonimiza o e-mail usando SHA-256.
        """
        if self.email and pd.notnull(self.email):
            return hashlib.sha256(self.email.encode('utf-8')).hexdigest()
        return None

    @staticmethod
    def anonimizar_email_legivel(email: str) -> Optional[str]:
        """
        Anonimiza o e-mail de forma legível, preservando o domínio.
        """
        try:
            local, domain = email.split('@')
            anonymized_local = (
                local[0] + '*' * (len(local) - 2) + local[-1] if len(local) > 2 else '*' * len(local)
            )
            return anonymized_local + '@' + domain
        except:
            return email  # Retorna o dado original se não for um e-mail válido


In [None]:
def aplicarSha(email):
    gerenciador = GerenciadorAnonimizacao(email=email)
    return gerenciador.anonimizar_sha()

def aplicarUuid(email):
    gerenciador = GerenciadorAnonimizacao(email=email)
    return gerenciador.anonimizar_uuid()

def aplicarAnonimizacaoLegivel(email):
    return GerenciadorAnonimizacao.anonimizar_email_legivel(email)
    
    

In [None]:
# Aplicar anonimização à coluna de e-mail
opcaoAnonimizacao = "UUID"
if 'Endereço de e-mail' in ds.columns:
    match opcaoAnonimizacao:
        case "UUID":
            ds['Endereço de e-mail'] = ds['Endereço de e-mail'].apply(aplicarUuid)

        case "SHA256":
            ds['Endereço de e-mail'] = ds['Endereço de e-mail'].apply(aplicarSha)
            
        case "LEGIVEL":
            ds['Endereço de e-mail'] =  ds['Endereço de e-mail'].apply(aplicarAnonimizacaoLegivel)
            


In [None]:
ds

In [None]:
ds.to_csv("datsetPeqsuisaAnonimizado.csv")

# Importação de datset Anonimizado

In [6]:
dsAnonimizado = pd.read_csv("datsetPeqsuisaAnonimizado.csv")

In [8]:
dsAnonimizado.drop(columns=['Unnamed: 0'], inplace=True)

In [9]:
dsAnonimizado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 51 columns):
 #   Column                                                                                                                                                                                                  Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                  --------------  -----  
 0   Carimbo de data/hora                                                                                                                                                                                    57 non-null     object 
 1   Endereço de e-mail                                                                                                                                                                                      57 no

In [10]:
dsAnonimizado.keys()

Index(['Carimbo de data/hora', 'Endereço de e-mail', 'Qual é a sua idade?',
       'Gênero', 'Quais plataformas de mídia social você costuma usar?',
       'Cidade onde Mora', 'Sexo', 'Está empregado',
       'Já foi diagnosticado por um profissional com algum dos itens: [Nenhum]',
       'Já foi diagnosticado por um profissional com algum dos itens: [Depressão]',
       'Já foi diagnosticado por um profissional com algum dos itens: [Ansiedade]',
       'Já foi diagnosticado por um profissional com algum dos itens: [Estresse]',
       'Já foi diagnosticado por um profissional com algum dos itens: [TDAH]',
       'Já foi diagnosticado por um profissional com algum dos itens: [Transtorno Bipolar]',
       'Já foi diagnosticado por um profissional com algum dos itens: [Transtorno de ansiedade social :]',
       'Já foi diagnosticado por um profissional com algum dos itens: [Fobia de Perda de Conexão ( Ansiedade relacionada à ausência de acesso ao celular ou redes sociais, causando angústi

In [12]:
representacao = {titulo: idx for idx, titulo in enumerate(dsAnonimizado.keys())}

In [13]:
representacao 

{'Carimbo de data/hora': 0,
 'Endereço de e-mail': 1,
 'Qual é a sua idade?': 2,
 'Gênero': 3,
 'Quais plataformas de mídia social você costuma usar?': 4,
 'Cidade onde Mora': 5,
 'Sexo': 6,
 'Está empregado': 7,
 'Já foi diagnosticado por um profissional com algum dos itens: [Nenhum]': 8,
 'Já foi diagnosticado por um profissional com algum dos itens: [Depressão]': 9,
 'Já foi diagnosticado por um profissional com algum dos itens: [Ansiedade]': 10,
 'Já foi diagnosticado por um profissional com algum dos itens: [Estresse]': 11,
 'Já foi diagnosticado por um profissional com algum dos itens: [TDAH]': 12,
 'Já foi diagnosticado por um profissional com algum dos itens: [Transtorno Bipolar]': 13,
 'Já foi diagnosticado por um profissional com algum dos itens: [Transtorno de ansiedade social :]': 14,
 'Já foi diagnosticado por um profissional com algum dos itens: [Fobia de Perda de Conexão ( Ansiedade relacionada à ausência de acesso ao celular ou redes sociais, causando angústia e desconf

In [14]:
df = dsAnonimizado.copy()

In [16]:
df.columns = range(len(df.columns))

In [18]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,16/11/2024 08:50:05,2903def3-aa2f-5bf3-8d43-f9396e259be8,41,Masculino,"Linkedin, Whatsapp",Niterói,Cisgênero (é o sexo que nasceu),Sim,Coluna 1,,...,,,,,,,,,,
1,16/11/2024 08:56:56,52101e6a-ca49-5844-a546-155aa08bf771,33,Masculino,"Instagram, Facebook, Youtube, Linkedin, TikTok...",São Gonçalo,Cisgênero (é o sexo que nasceu),Sim,Coluna 1,,...,,,,,,,,,,
2,16/11/2024 08:58:00,0135da91-4288-5475-8a7d-c06ac32e157d,51,Masculino,Whatsapp,Niterói,Cisgênero (é o sexo que nasceu),Sim,,,...,,,,,,,,,,
3,16/11/2024 08:58:23,ef094ef4-c311-52ab-994b-a5e2943c8c04,37,Masculino,"Instagram, Facebook, Youtube, Linkedin, Whatsapp",Rio de Janeiro,Cisgênero (é o sexo que nasceu),Sim,,,...,,,,,,,,,,
4,16/11/2024 09:04:14,4fe3cb6d-8d9f-5fdd-87fa-13481495a0c4,30,Masculino,"Instagram, Facebook, Twitter (X), Whatsapp",São Gonçalo,Cisgênero (é o sexo que nasceu),Sim,Coluna 1,,...,,,,,,,,,,
5,16/11/2024 09:08:52,8f30b1fa-511d-5044-948d-268c86cb55df,36,Masculino,"Instagram, Facebook, Youtube, Whatsapp",Itaperuna rj,,Sim,,,...,,,,,,,,,,
6,16/11/2024 09:12:53,844ab66e-8e20-57df-92c1-2ecb27388676,37,Feminino,"Instagram, Facebook, Whatsapp",RJ,Cisgênero (é o sexo que nasceu),Não,,,...,,,,,,,,,,
7,16/11/2024 09:19:18,da896799-9ea2-5e91-ad37-3a9bf6a5eea9,34,Masculino,"Instagram, Facebook, Youtube, Linkedin, Whatsapp",Sorocaba-SP,Cisgênero (é o sexo que nasceu),Sim,Coluna 1,,...,,,,,,,,,,
8,16/11/2024 09:21:15,76f728ad-95de-5906-81cf-682fcbf25c03,33,Masculino,"Instagram, Youtube, Whatsapp",Niterói,Cisgênero (é o sexo que nasceu),Sim,Coluna 1,,...,,,,,,,,,,
9,16/11/2024 09:25:20,3f123bb8-6b37-58e8-bccd-035231c8a49c,44,Masculino,"Instagram, Youtube, Whatsapp",Niterói,Cisgênero (é o sexo que nasceu),Sim,Coluna 1,,...,,,,,,,,,,


In [21]:
colunas_para_substituir = [8,9,10,11,12,13,14,15,16,17,18,40,41,42,43,44,45,46,47,48,49,50]

In [23]:
df[colunas_para_substituir]=df[colunas_para_substituir].replace("Coluna 1",1)
df[colunas_para_substituir]= df[colunas_para_substituir].fillna(0)

  df[colunas_para_substituir]=df[colunas_para_substituir].replace("Coluna 1",1)


In [24]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,16/11/2024 08:50:05,2903def3-aa2f-5bf3-8d43-f9396e259be8,41,Masculino,"Linkedin, Whatsapp",Niterói,Cisgênero (é o sexo que nasceu),Sim,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16/11/2024 08:56:56,52101e6a-ca49-5844-a546-155aa08bf771,33,Masculino,"Instagram, Facebook, Youtube, Linkedin, TikTok...",São Gonçalo,Cisgênero (é o sexo que nasceu),Sim,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,16/11/2024 08:58:00,0135da91-4288-5475-8a7d-c06ac32e157d,51,Masculino,Whatsapp,Niterói,Cisgênero (é o sexo que nasceu),Sim,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16/11/2024 08:58:23,ef094ef4-c311-52ab-994b-a5e2943c8c04,37,Masculino,"Instagram, Facebook, Youtube, Linkedin, Whatsapp",Rio de Janeiro,Cisgênero (é o sexo que nasceu),Sim,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16/11/2024 09:04:14,4fe3cb6d-8d9f-5fdd-87fa-13481495a0c4,30,Masculino,"Instagram, Facebook, Twitter (X), Whatsapp",São Gonçalo,Cisgênero (é o sexo que nasceu),Sim,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,16/11/2024 09:08:52,8f30b1fa-511d-5044-948d-268c86cb55df,36,Masculino,"Instagram, Facebook, Youtube, Whatsapp",Itaperuna rj,,Sim,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,16/11/2024 09:12:53,844ab66e-8e20-57df-92c1-2ecb27388676,37,Feminino,"Instagram, Facebook, Whatsapp",RJ,Cisgênero (é o sexo que nasceu),Não,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,16/11/2024 09:19:18,da896799-9ea2-5e91-ad37-3a9bf6a5eea9,34,Masculino,"Instagram, Facebook, Youtube, Linkedin, Whatsapp",Sorocaba-SP,Cisgênero (é o sexo que nasceu),Sim,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,16/11/2024 09:21:15,76f728ad-95de-5906-81cf-682fcbf25c03,33,Masculino,"Instagram, Youtube, Whatsapp",Niterói,Cisgênero (é o sexo que nasceu),Sim,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,16/11/2024 09:25:20,3f123bb8-6b37-58e8-bccd-035231c8a49c,44,Masculino,"Instagram, Youtube, Whatsapp",Niterói,Cisgênero (é o sexo que nasceu),Sim,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Imputação de valores a dados faltantes

In [27]:
#substitui idade de sexo feminino pela mediana das idades
df.loc[50,2] = 51


In [28]:
df.groupby([3,2]).size().reset_index(name='Qauntidade')

Unnamed: 0,3,2,Qauntidade
0,Feminino,19,1
1,Feminino,24,1
2,Feminino,33,1
3,Feminino,34,1
4,Feminino,36,1
5,Feminino,37,1
6,Feminino,39,1
7,Feminino,40,2
8,Feminino,41,1
9,Feminino,42,1


In [29]:
df[2] = pd.to_numeric(df[2],errors='coerce')

In [31]:
df[df[3]=='Feminino'][2].mean()

39.625

In [32]:
df.loc[39,2] = 40 

In [33]:
valor_especifico = 'Youtube, TikTok, Whatsapp'

In [36]:
filtro = df[df[4] == valor_especifico]

In [37]:
filtro 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
39,16/11/2024 18:37:09,24c588c8-7897-5fef-b675-8921af1aab92,40.0,Feminino,"Youtube, TikTok, Whatsapp",,Cisgênero (é o sexo que nasceu),Sim,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
df.mode()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,16/11/2024 08:50:05,0135da91-4288-5475-8a7d-c06ac32e157d,33.0,Masculino,"Instagram, Youtube, Whatsapp",Niterói,Cisgênero (é o sexo que nasceu),Sim,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16/11/2024 08:56:56,020191c8-4c07-5bea-b769-f8f37cc30497,,,,,,,,,...,,,,,,,,,,
2,16/11/2024 08:58:00,040ce340-fde4-50f0-95d6-80a524b8adb7,,,,,,,,,...,,,,,,,,,,
3,16/11/2024 08:58:23,08d8653a-5fa3-55b5-a9cb-f658bbc46463,,,,,,,,,...,,,,,,,,,,
4,16/11/2024 09:04:14,0934bf0f-83a2-5ce7-bdd6-8f6b596bb5ef,,,,,,,,,...,,,,,,,,,,
5,16/11/2024 09:08:52,1982ad6f-0db0-5ae3-8090-1fe0b82d7e21,,,,,,,,,...,,,,,,,,,,
6,16/11/2024 09:12:53,1c6101fe-3808-53d7-a804-c91dcc7e32d8,,,,,,,,,...,,,,,,,,,,
7,16/11/2024 09:19:18,24c588c8-7897-5fef-b675-8921af1aab92,,,,,,,,,...,,,,,,,,,,
8,16/11/2024 09:21:15,26644736-c2d9-52c6-8e2b-c32fbcec3412,,,,,,,,,...,,,,,,,,,,
9,16/11/2024 09:25:20,28f1561e-9c1b-5fae-a52d-1173b3285da2,,,,,,,,,...,,,,,,,,,,


In [44]:
def substituiMediana(ds,coluna):
    mediana = df[coluna].median()
    ds[coluna].fillna(mediana, inplace=True)
    
def substituiModa(ds,coluna):
    moda = df[coluna].mode()
    ds[coluna].fillna(moda, inplace=True)

In [47]:
df[6] = df[6].fillna('Cisgênero (é o sexo que nasceu)')

In [50]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,16/11/2024 08:50:05,2903def3-aa2f-5bf3-8d43-f9396e259be8,41.0,Masculino,"Linkedin, Whatsapp",Niterói,Cisgênero (é o sexo que nasceu),Sim,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16/11/2024 08:56:56,52101e6a-ca49-5844-a546-155aa08bf771,33.0,Masculino,"Instagram, Facebook, Youtube, Linkedin, TikTok...",São Gonçalo,Cisgênero (é o sexo que nasceu),Sim,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,16/11/2024 08:58:00,0135da91-4288-5475-8a7d-c06ac32e157d,51.0,Masculino,Whatsapp,Niterói,Cisgênero (é o sexo que nasceu),Sim,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16/11/2024 08:58:23,ef094ef4-c311-52ab-994b-a5e2943c8c04,37.0,Masculino,"Instagram, Facebook, Youtube, Linkedin, Whatsapp",Rio de Janeiro,Cisgênero (é o sexo que nasceu),Sim,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16/11/2024 09:04:14,4fe3cb6d-8d9f-5fdd-87fa-13481495a0c4,30.0,Masculino,"Instagram, Facebook, Twitter (X), Whatsapp",São Gonçalo,Cisgênero (é o sexo que nasceu),Sim,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,16/11/2024 09:08:52,8f30b1fa-511d-5044-948d-268c86cb55df,36.0,Masculino,"Instagram, Facebook, Youtube, Whatsapp",Itaperuna rj,Cisgênero (é o sexo que nasceu),Sim,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,16/11/2024 09:12:53,844ab66e-8e20-57df-92c1-2ecb27388676,37.0,Feminino,"Instagram, Facebook, Whatsapp",RJ,Cisgênero (é o sexo que nasceu),Não,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,16/11/2024 09:19:18,da896799-9ea2-5e91-ad37-3a9bf6a5eea9,34.0,Masculino,"Instagram, Facebook, Youtube, Linkedin, Whatsapp",Sorocaba-SP,Cisgênero (é o sexo que nasceu),Sim,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,16/11/2024 09:21:15,76f728ad-95de-5906-81cf-682fcbf25c03,33.0,Masculino,"Instagram, Youtube, Whatsapp",Niterói,Cisgênero (é o sexo que nasceu),Sim,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,16/11/2024 09:25:20,3f123bb8-6b37-58e8-bccd-035231c8a49c,44.0,Masculino,"Instagram, Youtube, Whatsapp",Niterói,Cisgênero (é o sexo que nasceu),Sim,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 51 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       57 non-null     object 
 1   1       57 non-null     object 
 2   2       57 non-null     float64
 3   3       57 non-null     object 
 4   4       57 non-null     object 
 5   5       55 non-null     object 
 6   6       57 non-null     object 
 7   7       57 non-null     object 
 8   8       57 non-null     float64
 9   9       57 non-null     float64
 10  10      57 non-null     float64
 11  11      57 non-null     float64
 12  12      57 non-null     float64
 13  13      57 non-null     float64
 14  14      57 non-null     float64
 15  15      57 non-null     float64
 16  16      57 non-null     float64
 17  17      57 non-null     float64
 18  18      57 non-null     float64
 19  19      55 non-null     object 
 20  20      56 non-null     object 
 21  21      52 non-null     object 
 22  22  

In [55]:
representacao 

{'Carimbo de data/hora': 0,
 'Endereço de e-mail': 1,
 'Qual é a sua idade?': 2,
 'Gênero': 3,
 'Quais plataformas de mídia social você costuma usar?': 4,
 'Cidade onde Mora': 5,
 'Sexo': 6,
 'Está empregado': 7,
 'Já foi diagnosticado por um profissional com algum dos itens: [Nenhum]': 8,
 'Já foi diagnosticado por um profissional com algum dos itens: [Depressão]': 9,
 'Já foi diagnosticado por um profissional com algum dos itens: [Ansiedade]': 10,
 'Já foi diagnosticado por um profissional com algum dos itens: [Estresse]': 11,
 'Já foi diagnosticado por um profissional com algum dos itens: [TDAH]': 12,
 'Já foi diagnosticado por um profissional com algum dos itens: [Transtorno Bipolar]': 13,
 'Já foi diagnosticado por um profissional com algum dos itens: [Transtorno de ansiedade social :]': 14,
 'Já foi diagnosticado por um profissional com algum dos itens: [Fobia de Perda de Conexão ( Ansiedade relacionada à ausência de acesso ao celular ou redes sociais, causando angústia e desconf

In [59]:
colunas = list(range(8,19))

In [61]:
colunas.insert(0,0)
colunas

[0, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]

In [63]:
dsAnterior = df[colunas]
dsAnterior

Unnamed: 0,0,8,9,10,11,12,13,14,15,16,17,18
0,16/11/2024 08:50:05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16/11/2024 08:56:56,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,16/11/2024 08:58:00,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16/11/2024 08:58:23,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16/11/2024 09:04:14,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,16/11/2024 09:08:52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,16/11/2024 09:12:53,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,16/11/2024 09:19:18,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,16/11/2024 09:21:15,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,16/11/2024 09:25:20,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
colunas = list(range(40,51))

In [67]:
colunas.insert(0,0)
colunas

[0, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]

In [69]:
dsPosterior = df[colunas]
dsPosterior

Unnamed: 0,0,40,41,42,43,44,45,46,47,48,49,50
0,16/11/2024 08:50:05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16/11/2024 08:56:56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,16/11/2024 08:58:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16/11/2024 08:58:23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16/11/2024 09:04:14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,16/11/2024 09:08:52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,16/11/2024 09:12:53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,16/11/2024 09:19:18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,16/11/2024 09:21:15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,16/11/2024 09:25:20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
dfCensurado = pd.merge(dsAnterior,dsPosterior, on=0, how='inner')
dfCensurado

Unnamed: 0,0,8,9,10,11,12,13,14,15,16,...,41,42,43,44,45,46,47,48,49,50
0,16/11/2024 08:50:05,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16/11/2024 08:56:56,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,16/11/2024 08:58:00,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16/11/2024 08:58:23,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16/11/2024 09:04:14,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,16/11/2024 09:08:52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,16/11/2024 09:12:53,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,16/11/2024 09:19:18,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,16/11/2024 09:21:15,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,16/11/2024 09:25:20,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [105]:
# Converter apenas as colunas com tipo float para bool
for col in dfCensurado.select_dtypes(include=['float']).columns:
    dfCensurado[col] = dfCensurado[col].astype(bool)

for col in dsAnterior.select_dtypes(include=['float']).columns:
    dsAnterior[col] = dsAnterior[col].astype(bool)

for col in dsPosterior.select_dtypes(include=['float']).columns:
    dsPosterior[col] = dsPosterior[col].astype(bool)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsAnterior[col] = dsAnterior[col].astype(bool)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsAnterior[col] = dsAnterior[col].astype(bool)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsAnterior[col] = dsAnterior[col].astype(bool)
A value is trying to be set on a copy of a slice from a DataFram

In [153]:
dfCensurado[50]

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35    False
36    False
37    False
38    False
39    False
40    False
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49    False
50    False
51    False
52    False
53    False
54    False
55    False
56    False
Name: 50, dtype: bool

In [107]:
dfCensurado

Unnamed: 0,0,8,9,10,11,12,13,14,15,16,...,41,42,43,44,45,46,47,48,49,50
0,16/11/2024 08:50:05,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,16/11/2024 08:56:56,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,16/11/2024 08:58:00,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,16/11/2024 08:58:23,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,16/11/2024 09:04:14,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,16/11/2024 09:08:52,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,16/11/2024 09:12:53,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,16/11/2024 09:19:18,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,16/11/2024 09:21:15,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,16/11/2024 09:25:20,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [109]:
 dfCensurado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       57 non-null     object
 1   8       57 non-null     bool  
 2   9       57 non-null     bool  
 3   10      57 non-null     bool  
 4   11      57 non-null     bool  
 5   12      57 non-null     bool  
 6   13      57 non-null     bool  
 7   14      57 non-null     bool  
 8   15      57 non-null     bool  
 9   16      57 non-null     bool  
 10  17      57 non-null     bool  
 11  18      57 non-null     bool  
 12  40      57 non-null     bool  
 13  41      57 non-null     bool  
 14  42      57 non-null     bool  
 15  43      57 non-null     bool  
 16  44      57 non-null     bool  
 17  45      57 non-null     bool  
 18  46      57 non-null     bool  
 19  47      57 non-null     bool  
 20  48      57 non-null     bool  
 21  49      57 non-null     bool  
 22  50      57 non-null     bool

In [123]:
dfCensurado_novo = pd.DataFrame()

In [125]:
for col1, col2 in zip(dsAnterior.drop(0,axis=1), dsPosterior.drop(0,axis=1)):
    print(col1, col2)
    dfCensurado_novo[f'Fusao_{col1}_{col2}'] = dsAnterior[col1] | dsPosterior[col2]

8 40
9 41
10 42
11 43
12 44
13 45
14 46
15 47
16 48
17 49
18 50


In [127]:
dfCensurado_novo

Unnamed: 0,Fusao_8_40,Fusao_9_41,Fusao_10_42,Fusao_11_43,Fusao_12_44,Fusao_13_45,Fusao_14_46,Fusao_15_47,Fusao_16_48,Fusao_17_49,Fusao_18_50
0,True,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False
6,False,False,True,False,False,False,False,False,False,False,False
7,True,False,False,False,False,False,False,False,False,False,False
8,True,False,False,False,False,False,False,False,False,False,False
9,True,False,False,False,False,False,False,False,False,False,False


In [137]:
dfCensurado_novo.sum()

Fusao_8_40     35
Fusao_9_41     12
Fusao_10_42    14
Fusao_11_43     9
Fusao_12_44     2
Fusao_13_45     2
Fusao_14_46     1
Fusao_15_47     1
Fusao_16_48     2
Fusao_17_49     1
Fusao_18_50     0
dtype: int64

In [139]:
representacao 

{'Carimbo de data/hora': 0,
 'Endereço de e-mail': 1,
 'Qual é a sua idade?': 2,
 'Gênero': 3,
 'Quais plataformas de mídia social você costuma usar?': 4,
 'Cidade onde Mora': 5,
 'Sexo': 6,
 'Está empregado': 7,
 'Já foi diagnosticado por um profissional com algum dos itens: [Nenhum]': 8,
 'Já foi diagnosticado por um profissional com algum dos itens: [Depressão]': 9,
 'Já foi diagnosticado por um profissional com algum dos itens: [Ansiedade]': 10,
 'Já foi diagnosticado por um profissional com algum dos itens: [Estresse]': 11,
 'Já foi diagnosticado por um profissional com algum dos itens: [TDAH]': 12,
 'Já foi diagnosticado por um profissional com algum dos itens: [Transtorno Bipolar]': 13,
 'Já foi diagnosticado por um profissional com algum dos itens: [Transtorno de ansiedade social :]': 14,
 'Já foi diagnosticado por um profissional com algum dos itens: [Fobia de Perda de Conexão ( Ansiedade relacionada à ausência de acesso ao celular ou redes sociais, causando angústia e desconf

In [141]:
representacao 

{'Carimbo de data/hora': 0,
 'Endereço de e-mail': 1,
 'Qual é a sua idade?': 2,
 'Gênero': 3,
 'Quais plataformas de mídia social você costuma usar?': 4,
 'Cidade onde Mora': 5,
 'Sexo': 6,
 'Está empregado': 7,
 'Já foi diagnosticado por um profissional com algum dos itens: [Nenhum]': 8,
 'Já foi diagnosticado por um profissional com algum dos itens: [Depressão]': 9,
 'Já foi diagnosticado por um profissional com algum dos itens: [Ansiedade]': 10,
 'Já foi diagnosticado por um profissional com algum dos itens: [Estresse]': 11,
 'Já foi diagnosticado por um profissional com algum dos itens: [TDAH]': 12,
 'Já foi diagnosticado por um profissional com algum dos itens: [Transtorno Bipolar]': 13,
 'Já foi diagnosticado por um profissional com algum dos itens: [Transtorno de ansiedade social :]': 14,
 'Já foi diagnosticado por um profissional com algum dos itens: [Fobia de Perda de Conexão ( Ansiedade relacionada à ausência de acesso ao celular ou redes sociais, causando angústia e desconf