# 📌 Extração

In [1]:
# Importação das bibliotecas

import json
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import seaborn as sns


In [2]:
# Download the raw JSON data, falling back to the offline copy when the download fails

url = 'https://raw.githubusercontent.com/brunorm86/challenge2-data-science/refs/heads/main/TelecomX_Data.json'

try:
    # A timeout keeps the cell from hanging forever on a stalled connection
    response = requests.get(url, timeout=30)
    # Turn HTTP error statuses (404, 500, ...) into an exception instead of
    # trying to parse an error page as JSON
    response.raise_for_status()
    json_data = response.json()

# Only network/HTTP failures and invalid JSON trigger the fallback; anything else
# (e.g. KeyboardInterrupt) still propagates instead of being silently swallowed
except (requests.RequestException, ValueError):
    with open('data/TelecomX_Data.json', 'r', encoding='utf-8') as file:
        json_data = json.load(file)


In [3]:
# Show the first record to inspect the structure of the raw data
json_data[0]

{'customerID': '0002-ORFBO',
 'Churn': 'No',
 'customer': {'gender': 'Female',
  'SeniorCitizen': 0,
  'Partner': 'Yes',
  'Dependents': 'Yes',
  'tenure': 9},
 'phone': {'PhoneService': 'Yes', 'MultipleLines': 'No'},
 'internet': {'InternetService': 'DSL',
  'OnlineSecurity': 'No',
  'OnlineBackup': 'Yes',
  'DeviceProtection': 'No',
  'TechSupport': 'Yes',
  'StreamingTV': 'Yes',
  'StreamingMovies': 'No'},
 'account': {'Contract': 'One year',
  'PaperlessBilling': 'Yes',
  'PaymentMethod': 'Mailed check',
  'Charges': {'Monthly': 65.6, 'Total': '593.3'}}}

In [4]:
# Dados "aninhados" no json

# 🔧 Transformação

Desaninhamento dos dados

In [5]:
# Flatten the nested JSON records into a single table with .json_normalize();
# nested keys become dot-separated column names (e.g. 'account.Charges.Total')

df = pd.json_normalize(json_data)

df.head()

Unnamed: 0,customerID,Churn,customer.gender,customer.SeniorCitizen,customer.Partner,customer.Dependents,customer.tenure,phone.PhoneService,phone.MultipleLines,internet.InternetService,...,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies,account.Contract,account.PaperlessBilling,account.PaymentMethod,account.Charges.Monthly,account.Charges.Total
0,0002-ORFBO,No,Female,0,Yes,Yes,9,Yes,No,DSL,...,Yes,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3
1,0003-MKNFE,No,Male,0,No,No,9,Yes,Yes,DSL,...,No,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4
2,0004-TLHLJ,Yes,Male,0,No,No,4,Yes,No,Fiber optic,...,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85
3,0011-IGKFF,Yes,Male,1,Yes,No,13,Yes,No,Fiber optic,...,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85
4,0013-EXCHZ,Yes,Female,1,Yes,No,3,Yes,No,Fiber optic,...,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4


In [6]:
# (rows, columns) of the flattened table
df.shape

(7267, 21)

In [7]:
# Column names, non-null counts and dtypes right after flattening
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7267 entries, 0 to 7266
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customerID                 7267 non-null   object 
 1   Churn                      7267 non-null   object 
 2   customer.gender            7267 non-null   object 
 3   customer.SeniorCitizen     7267 non-null   int64  
 4   customer.Partner           7267 non-null   object 
 5   customer.Dependents        7267 non-null   object 
 6   customer.tenure            7267 non-null   int64  
 7   phone.PhoneService         7267 non-null   object 
 8   phone.MultipleLines        7267 non-null   object 
 9   internet.InternetService   7267 non-null   object 
 10  internet.OnlineSecurity    7267 non-null   object 
 11  internet.OnlineBackup      7267 non-null   object 
 12  internet.DeviceProtection  7267 non-null   object 
 13  internet.TechSupport       7267 non-null   objec

Padronizar os valores das colunas e converter os tipos de dados para formatos mais apropriados

In [9]:
# Distinct Churn labels; the recorded output includes an empty string besides 'No'/'Yes'
df['Churn'].unique()

array(['No', 'Yes', ''], dtype=object)

In [10]:
# Treat blank Churn labels as missing. Only the empty string is replaced, so
# 'Yes'/'No' pass through untouched and no other label can be nulled by accident
# (a dict-based .map() turns every value it does not list into NaN).
df['Churn'] = df['Churn'].replace('', np.nan)

In [11]:
# Distinct values of the senior-citizen flag (0 and 1 in the recorded output)
df['customer.SeniorCitizen'].unique()

array([0, 1])

In [12]:
# Recode the 0/1 senior-citizen flag as 'No'/'Yes'.
# .replace() leaves non-matching values as they are, so re-running this cell is a
# no-op; .map() would turn the already-recoded 'Yes'/'No' strings into NaN on a
# second run.
df['customer.SeniorCitizen'] = df['customer.SeniorCitizen'].replace({0: 'No', 1: 'Yes'})

In [13]:
# Distinct Partner labels
df['customer.Partner'].unique()

array(['Yes', 'No'], dtype=object)

In [14]:
# df['customer.Partner'] = df['customer.Partner'].map({'Yes': True, 'No': False})

In [15]:
# Distinct Dependents labels
df['customer.Dependents'].unique()

array(['Yes', 'No'], dtype=object)

In [16]:
# df['customer.Dependents'] = df['customer.Dependents'].map({'Yes': True, 'No': False})

In [17]:
# Distinct PhoneService labels
df['phone.PhoneService'].unique()

array(['Yes', 'No'], dtype=object)

In [18]:
# df['phone.PhoneService'] = df['phone.PhoneService'].map({'Yes': True, 'No': False})

In [19]:
# Distinct MultipleLines labels; the recorded output has a third category, 'No phone service'
df['phone.MultipleLines'].unique()

array(['No', 'Yes', 'No phone service'], dtype=object)

In [20]:
# df['phone.MultipleLines'] = df['phone.MultipleLines'].map({'Yes': True, 'No': False, 'No phone service': None})

In [21]:
# Distinct InternetService labels
df['internet.InternetService'].unique()

array(['DSL', 'Fiber optic', 'No'], dtype=object)

In [22]:
# df['internet.InternetService'] = df['internet.InternetService'].map({'No': None})

In [23]:
# Distinct OnlineSecurity labels; the recorded output has a third category, 'No internet service'
df['internet.OnlineSecurity'].unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [24]:
# Distinct OnlineBackup labels; the recorded output has a third category, 'No internet service'
df['internet.OnlineBackup'].unique()

array(['Yes', 'No', 'No internet service'], dtype=object)

In [25]:
# Distinct DeviceProtection labels; the recorded output has a third category, 'No internet service'
df['internet.DeviceProtection'].unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [26]:
# Distinct TechSupport labels; the recorded output has a third category, 'No internet service'
df['internet.TechSupport'].unique()

array(['Yes', 'No', 'No internet service'], dtype=object)

In [27]:
# Distinct StreamingTV labels; the recorded output has a third category, 'No internet service'
df['internet.StreamingTV'].unique()

array(['Yes', 'No', 'No internet service'], dtype=object)

In [28]:
# Distinct StreamingMovies labels; the recorded output has a third category, 'No internet service'
df['internet.StreamingMovies'].unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [29]:
# Distinct Contract labels
df['account.Contract'].unique()

array(['One year', 'Month-to-month', 'Two year'], dtype=object)

In [30]:
# Distinct PaperlessBilling labels
df['account.PaperlessBilling'].unique()

array(['Yes', 'No'], dtype=object)

In [31]:
# Distinct PaymentMethod labels
df['account.PaymentMethod'].unique()

array(['Mailed check', 'Electronic check', 'Credit card (automatic)',
       'Bank transfer (automatic)'], dtype=object)

In [32]:
# Frequency of each distinct monthly charge value
df['account.Charges.Monthly'].value_counts()

account.Charges.Monthly
20.05     65
19.90     46
19.85     46
19.55     45
19.70     45
          ..
39.15      1
82.35      1
73.30      1
117.60     1
48.20      1
Name: count, Length: 1585, dtype: int64

In [33]:
# Frequency of each distinct total charge value; the recorded output shows
# 11 rows whose value prints as blank
df['account.Charges.Total'].value_counts()

account.Charges.Total
20.2       11
           11
19.75       9
19.55       9
19.9        9
           ..
1993.2      1
72.1        1
1237.85     1
542.4       1
593.3       1
Name: count, Length: 6531, dtype: int64

In [35]:
# Turn blank / whitespace-only totals into NaN so the column can be cast to a number.
total_charges = df['account.Charges.Total']

# The .str accessor only exists on text columns. Skipping the cleanup when the
# column is already numeric keeps this cell safe to re-run after the float cast.
if not pd.api.types.is_numeric_dtype(total_charges):
    total_charges = total_charges.str.strip().replace('', np.nan)

df['account.Charges.Total'] = total_charges


In [36]:
# Cast the total charges column to 64-bit floating point
df['account.Charges.Total'] = df['account.Charges.Total'].astype('float64')

In [37]:
# Column names, non-null counts and dtypes at this point in the notebook
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7267 entries, 0 to 7266
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customerID                 7267 non-null   object 
 1   Churn                      7043 non-null   object 
 2   customer.gender            7267 non-null   object 
 3   customer.SeniorCitizen     7267 non-null   object 
 4   customer.Partner           7267 non-null   object 
 5   customer.Dependents        7267 non-null   object 
 6   customer.tenure            7267 non-null   int64  
 7   phone.PhoneService         7267 non-null   object 
 8   phone.MultipleLines        7267 non-null   object 
 9   internet.InternetService   7267 non-null   object 
 10  internet.OnlineSecurity    7267 non-null   object 
 11  internet.OnlineBackup      7267 non-null   object 
 12  internet.DeviceProtection  7267 non-null   object 
 13  internet.TechSupport       7267 non-null   objec

In [None]:
# Display the DataFrame
df

# 📊 Carga e análise

Primeiramente, por segurança, salvaremos em um arquivo .csv os dados tratados até o momento.

In [41]:
# Save the DataFrame as a ';'-separated CSV without the index column.
# The target folder is created first: to_csv does not create missing directories,
# and 'data/' may not exist when the JSON was downloaded rather than read locally.
output_path = Path('data/TelecomX_Data_Tratados.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False, sep=';')

# 📄 Relatório Final