### Análise de Dados

**Objetivos**
- Analisar as variáveis em relação à variável resposta (Churn)
- Interpretar o comportamento dos dados

In [2]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.ticker import PercentFormatter
from scipy.stats.contingency import association
# Silence every FutureWarning and UserWarning for the rest of the session.
# NOTE(review): these filters are global, so they also hide warnings raised by
# the plotting code below; consider narrowing them once the causes are fixed.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
# Load the dataset; the path is relative to the notebook's working directory.
# NOTE(review): dtypes are whatever read_csv infers. TODO confirm that
# TotalCharges is parsed as a number, since later cells plot it as numeric.
df = pd.read_csv("../Dados/telco_dataset.csv")

In [None]:
# Preview the first three rows of the loaded data.
df.head(3)

In [None]:
# List the available column names.
df.columns

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

#### Criando Funções Auxiliares

In [5]:
def plt_percentages(feature, df):
    """Plot the share of customers per `feature` category, split by Churn.

    Each bar is the number of rows with a given (feature, Churn) combination
    divided by the total number of rows of `df`.

    Parameters
    ----------
    feature : str
        Name of the column of `df` shown on the x-axis.
    df : pandas.DataFrame
        Data containing `feature` and a "Churn" column. The shares are
        relative to the rows of this frame, so passing a filtered frame
        yields shares of that subset.

    Returns
    -------
    None
        The bars are drawn on a new 10 x 4.5 inch matplotlib figure.
    """
    # Name the counts column explicitly instead of relying on the default
    # name pandas gives to the result of value_counts().
    g = df.groupby(feature)["Churn"].value_counts().rename("count").reset_index()
    g["% clientes"] = g["count"] / len(df)

    plt.figure(figsize=(10, 4.5))
    ax = sns.barplot(x=feature, y="% clientes", hue="Churn", data=g, palette="PuBu")

    # Use a formatter rather than set_yticklabels(): fixed label strings are
    # not tied to tick positions (matplotlib warns about it) and go stale as
    # soon as the axis limits change. The values are fractions, hence xmax=1.
    ax.yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=0))

#### Gender & SeniorCitizen

In [None]:
# Share of all customers per gender, split by Churn.
plt_percentages("gender", df)

In [None]:
# Share of all customers per SeniorCitizen value, split by Churn.
plt_percentages("SeniorCitizen", df)

#### Partners & Dependents

In [None]:
# Share of all customers by Partner and by Dependents status, split by Churn,
# drawn side by side.
axis_y = "percentage of customers"
total_customers = len(df)

# Partner: (Partner, Churn) counts as a fraction of all customers
gp_partner = (
    (df.groupby("Partner")["Churn"].value_counts() / total_customers)
    .rename(axis_y)
    .reset_index()
)

# Dependents: (Dependents, Churn) counts as a fraction of all customers
gp_dep = (
    (df.groupby("Dependents")["Churn"].value_counts() / total_customers)
    .rename(axis_y)
    .reset_index()
)

fig, axis = plt.subplots(1, 2, figsize=(12, 4))
panels = [
    ("Partner", "Partners", gp_partner),
    ("Dependents", "Dependents", gp_dep),
]
for ax, (feature, title, shares) in zip(axis, panels):
    sns.barplot(x=feature, y=axis_y, hue="Churn", data=shares, ax=ax, palette="flare")
    ax.set_title(title)
    # The plotted values are fractions in [0, 1]; show them as percentages so
    # the ticks agree with the "percentage of customers" axis label.
    ax.yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=0))
fig.tight_layout()

#### Telefone e Serviços de Internet

In [None]:
# Share of all customers per MultipleLines value, split by Churn.
plt_percentages("MultipleLines", df)

In [None]:
# Share of all customers per PhoneService value, split by Churn.
plt_percentages("PhoneService", df)

In [None]:
# Share of all customers per InternetService value, split by Churn.
plt_percentages("InternetService", df)

#### Serviços adicionais de Internet

In [None]:
# Additional internet service columns (per the section heading).
cols = ["OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]

# Keep only the rows whose InternetService is not "No", then reshape to long
# format: one row per (service column, answer) pair.
has_internet = df["InternetService"] != "No"
df1 = (
    df.loc[has_internet, cols]
    .melt()
    .rename(columns={"value": "Has service"})
)
df1.head()

In [None]:
# Number of rows per additional service, split by the answer given for it.
fig, ax = plt.subplots(figsize=(10, 4.5))
sns.countplot(data=df1, x="variable", hue="Has service", palette="pastel", ax=ax)
ax.set_xlabel("Additional service")
ax.set_ylabel("Num of customers")
plt.show()

In [None]:
# Share of all customers per OnlineSecurity value, split by Churn.
plt_percentages("OnlineSecurity", df)

#### Contrato e Pagamento

In [None]:
# Share of all customers per PaperlessBilling value, split by Churn.
plt_percentages("PaperlessBilling", df)

In [None]:
# Share of all customers per PaymentMethod, split by Churn.
plt_percentages("PaymentMethod", df)

In [None]:
# Same chart restricted to the rows where SeniorCitizen == 1. The shares are
# relative to this subset, because plt_percentages divides by the length of
# the frame it receives.
plt_percentages("PaymentMethod", df[df['SeniorCitizen'] == 1])

In [None]:
# Share of all customers per Contract type, split by Churn.
plt_percentages("Contract", df)

### Análise das Variáveis Numéricas

In [None]:
# Another view of the distributions, this time split by the target variable.
numeric_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
churn_colors = {"No": "#512b58", "Yes": "#fe346e"}

fig, axes = plt.subplots(1, len(numeric_cols), figsize=(14, 5))
for ax, col in zip(axes, numeric_cols):
    # Coerce to numeric so that a column read from the CSV as text is still
    # binned as a continuous variable (unparseable entries become NaN and are
    # left out); a column that is already numeric passes through unchanged.
    values = pd.to_numeric(df[col], errors="coerce")
    # Drawing both groups in one call via `hue` makes them share the same bin
    # edges, so the bar heights of the two groups are directly comparable.
    sns.histplot(
        x=values,
        hue=df["Churn"],
        hue_order=list(churn_colors),
        palette=churn_colors,
        ax=ax,
    )
    ax.set_title(col)
fig.tight_layout()
plt.show()

In [None]:
# Box plots of MonthlyCharges per Contract type, split by Churn.
# NOTE(review): catplot is a figure-level function and returns a FacetGrid,
# not an Axes, despite the variable name `ax`.
ax = sns.catplot(x="Contract", y="MonthlyCharges", hue="Churn", kind="box", data=df)

In [None]:
# Pearson correlation between the numeric features and the churn flag.
# The numeric columns keep their real values. Replacing them with
# pd.factorize codes would number the values by order of first appearance,
# which discards their magnitude and makes the correlation meaningless.
numeric_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
corr_input = (
    df[numeric_cols]
    # Coerce in case a column was read as text; unparseable entries become NaN
    # and are ignored by corr().
    .apply(pd.to_numeric, errors="coerce")
    # Encode the target as 1 for "Yes" and 0 otherwise.
    .assign(Churn=(df["Churn"] == "Yes").astype(int))
)
corr = corr_input.corr()
ax = sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns,
                 linewidths=.2, cmap="YlGnBu")