In [52]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by the libraries
# used in later cells (pandas, scikit-learn); consider narrowing it to the
# specific categories that are known to be harmless.
warnings.filterwarnings("ignore")

In [2]:
# Seed NumPy's global generator so the synthetic sample below is reproducible.
np.random.seed(42)

# Number of synthetic rows to generate.
n = 500

# Every column is drawn from the same seeded global generator, so the order of
# the draws below must stay fixed to reproduce the same data.
data = dict()
data['Pagamento'] = np.random.choice([0, 1], size=n)
data['Idade'] = np.random.randint(18, 81, size=n)  # ages from 18 to 80 (upper bound is exclusive)
data['Gênero'] = np.random.choice(['F', 'M'], size=n)
# S: single, C: married, D: divorced, V: widowed
data['Estado_civil'] = np.random.choice(['S', 'C', 'D', 'V'], size=n)
data['Categoria'] = np.random.choice(['Basic', 'Black', 'Platinum'], size=n)
data['CatVIP'] = np.random.choice(['Alpha', 'Beta', 'Comum'], size=n)
data['Risco'] = np.random.choice(['A-', 'A', 'A+', 'B-', 'B', 'B+', 'C-', 'C', 'C+'], size=n)

df = pd.DataFrame(data)

In [3]:
# Preview the first five generated rows.
df.head(5)

Unnamed: 0,Pagamento,Idade,Gênero,Estado_civil,Categoria,CatVIP,Risco
0,0,80,M,V,Basic,Alpha,A-
1,1,78,M,V,Basic,Comum,B+
2,0,38,F,D,Platinum,Comum,A
3,0,49,F,D,Basic,Beta,A+
4,0,40,M,S,Black,Comum,C


In [4]:
# Separate the binary target column "Pagamento" from the predictor columns.
y = df["Pagamento"]
X = df.drop(columns=["Pagamento"])

In [40]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [41]:
# Display the training split of the feature columns.
X_train

Unnamed: 0,Idade,Gênero,Estado_civil,Categoria,CatVIP,Risco
249,43,F,S,Basic,Beta,A-
433,61,M,V,Platinum,Beta,A
19,64,F,S,Platinum,Alpha,A
322,33,M,C,Platinum,Beta,A-
332,70,F,C,Platinum,Alpha,B+
...,...,...,...,...,...,...
106,33,F,C,Black,Alpha,A-
270,54,F,V,Black,Beta,B
348,46,M,S,Basic,Comum,B
435,49,F,C,Black,Beta,C-


In [42]:
def define_VIP(valor):
    """Map a CatVIP label to a binary VIP flag.

    Parameters
    ----------
    valor
        The category label to classify.

    Returns
    -------
    int
        1 when ``valor`` equals 'Alpha' or 'Beta', otherwise 0.
    """
    vip_labels = ('Alpha', 'Beta')
    return 1 if any(valor == label for label in vip_labels) else 0

In [43]:
# Encode CatVIP as a 0/1 flag on both splits.
# The guard makes this cell safe to re-run: once the column already holds the
# numeric flag, applying define_VIP again would map every value to 0 because
# the encoded values no longer equal 'Alpha' or 'Beta'.
# .assign() builds new frames instead of writing into the objects returned by
# train_test_split, which avoids chained-assignment (SettingWithCopy) issues.
if not pd.api.types.is_numeric_dtype(X_train['CatVIP']):
    X_train = X_train.assign(CatVIP=X_train['CatVIP'].apply(define_VIP))
    X_test = X_test.assign(CatVIP=X_test['CatVIP'].apply(define_VIP))

In [44]:
# Check the first five training rows after the CatVIP encoding step.
X_train.head(5)

Unnamed: 0,Idade,Gênero,Estado_civil,Categoria,CatVIP,Risco
249,43,F,S,Basic,1,A-
433,61,M,V,Platinum,1,A
19,64,F,S,Platinum,1,A
322,33,M,C,Platinum,1,A-
332,70,F,C,Platinum,1,B+


In [45]:
# Explicit category orders for the ordinal features. Spelling the order out
# keeps the integer codes stable instead of depending on the encoder's default
# behaviour of sorting whatever labels appear in the training split.
# NOTE(review): both lists reproduce the codes used so far; confirm that they
# match the intended business ranking (e.g. whether 'A-' should really rank
# below 'A', and where 'Black' sits relative to 'Platinum').
categoria_order = ['Basic', 'Black', 'Platinum']
risco_order = ['A-', 'A', 'A+', 'B-', 'B', 'B+', 'C-', 'C', 'C+']

ct = ColumnTransformer(
    transformers=[
        # Nominal features: one indicator column per category.
        ('ohe', OneHotEncoder(), ['Gênero', 'Estado_civil']),
        # Ordinal features: integer codes following the explicit orders above.
        ('ordinal_cat', OrdinalEncoder(categories=[categoria_order]), ['Categoria']),
        ('ordinal_risk', OrdinalEncoder(categories=[risco_order]), ['Risco'])
    ],
    # Columns not listed above are passed through unchanged.
    remainder='passthrough'
)

# Fit the encoders on the training split only, then reuse them on the test split.
X_train_transformed = ct.fit_transform(X_train)
X_test_transformed = ct.transform(X_test)


In [46]:
# Wrap the transformed outputs in DataFrames labelled with the transformer's
# output feature names.
feature_names = ct.get_feature_names_out()
X_train_transformed = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=feature_names)

In [47]:
# Inspect the first five encoded training rows.
X_train_transformed.head(5)

Unnamed: 0,ohe__Gênero_F,ohe__Gênero_M,ohe__Estado_civil_C,ohe__Estado_civil_D,ohe__Estado_civil_S,ohe__Estado_civil_V,ordinal_cat__Categoria,ordinal_risk__Risco,remainder__Idade,remainder__CatVIP
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,43.0,1.0
1,0.0,1.0,0.0,0.0,0.0,1.0,2.0,1.0,61.0,1.0
2,1.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,64.0,1.0
3,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,33.0,1.0
4,1.0,0.0,1.0,0.0,0.0,0.0,2.0,5.0,70.0,1.0


In [50]:
# Logistic regression with the estimator's default settings, fitted on the
# encoded training features.
model = LogisticRegression()

model.fit(X=X_train_transformed, y=y_train)

In [51]:
# Predict the class label for every row of the encoded test split.
y_pred = model.predict(X=X_test_transformed)

In [53]:
# Share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.54


In [54]:
# Counts of test rows per (true class, predicted class) pair.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[17 36]
 [10 37]]


In [55]:
# Per-class precision, recall and F1 on the test split.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.63      0.32      0.42        53
           1       0.51      0.79      0.62        47

    accuracy                           0.54       100
   macro avg       0.57      0.55      0.52       100
weighted avg       0.57      0.54      0.52       100

