In [41]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier  # For classification tasks
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# Load the training and test sets; the 'id_solicitante' column is removed from
# both frames right after reading.
df_train = pd.read_csv('Dataset/conjunto_de_treinamento.csv').drop(columns=['id_solicitante'])
df_test = pd.read_csv('Dataset/conjunto_de_teste.csv').drop(columns=['id_solicitante'])


In [42]:
# Encoding the nominal columns

## Binary variables
# Labels of the columns found at positions 12, 16, 29 and 31 of df_train as it
# stands when this cell runs; each of them is encoded with 'N' -> 0, 'Y' -> 1.
yn = df_train.columns[[12, 16, 29, 31]].tolist()

yes_no_codes = {'N': 0, 'Y': 1}
for flag_col in yn:
    df_train[flag_col] = df_train[flag_col].map(yes_no_codes)

# 'N' in 'sexo' is mapped to None, so it ends up as a missing value.
sexo_codes = {'M': 0, 'F': 1, 'N': None}
df_train['sexo'] = df_train['sexo'].map(sexo_codes)

envio_codes = {'internet': 0, 'correio': 1, 'presencial': 2}
df_train['forma_envio_solicitacao'] = df_train['forma_envio_solicitacao'].map(envio_codes)

## Nominal variables
def estados_format(df, column):
    """Encode the two-letter state codes in ``df[column]`` as integers.

    The 27 codes are numbered 0-26 in the order listed below
    ('AC' -> 0, ..., 'TO' -> 26). Any value that is not one of these codes
    becomes NaN, as for every ``Series.map`` lookup miss.

    Returns a new Series; ``df`` itself is not modified.
    """
    state_codes = (
        'AC', 'AL', 'AM', 'AP', 'BA', 'CE', 'DF', 'ES', 'GO',
        'MA', 'MG', 'MS', 'MT', 'PA', 'PB', 'PE', 'PI', 'PR',
        'RJ', 'RN', 'RO', 'RR', 'RS', 'SC', 'SE', 'SP', 'TO',
    )
    code_to_index = {code: index for index, code in enumerate(state_codes)}
    return df[column].map(code_to_index)

# Replace each state column of the training set by its integer encoding.
for state_col in ('estado_onde_nasceu', 'estado_onde_reside', 'estado_onde_trabalha'):
    df_train[state_col] = estados_format(df_train, state_col)

## Filling the incomplete categories
# Missing values in these columns are replaced by -1.
# The filled columns are assigned back to df_train instead of calling
# df_train[col].fillna(-1, inplace=True): that form operates on a temporary
# Series, which pandas warns about and which, with Copy-on-Write enabled,
# leaves df_train unchanged.
incomplete_cols = ['profissao_companheiro', 'profissao', 'grau_instrucao', 'ocupacao']
df_train[incomplete_cols] = df_train[incomplete_cols].fillna(-1)

## Converting string categories to int
def tel_format(df, column):
    """Return ``df[column]`` converted to integers, with blanks encoded as -1.

    A value equal to ' ' (a single space) is replaced by -1, then every value
    is passed through ``int``.

    The replacement is chained on the returned Series rather than done with
    ``df[column].replace(..., inplace=True)``. That in-place call acts on a
    temporary Series: pandas warns about it and, with Copy-on-Write enabled,
    the column is left untouched so ``int(' ')`` then fails. As a consequence
    ``df`` is not modified here; callers assign the returned Series back.

    Raises:
        ValueError: if a remaining value cannot be converted by ``int``
            (for example NaN or a non-numeric string).
    """
    return df[column].replace(' ', -1).map(int)

# Convert both telephone area-code columns of the training set to integers.
for phone_col in ('codigo_area_telefone_trabalho', 'codigo_area_telefone_residencial'):
    df_train[phone_col] = tel_format(df_train, phone_col)

In [43]:
# Encoding the nominal columns (test set)

## Binary variables
# Labels of the columns found at positions 12, 16, 29 and 31 of df_test as it
# stands when this cell runs; each of them is encoded with 'N' -> 0, 'Y' -> 1.
yn = df_test.columns[[12, 16, 29, 31]].tolist()

yes_no_codes = {'N': 0, 'Y': 1}
for flag_col in yn:
    df_test[flag_col] = df_test[flag_col].map(yes_no_codes)

# 'N' in 'sexo' is mapped to None, so it ends up as a missing value.
sexo_codes = {'M': 0, 'F': 1, 'N': None}
df_test['sexo'] = df_test['sexo'].map(sexo_codes)

envio_codes = {'internet': 0, 'correio': 1, 'presencial': 2}
df_test['forma_envio_solicitacao'] = df_test['forma_envio_solicitacao'].map(envio_codes)

## Nominal variables
def estados_format(df, column):
    """Map the two-letter state codes in ``df[column]`` to the integers 0-26.

    Codes are numbered by their position in the tuple below ('AC' is 0 and
    'TO' is 26); values outside the tuple come back as NaN because
    ``Series.map`` has no entry for them.

    A new Series is returned and ``df`` is left as it was.
    """
    ordered_codes = (
        'AC', 'AL', 'AM', 'AP', 'BA', 'CE', 'DF', 'ES', 'GO',
        'MA', 'MG', 'MS', 'MT', 'PA', 'PB', 'PE', 'PI', 'PR',
        'RJ', 'RN', 'RO', 'RR', 'RS', 'SC', 'SE', 'SP', 'TO',
    )
    lookup = dict(zip(ordered_codes, range(len(ordered_codes))))
    return df[column].map(lookup)

# Replace each state column of the test set by its integer encoding.
for state_col in ('estado_onde_nasceu', 'estado_onde_reside', 'estado_onde_trabalha'):
    df_test[state_col] = estados_format(df_test, state_col)

## Filling the incomplete categories
# Missing values in these columns are replaced by -1.
# The filled columns are assigned back to df_test instead of calling
# df_test[col].fillna(-1, inplace=True): that form operates on a temporary
# Series, which pandas warns about and which, with Copy-on-Write enabled,
# leaves df_test unchanged.
incomplete_cols = ['profissao_companheiro', 'profissao', 'grau_instrucao', 'ocupacao']
df_test[incomplete_cols] = df_test[incomplete_cols].fillna(-1)

## Converting string categories to int
def tel_format(df, column):
    """Return ``df[column]`` converted to integers, with blanks encoded as -1.

    A value equal to ' ' (a single space) is replaced by -1, then every value
    is passed through ``int``.

    The replacement is chained on the returned Series rather than done with
    ``df[column].replace(..., inplace=True)``. That in-place call acts on a
    temporary Series: pandas warns about it and, with Copy-on-Write enabled,
    the column is left untouched so ``int(' ')`` then fails. As a consequence
    ``df`` is not modified here; callers assign the returned Series back.

    Raises:
        ValueError: if a remaining value cannot be converted by ``int``
            (for example NaN or a non-numeric string).
    """
    return df[column].replace(' ', -1).map(int)

# Convert both telephone area-code columns of the test set to integers.
for phone_col in ('codigo_area_telefone_trabalho', 'codigo_area_telefone_residencial'):
    df_test[phone_col] = tel_format(df_test, phone_col)

In [44]:
# Features are every column of df_train except the last one; the last column
# is taken as the target. The test frame is used as-is (same object, no copy).
X, y = df_train.iloc[:, :-1], df_train.iloc[:, -1]
X_test = df_test



# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)



# model = RandomForestClassifier()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print(accuracy_score(y_test, y_pred))

In [95]:
# Remaining missing feature values are replaced by the constant -1.
imputer = SimpleImputer(strategy='constant', fill_value=-1)

# Fit the imputer on the training features and impute them
imputed_X_train = imputer.fit_transform(X)

# Fixed seed: without random_state the tree can differ between runs (features
# are randomly permuted at each split), so Restart & Run All would not
# reproduce the same predictions.
RANDOM_STATE = 42
classifier = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Train the model with the imputed training data
classifier.fit(imputed_X_train, y)

# Impute the test data with the imputer fitted on the training data
imputed_X_test = imputer.transform(X_test)

# Make predictions using the trained model
predictions = classifier.predict(imputed_X_test)

In [96]:

# Re-read only the 'id_solicitante' column (it was dropped from df_test when the
# data was loaded). A separate name is used so that the preprocessed df_test is
# not overwritten with raw data, which would shift the positional column
# indices used by the preprocessing cell if that cell were run again.
test_ids = pd.read_csv('Dataset/conjunto_de_teste.csv', usecols=['id_solicitante'])

# Ids and predictions are paired by position, so they must have equal length.
assert len(test_ids) == len(predictions), 'number of ids and predictions differ'

prediction_file = pd.DataFrame({
    'id_solicitante': test_ids['id_solicitante'],
    'inadimplente': predictions,
})
# to_csv returns None when given a path, so its result is not assigned.
prediction_file.to_csv('results/predictions.csv', index=False)

# Read the written file back and display its shape as a sanity check.
prediction_file = pd.read_csv('results/predictions.csv')
prediction_file.shape


# print(f'{accuracy_score(y_test, predictions)*100}%')

(5000, 2)