In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor


In [36]:
df = pd.read_csv('previsao_de_renda.csv')

In [37]:
df['tempo_emprego'].fillna(df['tempo_emprego'].median(), inplace=True)

df['log_renda'] = np.log(df['renda'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['tempo_emprego'].fillna(df['tempo_emprego'].median(), inplace=True)


In [38]:
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)

print("Tamanho do conjunto de treinamento:", train_df.shape)
print("Tamanho do conjunto de teste:", test_df.shape)

Tamanho do conjunto de treinamento: (11250, 17)
Tamanho do conjunto de teste: (3750, 17)


In [39]:
X_train = train_df.drop(columns=['renda', 'log_renda', 'data_ref', 'Unnamed: 0'])
y_train = train_df['log_renda']
X_test = test_df.drop(columns=['renda', 'log_renda', 'data_ref', 'Unnamed: 0'])
y_test = test_df['log_renda']

In [40]:
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

In [41]:
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [42]:
alphas = [0, 0.001, 0.005, 0.01, 0.05, 0.1]
results = {}

for alpha in alphas:
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train, y_train)
    y_pred = ridge_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    results[alpha] = r2

for alpha, r2 in results.items():
    print(f'Alpha: {alpha}, R^2: {r2}')

best_alpha = max(results, key=results.get)
best_r2 = results[best_alpha]
print(f'O melhor modelo tem alpha = {best_alpha} com R^2 = {best_r2}')

Alpha: 0, R^2: 0.24138254239751167
Alpha: 0.001, R^2: 0.24138261289911378
Alpha: 0.005, R^2: 0.241382894635095
Alpha: 0.01, R^2: 0.24138324619740736
Alpha: 0.05, R^2: 0.24138603454473218
Alpha: 0.1, R^2: 0.24138946034560982
O melhor modelo tem alpha = 0.1 com R^2 = 0.24138946034560982


In [43]:
results = {}
for alpha in alphas:
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train, y_train)
    y_pred = lasso_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    results[alpha] = r2

# Exibir os resultados
for alpha, r2 in results.items():
    print(f'Alpha: {alpha}, R^2: {r2}')

# Identificar o melhor modelo
best_alpha = max(results, key=results.get)
best_r2 = results[best_alpha]
print(f'O melhor modelo tem alpha = {best_alpha} com R^2 = {best_r2}')

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(


Alpha: 0, R^2: 0.24138103454645166
Alpha: 0.001, R^2: 0.2407102335255512
Alpha: 0.005, R^2: 0.23880336747416075
Alpha: 0.01, R^2: 0.23418674216260527
Alpha: 0.05, R^2: 0.19602508225078608
Alpha: 0.1, R^2: 0.13636514431019942
O melhor modelo tem alpha = 0 com R^2 = 0.24138103454645166


  model = cd_fast.enet_coordinate_descent(


Isso depende do contexto e das suas prioridades. Se sua prioridade é ter o maior valor de R² possível, então o primeiro conjunto com Alpha 0.1 é melhor. No entanto, se você valoriza um modelo sem regularização (Alpha 0), o segundo conjunto é melhor.

In [44]:
np.random.seed(42)
X = pd.DataFrame({
    'var1': np.random.rand(100),
    'var2': np.random.rand(100),
    'var3': np.random.rand(100)
})
y = X['var1'] * 2 + X['var2'] * -3 + np.random.rand(100)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

scaler = StandardScaler()
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)

ridge = Ridge(alpha=0.1)
ridge.fit(X_train_poly_scaled, y_train)
y_pred_ridge = ridge.predict(X_test_poly_scaled)
ridge_r2 = r2_score(y_test, y_pred_ridge)
print(f'R² do modelo Ridge: {ridge_r2}')

R² do modelo Ridge: 0.9235166218885335


In [45]:
tree = DecisionTreeRegressor(max_depth=5)
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)
tree_r2 = r2_score(y_test, y_pred_tree)
print(f'R² da árvore de regressão: {tree_r2}')

R² da árvore de regressão: 0.8160847888885724


O modelo Ridge apresenta um R² superior ao da árvore de regressão, indicando que o modelo Ridge explica melhor a variabilidade dos dados de teste.