In [19]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

In [2]:
# Load the dataset
data = pd.read_csv('default_of_credit_card_clients.csv')aa

In [8]:
print(data.head())

0  ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
1   1      20000    2          2         1   24      2      2     -1     -1   
2   2     120000    2          2         2   26     -1      2      0      0   
3   3      90000    2          2         2   34      0      0      0      0   
4   4      50000    2          2         1   37      0      0      0      0   
5   5      50000    1          2         1   57     -1      0     -1      0   

0  ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  \
1  ...          0          0          0         0       689         0   
2  ...       3272       3455       3261         0      1000      1000   
3  ...      14331      14948      15549      1518      1500      1000   
4  ...      28314      28959      29547      2000      2019      1200   
5  ...      20940      19146      19131      2000     36681     10000   

0  PAY_AMT4  PAY_AMT5  PAY_AMT6  default payment next month  
1         0         0   

In [3]:
# Set the first row as header and drop the current header
data.columns = data.iloc[0]
data = data.drop(0)


In [4]:
# Convert the data to numeric type
data = data.apply(pd.to_numeric, errors='ignore')


In [5]:
# Split the dataset into features and target variable
X = data.drop(columns=['ID', 'default payment next month'])
y = data['default payment next month']

In [6]:
# Split data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
# Forward Selection
selected_features = []
best_score = float('inf')  # Initialize with a high value

while len(selected_features) < len(X.columns):
    remaining_features = [feature for feature in X.columns if feature not in selected_features]
    scores = []

    for feature in remaining_features:
        model = LinearRegression()
        features_to_use = selected_features + [feature]
        model.fit(X_train[features_to_use], y_train)
        y_pred = model.predict(X_test[features_to_use])
        score = mean_squared_error(y_test, y_pred)
        scores.append((feature, score))

    # Select the feature that improves the model the most
    best_feature, best_score_for_feature = min(scores, key=lambda x: x[1])
    
    # Check if adding this feature improves the model
    if best_score_for_feature < best_score:
        selected_features.append(best_feature)
        best_score = best_score_for_feature
    else:
        break  # Stop if adding more features does not improve the model

# Display the selected features
print("Selected Features:", selected_features)

Selected Features: ['PAY_0', 'BILL_AMT1', 'PAY_2', 'PAY_AMT1', 'MARRIAGE', 'PAY_3', 'PAY_AMT5', 'EDUCATION', 'PAY_5', 'PAY_AMT4', 'BILL_AMT2', 'LIMIT_BAL', 'PAY_AMT2', 'BILL_AMT6', 'SEX', 'PAY_AMT3', 'AGE']


In [20]:
# Backward Elimination
while len(X.columns) > 1:  # Keep at least one feature
    # Fit the model
    model = sm.OLS(y, X).fit()
    
    # Get the p-values for each feature
    p_values = model.pvalues[1:]  # Exclude the constant term
    
    # Identify the feature with the highest p-value
    feature_to_remove = p_values.idxmax()
    
    # Check if the highest p-value is above a certain threshold (e.g., 0.05)
    if p_values.max() > 0.05:
        # Remove the feature with the highest p-value
        X = X.drop(columns=[feature_to_remove])
    else:
        break  # Stop if the highest p-value is below the threshold

# Display the selected features
selected_features = X.columns[1:]  # Exclude the constant term
print("Selected Features:", selected_features)

Selected Features: Index(['SEX', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_5',
       'BILL_AMT1', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT5'],
      dtype='object', name=0)
