In [None]:
#%%

# Step 1: Load and inspect the dataset
import pandas as pd

# Load the dataset
cc_apps = pd.read_csv("cc_approvals.data", header=None)

# Display the first 5 rows
print(cc_apps.head())

  0      1      2  3  4  5  6     7  8  9   10 11 12     13   14 15
0  b  30.83  0.000  u  g  w  v  1.25  t  t   1  f  g  00202    0  +
1  a  58.67  4.460  u  g  q  h  3.04  t  t   6  f  g  00043  560  +
2  a  24.50  0.500  u  g  q  h  1.50  t  f   0  f  g  00280  824  +
3  b  27.83  1.540  u  g  w  v  3.75  t  t   5  t  g  00100    3  +
4  b  20.17  5.625  u  g  w  v  1.71  t  f   0  f  s  00120    0  +


In [None]:
#%%

# Step 2: Inspect the dataset structure
# Summary statistics for numerical columns
print(cc_apps.describe())

# Information about the DataFrame
print(cc_apps.info())

# Display the last 17 rows to check for missing values
print(cc_apps.tail(17))

               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-

In [None]:
#%%

# Step 3: Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

# Drop columns 11 and 13 (irrelevant features)
cc_apps = cc_apps.drop([11, 13], axis=1)

# Split the data into training and testing sets
cc_apps_train, cc_apps_test = train_test_split(cc_apps, test_size=0.33, random_state=42)

# Display the shapes of the resulting datasets
print(cc_apps_train.shape)
print(cc_apps_test.shape)

(462, 14)
(228, 14)


In [None]:
#%%

# Step 4: Handle missing values (Part 1)
import numpy as np

# Replace '?' with NaN
cc_apps_train = cc_apps_train.replace('?', np.nan)
cc_apps_test = cc_apps_test.replace('?', np.nan)

# Display the first few rows to verify
print(cc_apps_train.head())
print(cc_apps_test.head())

      0      1      2  3  4  5   6     7  8  9   10 12    14 15
382    a  24.33  2.500  y  p  i  bb  4.50  f  f   0  g   456  -
137    b  33.58  2.750  u  g  m   v  4.25  t  t   6  g     0  +
346  NaN  32.25  1.500  u  g  c   v  0.25  f  f   0  g   122  -
326    b  30.17  1.085  y  p  c   v  0.04  f  f   0  g   179  -
33     a  36.75  5.125  u  g  e   v  5.00  t  f   0  g  4000  +
    0      1    2  3  4   5   6    7  8  9   10 12   14 15
286  a    NaN  1.5  u  g  ff  ff  0.0  f  t   2  g  105  -
511  a  46.00  4.0  u  g   j   j  0.0  t  f   0  g  960  +
257  b  20.00  0.0  u  g   d   v  0.5  f  f   0  g    0  -
336  b  47.33  6.5  u  g   c   v  1.0  f  f   0  g  228  -
318  b  19.17  0.0  y  p   m  bb  0.0  f  f   0  s    1  +


In [None]:
#%%

# Step 5: Handle missing values (Part 2)
# Impute missing numerical values with the mean
for col in cc_apps_train.columns:
    if cc_apps_train[col].dtype in ['float64', 'int64']:
        mean_value = cc_apps_train[col].mean()
        cc_apps_train[col].fillna(mean_value, inplace=True)
        cc_apps_test[col].fillna(mean_value, inplace=True)

# Check if there are any remaining NaNs in numerical columns
print(cc_apps_train.isna().sum())
print(cc_apps_test.isna().sum())

0     8
1     5
2     0
3     6
4     6
5     7
6     7
7     0
8     0
9     0
10    0
12    0
14    0
15    0
dtype: int64
0     4
1     7
2     0
3     0
4     0
5     2
6     2
7     0
8     0
9     0
10    0
12    0
14    0
15    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cc_apps_train[col].fillna(mean_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cc_apps_test[col].fillna(mean_value, inplace=True)


In [None]:
#%%

# Step 6: Handle missing values (Part 3)
# Impute missing categorical values with the most frequent value
for col in cc_apps_train.columns:
    if cc_apps_train[col].dtype == 'object':
        most_frequent_value = cc_apps_train[col].value_counts().idxmax()
        cc_apps_train[col].fillna(most_frequent_value, inplace=True)
        cc_apps_test[col].fillna(most_frequent_value, inplace=True)

# Check if there are any remaining NaNs
print(cc_apps_train.isna().sum())
print(cc_apps_test.isna().sum())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
12    0
14    0
15    0
dtype: int64
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
12    0
14    0
15    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cc_apps_train[col].fillna(most_frequent_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cc_apps_test[col].fillna(most_frequent_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on 

In [None]:
#%%

# Step 7: Preprocess the data (Part 1)
# Convert categorical data into numerical data using one-hot encoding
cc_apps_train = pd.get_dummies(cc_apps_train)
cc_apps_test = pd.get_dummies(cc_apps_test)

# Align the columns of the test set with the training set
cc_apps_test = cc_apps_test.reindex(columns=cc_apps_train.columns, fill_value=0)

# Display the first few rows to verify
print(cc_apps_train.head())
print(cc_apps_test.head())

         2     7  10    14    0_a    0_b  1_13.75  1_15.83  1_15.92  1_16.00  \
382  2.500  4.50   0   456   True  False    False    False    False    False   
137  2.750  4.25   6     0  False   True    False    False    False    False   
346  1.500  0.25   0   122  False   True    False    False    False    False   
326  1.085  0.04   0   179  False   True    False    False    False    False   
33   5.125  5.00   0  4000   True  False    False    False    False    False   

     ...    6_z    8_f    8_t    9_f    9_t  12_g   12_p   12_s   15_+   15_-  
382  ...  False   True  False   True  False  True  False  False  False   True  
137  ...  False  False   True  False   True  True  False  False   True  False  
346  ...  False   True  False   True  False  True  False  False  False   True  
326  ...  False   True  False   True  False  True  False  False  False   True  
33   ...  False  False   True   True  False  True  False  False   True  False  

[5 rows x 329 columns]
       2    7  

In [None]:
#%%

# Step 8: Preprocess the data (Part 2)

# Separate features and labels
X_train = cc_apps_train.iloc[:, :-1]
y_train = cc_apps_train.iloc[:, -1]
X_test = cc_apps_test.iloc[:, :-1]
y_test = cc_apps_test.iloc[:, -1]
from sklearn.preprocessing import MinMaxScaler

# Check column names before renaming
print("X_train columns before renaming:", X_train.columns)
print("X_test columns before renaming:", X_test.columns)

# Add a prefix to all column names to make them non-numeric
X_train.columns = ['col_' + str(col) for col in X_train.columns]
X_test.columns = ['col_' + str(col) for col in X_test.columns]

# Verify column names after renaming
print("X_train columns after renaming:", X_train.columns)
print("X_test columns after renaming:", X_test.columns)



# Scale the features to a range of 0 to 1
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

# Display the scaled data
print("Scaled Training Data:")
print(rescaledX_train[:5])
print("\nScaled Testing Data:")
print(rescaledX_test[:5])

X_train columns before renaming: Index([        2,         7,        10,        14,     '0_a',     '0_b',
       '1_13.75', '1_15.83', '1_15.92', '1_16.00',
       ...
           '6_v',     '6_z',     '8_f',     '8_t',     '9_f',     '9_t',
          '12_g',    '12_p',    '12_s',    '15_+'],
      dtype='object', length=328)
X_test columns before renaming: Index([        2,         7,        10,        14,     '0_a',     '0_b',
       '1_13.75', '1_15.83', '1_15.92', '1_16.00',
       ...
           '6_v',     '6_z',     '8_f',     '8_t',     '9_f',     '9_t',
          '12_g',    '12_p',    '12_s',    '15_+'],
      dtype='object', length=328)
X_train columns after renaming: Index(['col_2', 'col_7', 'col_10', 'col_14', 'col_0_a', 'col_0_b',
       'col_1_13.75', 'col_1_15.83', 'col_1_15.92', 'col_1_16.00',
       ...
       'col_6_v', 'col_6_z', 'col_8_f', 'col_8_t', 'col_9_f', 'col_9_t',
       'col_12_g', 'col_12_p', 'col_12_s', 'col_15_+'],
      dtype='object', length=328)
X_test 

In [None]:
#%%

# Step 9: Fit a logistic regression model
from sklearn.linear_model import LogisticRegression

# Create a logistic regression model
logreg = LogisticRegression()

# Fit the model to the training data
logreg.fit(rescaledX_train, y_train)

In [None]:
#%%

# Step 10: Make predictions and evaluate performance
from sklearn.metrics import confusion_matrix, accuracy_score

# Make predictions on the test set
y_pred = logreg.predict(rescaledX_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Display the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 1.0000
Confusion Matrix:
[[103   0]
 [  0 125]]


In [None]:
#%%

# Step 11: Grid search for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'tol': [0.01, 0.001, 0.0001],
    'max_iter': [100, 150, 200]
}

# Create a GridSearchCV object
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Fit the grid search to the training data
grid_model.fit(rescaledX_train, y_train)

# Display the best parameters and score
print(f"Best Parameters: {grid_model.best_params_}")
print(f"Best Score: {grid_model.best_score_:.4f}")

Best Parameters: {'max_iter': 100, 'tol': 0.001}
Best Score: 1.0000


In [None]:
#%%

# Step 12: Evaluate the best model
# Extract the best model
best_model = grid_model.best_estimator_

# Make predictions with the best model
y_pred_best = best_model.predict(rescaledX_test)

# Calculate accuracy
best_accuracy = accuracy_score(y_test, y_pred_best)
print(f"Best Model Accuracy: {best_accuracy:.4f}")

# Display the confusion matrix for the best model
best_conf_matrix = confusion_matrix(y_test, y_pred_best)
print("Best Model Confusion Matrix:")
print(best_conf_matrix)

Best Model Accuracy: 1.0000
Best Model Confusion Matrix:
[[103   0]
 [  0 125]]
