### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

### Preprocessing of dataset after loading

In [2]:
# Read the raw credit-card application records. No row of the file is
# treated as a header, and '?' entries are mapped to NaN on load.
cc_apps = pd.read_csv(
    'crx.data',
    header=None,
    na_values='?',
)
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
cc_apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       678 non-null    object 
 1   1       678 non-null    float64
 2   2       690 non-null    float64
 3   3       684 non-null    object 
 4   4       684 non-null    object 
 5   5       681 non-null    object 
 6   6       681 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-null    object 
 9   9       690 non-null    object 
 10  10      690 non-null    int64  
 11  11      690 non-null    object 
 12  12      690 non-null    object 
 13  13      677 non-null    float64
 14  14      690 non-null    int64  
 15  15      690 non-null    object 
dtypes: float64(4), int64(2), object(10)
memory usage: 86.4+ KB
None


In [4]:
# Count the missing (NaN) entries in every column
print(cc_apps.isna().sum())

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64


In [5]:
# Work on an independent (deep) copy so the raw frame stays untouched
cc_apps_imputed = cc_apps.copy(deep=True)

In [6]:
# Fill the missing values column by column:
#   * numeric columns     -> the column mean
#   * non-numeric columns -> the most frequent value
# NOTE: these statistics are computed over every row, i.e. before the
# train/test split further down, so test rows influence the imputed values.
for col in cc_apps_imputed.columns:
    column = cc_apps_imputed[col]

    # Ask pandas whether the dtype is numeric rather than comparing against
    # the literal "object": text columns are not guaranteed to be stored as
    # object dtype (e.g. string or category dtypes), and calling .mean() on
    # them would fail.
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.mean()
    else:
        counts = column.value_counts()
        if counts.empty:
            # The column holds no observed value at all; nothing to impute
            # from, so leave it as it is.
            continue
        # value_counts() sorts by frequency, so the first label is the mode
        fill_value = counts.index[0]

    cc_apps_imputed[col] = column.fillna(fill_value)

# Verify that no missing values remain
cc_apps_imputed.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

In [7]:
# One-hot encode the categorical columns.
# drop_first=True drops the first category of each encoded column, which
# then acts as the baseline level. This removes the redundant dummy column
# and avoids multicollinearity in linear models (logistic regression is
# used further down).
cc_apps_encoded = pd.get_dummies(data=cc_apps_imputed, drop_first=True)

In [8]:
# The last encoded column is the target; every other column is a feature
X, y = (
    cc_apps_encoded.iloc[:, :-1].values,
    cc_apps_encoded.iloc[:, -1].values,
)

In [9]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and the fitted scaler is then applied to both splits, so the test data
# never contributes to the scaling statistics (no data leakage).
scaler = StandardScaler()
scaler.fit(X_train)
rescaled_X_train = scaler.transform(X_train)
rescaled_X_test = scaler.transform(X_test)

## Training the model

In [11]:
# Instantiate a logistic regression classifier, leaving every
# hyperparameter at its default value
clf_model = LogisticRegression()

In [None]:
# Train the classifier on the scaled training features and their labels
clf_model.fit(X=rescaled_X_train, y=y_train)

In [15]:
# Predict labels for the same scaled training rows the model was fitted on
# and keep them for the evaluation step
y_train_pred = clf_model.predict(X=rescaled_X_train)



In [None]:
# Evaluate the classifier on the training data: print the confusion matrix
# that compares the generated predictions with the true training labels
print(confusion_matrix(y_true=y_train, y_pred=y_train_pred))

[[187  17]
 [ 32 226]]


## Finding the best scoring model
Perform a grid search with cross-validation to try different hyperparameter combinations and find the best-scoring model on the training data.

In [None]:
# Hyperparameter grid for the search.
#   tol      - stopping tolerance: how precise the optimisation has to be
#              before the solver stops
#   max_iter - maximum number of iterations allowed for the solver; a larger
#              value gives the algorithm more room to converge
# GridSearchCV takes the grid as a dictionary and tests every combination of
# the listed values.

# Candidate values for each hyperparameter
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

# Keys are the parameter names, values are the lists of candidate values
param_grid = {"tol": tol, "max_iter": max_iter}

In [18]:
# Display the grid to confirm its contents
param_grid

{'tol': [0.01, 0.001, 0.0001], 'max_iter': [100, 150, 200]}

In [20]:
# Set up the cross-validated grid search:
#   estimator  - the logistic regression model instantiated earlier
#   param_grid - the candidate hyperparameter values to combine
#   cv         - the number of cross-validation folds
# The search itself runs when .fit() is called on the scaled training data.
grid_model = GridSearchCV(
    estimator=clf_model,
    param_grid=param_grid,
    cv=5,
)

In [21]:
# Run the grid search on the scaled training data
grid_model_result = grid_model.fit(X=rescaled_X_train, y=y_train)


In [22]:
# Summarise the search: report the best score it found together with the
# parameter combination that produced it
best_train_score = grid_model_result.best_score_
best_train_params = grid_model_result.best_params_
print("Best: %f using %s" % (best_train_score, best_train_params))

Best: 0.857176 using {'max_iter': 100, 'tol': 0.001}


In [None]:
# Take the best estimator found by the grid search and score it on the
# scaled, held-out test set
best_model = grid_model_result.best_estimator_
best_score = best_model.score(X=rescaled_X_test, y=y_test)

print("Accuracy of logistic regression classifier: ", best_score)

Accuracy of logistic regression classifier:  0.8289473684210527
