Feature Engineering and Model training

In [120]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")

In [121]:
# for text processing

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
import string
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [122]:
# For classification model slection
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [123]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [124]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [125]:
from category_encoders.binary import BinaryEncoder
from imblearn.combine import SMOTETomek

In [126]:
# For Hyperparameter tunning
from hyperopt import tpe,hp,Trials,space_eval
from hyperopt.fmin import fmin
from hyperopt.pyll import scope

In [127]:
df = pd.read_parquet(r"C:\Users\aakkewar\Documents\BOOTCAMP (PowerBI, SQL, Python, R)\Bits SEM 4\Project\dataset\complaints_par.parquet")

As per final report of EDA some features can be removed

In [128]:
missing = df.isnull().sum().div(df.shape[0]).mul(100).to_frame().sort_values(by=0, ascending=False)

In [129]:
missing[0:8]

Unnamed: 0,0
Tags,90.03585
Consumer disputed?,83.105944
Consumer complaint narrative,63.7376
Company public response,52.655076
Consumer consent provided?,20.112629
Sub-issue,15.94684
Sub-product,5.173649
State,0.967511


In [130]:
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2024-01-13,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,,,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",FL,33401,,Other,Web,2024-01-13,In progress,Yes,,8154057
1,2024-01-12,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,,,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",FL,32209,,Other,Web,2024-01-12,In progress,Yes,,8153174
2,2024-01-13,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Account status incorrect,,,"EQUIFAX, INC.",FL,33351,,,Web,2024-01-13,In progress,Yes,,8153758
3,2024-01-13,Credit reporting or other personal consumer re...,Credit reporting,Unable to get your credit report or credit score,Other problem getting your report or credit score,,,FISERV INC.,CA,90806,,,Web,2024-01-13,In progress,Yes,,8154072
4,2024-01-13,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,,,"EQUIFAX, INC.",GA,30135,,,Web,2024-01-13,In progress,Yes,,8154093


In [131]:
#Company column can be dropped as it contains 4284 null values which are names

In [132]:
df.columns

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')

In [133]:
drop_columns = ['Sub-product','Sub-issue','Company public response', 'Company','ZIP code', 'Tags','Complaint ID']
df.drop(drop_columns, axis=1, inplace=True)

In [134]:
missing = df.isnull().sum().div(df.shape[0]).mul(100).to_frame().sort_values(by=0, ascending=False)

In [135]:
missing

Unnamed: 0,0
Consumer disputed?,83.105944
Consumer complaint narrative,63.7376
Consumer consent provided?,20.112629
State,0.967511
Company response to consumer,0.000176
Date received,0.0
Product,0.0
Issue,0.0
Submitted via,0.0
Date sent to company,0.0


In [136]:
df.shape

(4547854, 11)

In [137]:
df.dropna(subset=['Consumer disputed?'], inplace=True)
df.dropna(subset=['Consumer complaint narrative'], inplace=True)

In [138]:
missing = df.isnull().sum().div(df.shape[0]).mul(100).to_frame().sort_values(by=0, ascending=False)
missing

Unnamed: 0,0
State,0.279859
Date received,0.0
Product,0.0
Issue,0.0
Consumer complaint narrative,0.0
Consumer consent provided?,0.0
Submitted via,0.0
Date sent to company,0.0
Company response to consumer,0.0
Timely response?,0.0


In some columns on df have missing values. it can be imputed with simple imputer with mode strategy

Feature Extraction

In [139]:
df[['Date received', 'Date sent to company']].head()

Unnamed: 0,Date received,Date sent to company
2199,2016-11-09,2016-11-09
19946,2017-04-01,2017-04-01
24151,2016-03-09,2016-03-11
24620,2016-06-21,2016-06-24
24893,2016-11-14,2016-11-15


Here datset has two date feature, 'Date received' which is the date on which the comp;laint was registered to CFPB and 'Date sent to  company' is when the complaint has been sent to respective company.

In [140]:
#difference between date complaint received vs data complaint sent to the company
df['days_to_forward_complaint']= pd.to_datetime(df['Date sent to company']) - pd.to_datetime(df['Date received']) 
#Get the days in datetime days (numeric) format
df['days_to_forward_complaint']= df['days_to_forward_complaint'].dt.days

In [141]:
#After creating the days_to_forward_complaint, both the date columns can be removed
df.drop(['Date received', 'Date sent to company'], axis=1, inplace=True)

The feature days to forward complaint has information about the duration taken for CFPB to forward the complaint to companies

For model to reduce computation time we can use sample of the data for model

In [142]:
df2 =  df.groupby("Consumer disputed?")
countaf1 = df2.count()

print(countaf1 )

                    Product   Issue  Consumer complaint narrative   State  \
Consumer disputed?                                                          
No                   128204  128204                        128204  127880   
Yes                   35807   35807                         35807   35672   

                    Consumer consent provided?  Submitted via  \
Consumer disputed?                                              
No                                      128204         128204   
Yes                                      35807          35807   

                    Company response to consumer  Timely response?  \
Consumer disputed?                                                   
No                                        128204            128204   
Yes                                        35807             35807   

                    days_to_forward_complaint  
Consumer disputed?                             
No                                     128204  
Yes  

In [143]:
df['Consumer disputed?'].value_counts(normalize=True)*100

Consumer disputed?
No     78.167928
Yes    21.832072
Name: proportion, dtype: float64

In [144]:
df.shape

(164011, 10)

In [145]:
df1 = df.groupby("Consumer disputed?").sample(n=35700)
df1.reset_index(inplace=True)
df1.head(5)

Unnamed: 0,index,Product,Issue,Consumer complaint narrative,State,Consumer consent provided?,Submitted via,Company response to consumer,Timely response?,Consumer disputed?,days_to_forward_complaint
0,3098543,Debt collection,Improper contact or sharing of info,I started getting calls from XXXX XXXX XXXX XX...,CA,Consent provided,Web,Closed with explanation,Yes,No,0
1,4014715,Bank account or service,Deposits and withdrawals,I effected a transfer online of {$3500.00} fro...,FL,Consent provided,Web,Closed with explanation,No,No,0
2,965199,Credit reporting,Unable to get credit report/credit score,"Hello, The FTC has a website recommending XXXX...",CA,Consent provided,Web,Closed with monetary relief,Yes,No,0
3,3087735,Debt collection,Communication tactics,after i was trying to figure out why i got a s...,TX,Consent provided,Web,Closed with explanation,Yes,No,10
4,2965803,Mortgage,"Loan servicing, payments, escrow account",Chase Bank purchased my mortgage from XXXX XXX...,IN,Consent provided,Web,Closed with explanation,Yes,No,0


Text Processing

In [146]:
df1['Consumer disputed?'].value_counts(normalize=True)*100

Consumer disputed?
No     50.0
Yes    50.0
Name: proportion, dtype: float64

Text Processing

For Vectorization
1. TFIDF
2. CountVectorizer
3. NLTK/Scipy Library
4. Pretrained Glove

here we can use the TFIDF to process

Steps for text processing
1. Remove Punctuation
2. Remove Stop words
3. Lower Casing
4. Tokenization
5. Stemming (finding roor word) / Lemmatization (gives complete word)

- 'Isuue' column has text which has to be preprocessed
- The text needs to be trasnformed into vectors so as the algorithms will be able to make predictions. In this case, it will be used the term Frequency = Inverse Document Frequency (TFIDF) weight to evaluate how important a word is to a document in a collection of documents.  
- After removing punctuation and lower casing the words, the importance of work is determined in terms of its frequency.

In [147]:
import nltk

# Download the 'stopwords' resource
#nltk.download('stopwords')
#nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\aakkewar\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [148]:
#create list of stop words which has to be removed
stopwords_list = stopwords.words('english') +list(string.punctuation)

In [149]:
def process_text(issue):
    #create tokens
    tokens = nltk.word_tokenize(issue)
    #remove common stopwords
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    #remove stopwords including few punctuations
    stopwords_removed = [word for word in stopwords_removed if word.isalpha()]
    return stopwords_removed

# concat the strings
def concat_strings(words_list):
    concat_words = ''
    for word in words_list:
        concat_words += word +' '
    return concat_words.strip()

#funtion to lemmatize words and merge each complaint into a single space-seprated string
lemm =  WordNetLemmatizer()

def lemmatizer_concat(words_list):
    #remove any NAN's
    list_of_words = [i for i in words_list if i is not np.nan]
    #lemmatize each word
    lemmatized_list = []
    for idx, word in enumerate(words_list):
        lemmatized_list.append(lemm.lemmatize(word))
    #make the list into a single string with the words separted by
    final_string = concat_strings(lemmatized_list)
    return final_string


Prepare data with text processing

In [150]:
for i in range(len(df1)):
    text = process_text(df1['Issue'].iloc[i])
    final_texts = lemmatizer_concat(text)
    df1['Issue'].iloc[i] = final_texts
    if i % 5000 == 0:
        print(f'Prcessed Row Number {i}')

Prcessed Row Number 0


Prcessed Row Number 5000
Prcessed Row Number 10000
Prcessed Row Number 15000
Prcessed Row Number 20000
Prcessed Row Number 25000
Prcessed Row Number 30000
Prcessed Row Number 35000
Prcessed Row Number 40000
Prcessed Row Number 45000
Prcessed Row Number 50000
Prcessed Row Number 55000
Prcessed Row Number 60000
Prcessed Row Number 65000
Prcessed Row Number 70000


Vectorizing the processed texts

In [151]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidv = TfidfVectorizer(max_features=None, strip_accents='unicode', analyzer='word', ngram_range=(1,2))

#Getting the data after vectorizing issue column
df_vect = tfidv.fit_transform(df1['Issue'])

feature_names = tfidv.get_feature_names_out()

feature_names

array(['account', 'account opening', 'account term', 'acct',
       'acct credited', 'acct wrong', 'action', 'adding', 'adding money',
       'advance', 'advance fee', 'advertising', 'advertising marketing',
       'amount', 'amount charged', 'amt', 'application',
       'application originator', 'application processing', 'applied',
       'applied receive', 'apply', 'apr', 'apr interest', 'arbitration',
       'atm', 'atm card', 'attempt', 'attempt collect', 'available',
       'available promised', 'balance', 'balance transfer', 'bank',
       'bank account', 'bank acct', 'bankruptcy', 'billing',
       'billing dispute', 'billing statement', 'broker', 'ca',
       'ca contact', 'ca repay', 'ca stop', 'card', 'card protection',
       'cash', 'cash advance', 'caused', 'caused fund', 'change',
       'charge', 'charge bank', 'charged', 'charged bank', 'charged fee',
       'charged received', 'check', 'closing', 'closing account',
       'closing management', 'collect', 'collect debt'

Data Preprocessing

Concat old data with vectorized data from issue text column

In [152]:
df1 = pd.concat([df1, pd.DataFrame(df_vect.toarray())], axis=1)

In [153]:
df1.head()

Unnamed: 0,index,Product,Issue,Consumer complaint narrative,State,Consumer consent provided?,Submitted via,Company response to consumer,Timely response?,Consumer disputed?,...,296,297,298,299,300,301,302,303,304,305
0,3098543,Debt collection,improper contact sharing info,I started getting calls from XXXX XXXX XXXX XX...,CA,Consent provided,Web,Closed with explanation,Yes,No,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4014715,Bank account or service,deposit withdrawal,I effected a transfer online of {$3500.00} fro...,FL,Consent provided,Web,Closed with explanation,No,No,...,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0
2,965199,Credit reporting,unable get credit score,"Hello, The FTC has a website recommending XXXX...",CA,Consent provided,Web,Closed with monetary relief,Yes,No,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3087735,Debt collection,communication tactic,after i was trying to figure out why i got a s...,TX,Consent provided,Web,Closed with explanation,Yes,No,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2965803,Mortgage,loan servicing payment escrow account,Chase Bank purchased my mortgage from XXXX XXX...,IN,Consent provided,Web,Closed with explanation,Yes,No,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [154]:
df1.shape

(71400, 317)

In [155]:
#removing issue column
df1.drop(['Issue','index'], axis=1, inplace=True)

In [156]:
df1.head(2)

Unnamed: 0,Product,Consumer complaint narrative,State,Consumer consent provided?,Submitted via,Company response to consumer,Timely response?,Consumer disputed?,days_to_forward_complaint,0,...,296,297,298,299,300,301,302,303,304,305
0,Debt collection,I started getting calls from XXXX XXXX XXXX XX...,CA,Consent provided,Web,Closed with explanation,Yes,No,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bank account or service,I effected a transfer online of {$3500.00} fro...,FL,Consent provided,Web,Closed with explanation,No,No,0,0.0,...,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0


In [157]:
from sklearn.model_selection import train_test_split

X = df1.drop(['Consumer disputed?'], axis=1)
y = df1['Consumer disputed?']

In [158]:
X.shape

(71400, 314)

In [159]:
df1.columns

Index([                     'Product', 'Consumer complaint narrative',
                              'State',   'Consumer consent provided?',
                      'Submitted via', 'Company response to consumer',
                   'Timely response?',           'Consumer disputed?',
          'days_to_forward_complaint',                              0,
       ...
                                  296,                            297,
                                  298,                            299,
                                  300,                            301,
                                  302,                            303,
                                  304,                            305],
      dtype='object', length=315)

Initialize features for transformation

Binary encoding and one-hot encoding are both techniques used in machine learning to transform categorical data into a format that can be used for training models.

In [160]:
#for binary encoder
binary_features = ['Product', 'State', 'Submitted via', 'Company response to consumer']

#for one hot encoding
onehot_features = ['Consumer consent provided?', 'Timely response?', 'State']


Create column transformer for transformation

In [161]:
onehot_encoder_pipeline = Pipeline(steps=[
    ('SimpleImputer', SimpleImputer(strategy='most_frequent')),
    ('BinaryEncoder', BinaryEncoder())

])

In [162]:
binary_encoder_pipeline = Pipeline(steps= [
    ('SimpleImputer', SimpleImputer(strategy='most_frequent')),
    ('BinaryEncoder', BinaryEncoder())

    
])

In [163]:
#getting data pre processor object

preprocessor = ColumnTransformer(
    [
        ("Categorical_Pipeline", onehot_encoder_pipeline, onehot_features),
        ("Binary_encoder_pipeline", binary_encoder_pipeline, binary_features),
        #("Numeric_Pipeline", RobustScaler(), numerical_feature)
    ], remainder='passthrough'
)

In [164]:
X.dtypes

Product                          object
Consumer complaint narrative     object
State                            object
Consumer consent provided?       object
Submitted via                    object
                                 ...   
301                             float64
302                             float64
303                             float64
304                             float64
305                             float64
Length: 314, dtype: object

Transforming the data for modelling

In [166]:
# need to convert all columns to string data type

X.columns = X.columns.astype(str)


# fit transform the train data
X = preprocessor.fit_transform(X)

In [None]:
y

array([1, 1, 1, ..., 0, 0, 0])

Mannually Encoding Target Geature

In [None]:
#manually encoding "Yes" as 0 and "No" as 1
y = np.where(y.values == 'Yes', 0, 1)

Handling Imbalanced Dataset

* Synthtic minority oversampling technique or SMOTE is another technique to oversample the monority class. simply adding duplicate records of minority class ofter don't add any new information to the model
* SMOTE is one of the famous oversampling techniques and is very effective in handling class imbalance. The idea is to combine SMOTE with some undersampling techniques (ENN, Tomek) to increase the effectiveness of handling the imbalanced data

In [None]:
X

array([[1, 0, 1, ..., 0.0, 0.0, 0.0],
       [1, 0, 1, ..., 0.0, 0.0, 0.0],
       [1, 0, 1, ..., 0.0, 0.0, 0.0],
       ...,
       [1, 0, 1, ..., 0.0, 0.0, 0.0],
       [1, 0, 1, ..., 0.0, 0.0, 0.0],
       [1, 0, 1, ..., 0.0, 0.0, 0.0]], dtype=object)

In [None]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [None]:
# Resampling the minority class. The strategy can be changed as required
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=42, sampling_strategy='minority', n_jobs=-1)
X_res, y_res = smt.fit_resample(X, y)

ValueError: could not convert string to float: "About a two months ago, I began receiving threatening phone calls from a debt collector service regarding a online payday loan that I never received. The company threatened that I committed check fraud and that there will be someone coming to my job to arrest me. I spoke to an individual who was very nasty and would not give me much information. They told me I better pay the loan and hung up. I called back and was extremely upset about this call. I was finally able to speak to someone to explain that I needed to pull my bank records from XXXX 2011. I went to the bank and pulled the information and called this company back and was told to fax the information in. WelI, I faxed in this information and called again speaking to different people and I was told that I was okay and should not get any more calls. Two days later, they began calling my cell, my father 's telephone number and my home number. I again attempted to call and explained what I was told and I reached the same nasty person STATING THAT I BETTER PAY THE MONEY AND HUNG UP. I proceeded to keep calling to get someone different and never did and then the gentleman stated that he would have them come after me for harassment. I could n't believe the conversation. I faxed the information again and left it alone. Three weeks later I receive a call on my telephone from a woman stating that she has my business address and have been given some paperwork to have me picked up and processed for this outstanding charge that I never had. This time the amount was huge and they stated that I had to pay it in order for me not to be arrested. With my job on the line I paid XXXX dollars allowing these people to have my debit card. However after getting off the phone crying, I began to contact people and told them what happened and they informed me that I was duped out of my money. The dept collectors company from the XXXX call never provided their name and all I have is telephone number XXXX and the payday loan they mentioned was XXXX XXXX. The next debt collector did provide their name XXXX XXXX XXXX located in PA and XXXX as well as the person who called to threaten that I would be picked up, XXXX. I have tried to call this number back and no one will pick up. I was given a name XXXX XXXX and the company that is now stating I owe for a payday loan is now call Midland Funding."

In [None]:
X

array([[1, 0, 1, ..., 0.0, 0.0, 0.0],
       [1, 0, 1, ..., 0.0, 0.0, 0.0],
       [1, 0, 1, ..., 0.0, 0.0, 0.0],
       ...,
       [1, 0, 1, ..., 0.0, 0.0, 0.0],
       [1, 0, 1, ..., 0.0, 0.0, 0.0],
       [1, 0, 1, ..., 0.0, 0.0, 0.0]], dtype=object)

In [None]:
y

array([1, 1, 1, ..., 0, 0, 0])

Model Selection

Here should understand the various classification models with default values from these models we can choose top 4 with Highest Accuracy score and preceed with HyperParameter tuning

In [None]:
# Function which returns all evaluation metrics for classification model

def evaluate_clf(true, predicted):
    acc  = accuracy_score(true, predicted) # to calculate accuracy
    f1 = f1_score(true, predicted) # to calcualate F1-score
    precision = precision_score(true, predicted) # to calculate precision
    recall = recall_score(true, predicted) # to calculate recall
    roc_auc = roc_auc_score(true, predicted) # to calcualte ROC
    return acc, f1, precision, recall, roc_auc

In [None]:
#Initialize models which are required for model selection

model ={

    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "XGBClassifier": XGBClassifier(),
    "CatBoostClassifier": CatBoostClassifier(verbose=False),
    "AdaBoostClassifier": AdaBoostClassifier()
}

In [None]:
#Create a function which can evaluate models and return a report in Dataframe

def evaluate_models(X, y, models):
    '''
    This functino takes in X and y and models dictionary as input
    It splits the data into Train Test Split
    Iterates through the given model dicttionary and evalutes teh metrics
    Returns: Dataframe which contains report of all models metrics with cost

    '''

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

    models_list = []
    accuracy_list = []
    auc = []

    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train,y_train)

        #make prediction

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Training set performance
        model_train_accurancy, model_train_f1, model_train_precision, model_train_recall, model_train_rocauc_score = evaluate_clf(y_train, y_train_pred)

        #Test set performance
        model_test_accurancy, model_test_f1, model_test_precision, model_test_recall, model_test_rocauc_score = evaluate_clf(y_test, y_test_pred)

        print(list(models.keys())[i])
        models_list.append(list(models.keys())[i])

        print('Model performance for Training set')
        print("- Accuracy: {:.4f}".format(model_train_accurancy))
        print('- F1 score {:.4f}'.format(model_train_f1))
        print('-Precision {:.4f}'.format(model_train_precision))
        print('Recall: {:.4f}'.format(model_train_recall))
        print("- ROC AUC Score {:.4f}".format(model_train_rocauc_score))

        print("=-------------------------------------=")

        print('Model performance for Test set')
        print("- Accuracy: {:.4f}".format(model_test_accurancy))
        print('- F1 score {:.4f}'.format(model_test_f1))
        print('-Precision {:.4f}'.format(model_test_precision))
        print('Recall: {:.4f}'.format(model_test_recall))
        print("- ROC AUC Score {:.4f}".format(model_test_rocauc_score))

        auc.append(model_test_rocauc_score)
        print('='*35)
        print('\n')


    report = pd.DataFrame(list(zip(models_list, accuracy_list)), columns=["Model Name", 'Accuracy']).sort_values(by=['Accuracy'], ascending=False)

    return report

Base report of all models with default parameters

In [None]:
base_report = evaluate_models(X=X_res, y= y_res, models= models)

NameError: name 'X_res' is not defined

In [None]:
base_report