In [78]:
import csv #read and write CSV files in Python
import random #This module provides various functions to generate random numbers and random choices.

In [79]:
import pandas as pd

In [80]:
#Loading dataset using csv module
data = []
with open('Bank Client Deposit Data set Classification.csv', 'r') as f: #opens the file in read mode
    csvreader = csv.reader(f)
    headers = next(csvreader) # Get the headers
    for row in csvreader: #iterates through the remaining rows in the csvreader object
        data.append(row) #each row is appended as a list to the "data" list

In [81]:
#Display the first row that is the header of dataset
headers

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'y']

In [82]:
# Convert data types to int or float if possible
for row in data:
    for i in range(len(row)):
        if row[i].isdigit():
            row[i] = int(row[i])
        else:
            try:
                row[i] = float(row[i])
            except ValueError:
                pass

# Get list of data types for each column
data_types = []
for i in range(len(headers)):
    types = set([type(row[i]).__name__ for row in data])
    data_types.append(types)

# Print headers and data types
print('{:<26} {}'.format('Column Name', 'Data Types'))
print('-' * 27, ' ', '-' * 12)
for i in range(len(headers)):
    print('{:<26} {}'.format(headers[i], data_types[i]))


Column Name                Data Types
---------------------------   ------------
age                        {'int'}
job                        {'str'}
marital                    {'str'}
education                  {'str'}
default                    {'str'}
balance                    {'float', 'int'}
housing                    {'str'}
loan                       {'str'}
contact                    {'str'}
day                        {'int'}
month                      {'str'}
duration                   {'int'}
campaign                   {'int'}
pdays                      {'float', 'int'}
previous                   {'int'}
poutcome                   {'str'}
y                          {'str'}


In [83]:
"""Defines a class DataFrame with an __init__ method. 
   The DataFrame class represents a two-dimensional table-like data structure that can hold data of any type.
   It can be used to perform operations and manipulations on data in a tabular format."""
class DataFrame:
    def __init__(self, data): #The __init__ method initializes a new instance of the DataFrame class.
        self.data = data #assigns the input dictionary to the instance variable data.
        self.headers = list(data.keys()) #creates a list of column headers by extracting the keys of the input dictionary.
        self.n_rows = len(list(data.values())[0]) #determines the number of rows in the dataframe by extracting the first value list and checking its length.
        self.n_cols = len(data) #determines the number of columns in the dataframe by getting the length of the input dictionary.
        self.types = self._infer_types() #infer the data types of each column and store them in a list.
        
    """Head method prints top 5 rows by default"""
    def head(self, n=5):
        for header in self.headers: #iterates over each column header in the dataframe.
            print(header, end='\t')
        print()
        for i in range(n):
            for j, header in enumerate(self.headers): #creates a loop that iterates over each column header in the dataframe, along with its index.
                print(self.data[header][i], end='\t')
            print()
            
    def values(self):
        return [list(self.data[header]) for header in self.headers]
            
    """The _infer_types() method infers the data type of each column based on the types of the values in the column.
        It creates a dictionary mapping column names to data types. If all values in a column have the same type,
        that type is used as the column's data type. Otherwise, the column is treated as an object column."""
    def _infer_types(self):
        types = {} #initializes an empty dictionary
        for header in self.headers: #iterates over each column header in the dataframe
            column = self.data[header] #retrieves the column of data associated with the current header.
            type_set = set(type(x).__name__ for x in column) #creates a set of unique data types in the column
            if len(type_set) == 1: #checks if the column contains only one type of data
                types[header] = type_set.pop() # adds the inferred data type to the types dictionary
            else:
                types[header] = 'object'
        return types
    
    """The _convert_column() method converts the data type of a column to a new type,
        using the built-in int(), float(), or str() functions."""
    def _convert_column(self, header, new_type):
        column = self.data[header] #retrieves the column of data associated with the current header
        if new_type == 'int': #checks if the desired data type is int
            self.data[header] = [int(x) for x in column] #converts each value in the column to int by replacing the old column with the new column of integers
        elif new_type == 'float': #checks if the desired data type is float
            self.data[header] = [float(x) for x in column]
        elif new_type == 'str': #checks if the desired data type is string/object
            self.data[header] = [str(x) for x in column]
        else:
            raise ValueError('Invalid data type') #handles the case where the desired data type is not one of the supported data types
    
    """The convert_type() method allows the user to convert the data type of a specific column to a new type.
         three arguments: self (which is a reference to the current instance of the class),
                          header (which is the header of the column to convert), and 
                          new_type (which is the desired data type to convert the column to).."""
    def convert_type(self, header, new_type):
        if header not in self.headers: #checks if the header is valid
            raise ValueError('Invalid header')
        if new_type not in ('int', 'float', 'str'): #checks if the desired data type is valid
            raise ValueError('Invalid data type')
        current_type = self.types[header] #retrieves the current data type of the column associated with the specified header
        if new_type == current_type: #checks if the column is already of the desired data type
            return
        self._convert_column(header, new_type) #converts the column of data to the specified data type using the private _convert_column method defined earlier.
        self.types[header] = new_type #updates the data type of the column in the types dictionary.
     
    """The unique_values method returns a dictionary where the keys are the headers of the columns 
       and the values are lists of the unique values in each column."""
    def unique_values(self):
        unique_values_dict = {} # initializes an empty dictionary to store the unique values.
        for header in self.headers: #iterates over each header in the dataframe
            unique_values_dict[header] = list(set(self.data[header])) 
        return unique_values_dict #returns dictinory containing unique values
    
    """he cat_to_num method takes a single argument, 
       column: which is the name of the categorical column in the dataframe that the method will convert to numerical values"""
    def cat_to_num(self,column):
        unique_categories=list(set(self.data[column]))#get unique categorical values in column
        mapping= {category: i for i, category in enumerate(unique_categories)}  #create dict to map each unique categorical to a numerical value
        """The enumerate() function is used to generate a unique numerical value for each unique categorical value in the unique_categories list"""
        #Replace each categorical value with its corresponding numerical value
        for i in range(self.n_rows): #iterates over each row in the specified column
            self.data[column][i]=mapping[self.data[column][i]] #replaces the categorical value in the current row with its corresponding numerical value using the mapping dictionary
     
    def columns(self, header):
        # Retrieves a specific column of data by header.
        if header not in self.headers:
            raise ValueError('Invalid header')
        return self.data[header]
    
    """The split_train_test method takes a single argument, 
    split_ratio : ratio of rows to be included in the training set. The remaining rows will be included in the testing set."""
    def split_train_test(self, split_ratio):
        if not 0 <= split_ratio <= 1: #checks if the specified split_ratio is between 0 and 1
            raise ValueError('Invalid split ratio')
        n_rows = self.n_rows #gets the total number of rows in the dataframe
        n_train = int(split_ratio * n_rows) #calculates the number of rows to be included in the training set based on the specified split_ratio
        train_indices = set(random.sample(range(n_rows), n_train)) #randomly selects n_train row indices from the total row indices using the random.sample() function
        test_indices = set(range(n_rows)) - train_indices #contains all row indices not included in the train_indices set
        
        train_data = {header: [self.data[header][i] for i in train_indices] for header in self.headers} #iterate over each column in the dataframe and create a list of values for each column using the row indices in the train_indices set
        test_data = {header: [self.data[header][i] for i in test_indices] for header in self.headers} # iterate over each column in the dataframe and create a list of values for each column using the row indices in the test_indices set.
        train_df = DataFrame(train_data) #create a new dataframe train_df
        test_df = DataFrame(test_data)# create a new dataframe test_df
        return train_df, test_df


In [84]:
# Convert to dictionary
data_dict = {}
for i, header in enumerate(headers): #iterates over the indices and headers of the list headers using the enumerate() 
    data_dict[header] = [row[i] for row in data]

In [85]:
#convert to dataframe
df = DataFrame(data_dict)

In [86]:
df.head(20)

age	job	marital	education	default	balance	housing	loan	contact	day	month	duration	campaign	pdays	previous	poutcome	y	
58	management	married	tertiary	no	2143	yes	no	unknown	5	may	261	1	-1.0	0	unknown	no	
44	technician	single	secondary	no	29	yes	no	unknown	5	may	151	1	-1.0	0	unknown	no	
33	entrepreneur	married	secondary	no	2	yes	yes	unknown	5	may	76	1	-1.0	0	unknown	no	
47	blue-collar	married	unknown	no	1506	yes	no	unknown	5	may	92	1	-1.0	0	unknown	no	
33	unknown	single	unknown	no	1	no	no	unknown	5	may	198	1	-1.0	0	unknown	no	
35	management	married	tertiary	no	231	yes	no	unknown	5	may	139	1	-1.0	0	unknown	no	
28	management	single	tertiary	no	447	yes	yes	unknown	5	may	217	1	-1.0	0	unknown	no	
42	entrepreneur	divorced	tertiary	yes	2	yes	no	unknown	5	may	380	1	-1.0	0	unknown	no	
58	retired	married	primary	no	121	yes	no	unknown	5	may	50	1	-1.0	0	unknown	no	
43	technician	single	secondary	no	593	yes	no	unknown	5	may	55	1	-1.0	0	unknown	no	
41	admin.	divorced	secondary	no	270	yes	no	unknown	5	

In [87]:
#shape of dataframe
df.n_rows,df.n_cols

(45211, 17)

In [88]:
# Print data types
print(df.types)

{'age': 'int', 'job': 'str', 'marital': 'str', 'education': 'str', 'default': 'str', 'balance': 'object', 'housing': 'str', 'loan': 'str', 'contact': 'str', 'day': 'int', 'month': 'str', 'duration': 'int', 'campaign': 'int', 'pdays': 'object', 'previous': 'int', 'poutcome': 'str', 'y': 'str'}


In [89]:
df.unique_values()

{'age': [18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  92,
  93,
  94,
  95],
 'job': ['management',
  'entrepreneur',
  'admin.',
  'housemaid',
  'unemployed',
  'services',
  'retired',
  'blue-collar',
  'unknown',
  'self-employed',
  'student',
  'technician'],
 'marital': ['divorced', 'single', 'married'],
 'education': ['unknown', 'secondary', 'primary', 'tertiary'],
 'default': ['no', 'yes'],
 'balance': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33

In [90]:
#User defined fuction to check for null values
"""This defines the function check_null_values with two arguments: 
the dataset 'data' and the list of special characters to treat as null values 'null_values'."""
def check_null_values_all_cols(data, null_values):
    num_cols = len(data[0])
    # iterates over each column in the dataset using the 'range' object 'num_cols'
    for col_idx in range(num_cols):
        null_found = False
        for row in data:
            value = row[col_idx]
            if value is None or str(value).strip() in null_values:
                null_found = True
                break
        if null_found:
            print(f"Null value found in column {col_idx}!")
        else:
            print(f"No null values found in column {col_idx}.")

In [91]:
null_values=["", "NA", "Na", "nA", "na", "N/A", "N/a", "n/A", "n/a"] #List containing null values(modify according to data)
check_null_values_all_cols(data,null_values)

No null values found in column 0.
No null values found in column 1.
No null values found in column 2.
No null values found in column 3.
No null values found in column 4.
No null values found in column 5.
No null values found in column 6.
No null values found in column 7.
No null values found in column 8.
No null values found in column 9.
No null values found in column 10.
No null values found in column 11.
No null values found in column 12.
No null values found in column 13.
No null values found in column 14.
No null values found in column 15.
No null values found in column 16.


In [92]:
class CustomImputer():
    def __init__(self, col_idx, strategy, null_values=["", "NA", "Na", "nA", "na", "N/A", "N/a", "n/A", "n/a"]):
        self.col_idx = col_idx
        self.strategy = strategy
        self.null_values = null_values

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        col = [row[self.col_idx] for row in X if row[self.col_idx] not in self.null_values]
        if self.strategy == "mean":
            val = sum(col) / len(col)
        elif self.strategy == "median":
            col.sort()
            mid = len(col) // 2
            if len(col) % 2 == 0:
                val = (col[mid-1] + col[mid]) / 2
            else:
                val = col[mid]
        elif self.strategy == "mode":
            val = max(set(col), key = col.count)
        for j in range(len(X)):
            if X[j][self.col_idx] in self.null_values:
                X[j][self.col_idx] = val
        return X


In [93]:
# Check for duplicate values
unique_values = set()#Intialise empty set
for row in data:
    row_tuple = tuple(row)
    if row_tuple in unique_values:
        print("Duplicate found!")
    else:
        unique_values.add(row_tuple)

In [94]:
class OutlierTreatment():
    def __init__(self, method='iqr', multiplier=1.5):
        self.method = method
        self.multiplier = multiplier
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        if self.method == 'iqr':
            Q1 = np.percentile(X, 25, axis=0)
            Q3 = np.percentile(X, 75, axis=0)
            IQR = Q3 - Q1
            lower = Q1 - self.multiplier * IQR
            upper = Q3 + self.multiplier * IQR
            return np.clip(X, lower, upper)
        
        elif self.method == 'zscore':
            Z = np.abs(stats.zscore(X))
            return X[(Z < self.multiplier).all(axis=1)]


In [95]:
class CategoricalToNumerical():
    def __init__(self):
        self.columns = None
        self.encoders = None
        
    def fit(self, X, y=None):
        self.columns = X.columns
        self.encoders = {}
        for col in X.columns:
            if X[col].dtype == 'object':
                self.encoders[col] = LabelEncoder().fit(X[col])
        return self
    
    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            for col, encoder in self.encoders.items():
                X[col] = encoder.transform(X[col])
            return X
        else:
            for col, encoder in self.encoders.items():
                X[:, self.columns.get_loc(col)] = encoder.transform(X[:, self.columns.get_loc(col)])
            return X


In [96]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        ''' constructor ''' 
        
        # for decision node
        self.feature_index = feature_index #the index of the feature that this node splits on
        self.threshold = threshold #the threshold value used to split the data at this node
        self.left = left #the left child of this node
        self.right = right #the right child of this node
        self.info_gain = info_gain #the information gain obtained by splitting the data at this node
        
        # for leaf node
        self.value = value #the predicted value of the target variable at this node

In [97]:
class DecisionTreeClassifier():
    def __init__(self, min_samples_split=2, max_depth=2):
        ''' The __init__() function is the constructor of the DecisionTreeClassifier class. It takes two parameters:

            min_samples_split: the minimum number of samples required to split a node
            max_depth: the maximum depth of the tree '''
        
        # initialize the root of the tree 
        self.root = None
        
        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        
    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree ''' 
        
        X, Y = dataset[:,:-1], dataset[:,-1]#The X and Y variables represent the features and the target variable of the dataset
        num_samples, num_features = np.shape(X)#number of samples and features in the dataset
        
        # split until stopping conditions are met
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive
            if best_split["info_gain"]>0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"], 
                            left_subtree, right_subtree, best_split["info_gain"])
        
        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)
    
    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''
        
        # dictionary to store the best split
        best_split = {}
        max_info_gain = -float("inf")
        
        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute information gain
                    curr_info_gain = self.information_gain(y, left_y, right_y)
                    # update the best split if needed
                    if curr_info_gain>max_info_gain:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain"] = curr_info_gain
                        max_info_gain = curr_info_gain
                        
        # return best split
        return best_split
    
    def split(self, dataset, feature_index, threshold):
        ''' the split method is used to split the dataset into two subsets based on a given threshold value and a feature index. It takes the following parameters:

            dataset: The dataset to split.
            feature_index: The index of the feature to split on.
            threshold: The threshold value to split on.'''
        
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])#contains all the rows where the feature value is less than or equal to the threshold
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        return dataset_left, dataset_right
    
    def information_gain(self, parent, l_child, r_child):
        ''' function to compute information gain '''
        # calculate entropy of parent node
        parent_entropy = self.entropy(parent)
    
        # calculate entropy of left child node
        left_entropy = self.entropy(l_child)
        
        # calculate entropy of right child node
        right_entropy = self.entropy(r_child)
    
        # calculate weighted average entropy of child nodes
        child_entropy = (len(l_child)/len(parent))*left_entropy + (len(r_child)/len(parent))*right_entropy
    
        # calculate information gain
        gain = parent_entropy - child_entropy
    
        return gain

    
    def entropy(self, y):
        ''' function to compute entropy '''
        
        class_labels = np.unique(y)#gets an array of unique class labels in the set of samples.
        entropy = 0 #This initializes the entropy variable to zero.
        for cls in class_labels: #loops over each class label in the array of unique class labels.
            p_cls = len(y[y == cls]) / len(y) #calculates the probability of a sample 
            entropy += -p_cls * np.log2(p_cls) 
        return entropy
    
        
    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        
        Y = list(Y)
        return max(Y, key=Y.count)#calculates the majority class label for a leaf node
    
    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree '''
    
        if not tree:
            tree = self.root
    
        if tree.value is not None:
            print(tree.value)
    
        else:
            print("X"+str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
        
            print(indent + "left: ", end="")
            self.print_tree(tree.left, indent + " ")
        
            print(indent + "right: ", end="")
            self.print_tree(tree.right, indent + " ")

    
    def fit(self, X, Y):
        ''' function to train the tree '''
        
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
    
    def predict(self, X):
        ''' function to predict the class labels '''
        # convert X to numpy array
        if isinstance(X, pd.DataFrame):
            X = X.values
        
        predictions = [self.make_prediction(x, self.root) for x in X]
        return predictions
    
    def make_prediction(self, x, tree):
        ''' function to traverse the tree and make predictions '''
        
        # leaf node
        if tree.value != None:
            return tree.value
        
        feature_val = x[tree.feature_index]
        if feature_val <= tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)
        
    

In [98]:
class Pipeline():
    def __init__(self, steps):
        self.steps = steps
    
    def fit_transform(self, X):
        for step in self.steps:
            X = step[1].fit_transform(X)
        return X
    
    def transform(self, X):
        for step in self.steps:
            X = step[1].transform(X)
        return X
    
    def fit(self, X, y=None):
        for step in self.steps: 
            X = step[1].fit(X, y).transform(X)
        return self



In [99]:
preprocessing_pipeline = Pipeline([
     ('encoder', CategoricalToNumerical()),
    ('outlier_treatment', OutlierTreatment())
])

In [100]:
preprocessing_pipeline.fit(df)

TypeError: 'method' object is not iterable