Problem Statement:
    Imagine that you are a medical researcher compiling data for a study. You have collected data about a set of patients, all of whom suffered from the same illness. During their course of treatment, each patient responded to one of 5 medications, Drug A, Drug B, Drug c, Drug x and y.

Part of your job is to build a model to find out which drug might be appropriate for a future patient with the same illness. The features of this dataset are Age, Sex, Blood Pressure, and the Cholesterol of the patients, and the target is the drug that each patient responded to.

It is a sample of multiclass classifier, and you can use the training part of the dataset to build a decision tree, and then use it to predict the class of a unknown patient, or to prescribe a drug to a new patient.

Approach:

A decision tree can be built with target variable drug & all other variable will be independent in the analysis.

In [1]:
#Loading dataset using csv module
import csv

data = []
with open('drug200.csv', 'r') as f:
    csvreader = csv.reader(f)
    headers = next(csvreader) # Get the headers
    for row in csvreader:
        data.append(row)

In [2]:
# Check for null values
for row in data:
    for value in row:
        if value is None or value == "":
            print("Null value found!")
        #else:
           # print('No null values found')

In [3]:
# Check for duplicate values
unique_values = set()
for row in data:
    row_tuple = tuple(row)
    if row_tuple in unique_values:
        print("Duplicate found!")
    else:
        unique_values.add(row_tuple)

In [24]:
import random
class DataFrame:
    def __init__(self, data):
        self.data = data
        self.headers = list(data.keys())
        self.n_rows = len(list(data.values())[0])
        self.n_cols = len(data)
        self.types = self._infer_types()
        
    """Head method prints top 5 rows by default"""
    def head(self, n=5):
        for header in self.headers:
            print(header, end='\t')
        print()
        for i in range(n):
            for j, header in enumerate(self.headers):
                print(self.data[header][i], end='\t')
            print()
            
    """The _infer_types() method infers the data type of each column based on the types of the values in the column.
        It creates a dictionary mapping column names to data types. If all values in a column have the same type,
        that type is used as the column's data type. Otherwise, the column is treated as an object column."""
    def _infer_types(self):
        types = {}
        for header in self.headers:
            column = self.data[header]
            type_set = set(type(x).__name__ for x in column)
            if len(type_set) == 1:
                types[header] = type_set.pop()
            else:
                types[header] = 'object'
        return types
    
    """The _convert_column() method converts the data type of a column to a new type,
        using the built-in int(), float(), or str() functions."""
    def _convert_column(self, header, new_type):
        column = self.data[header]
        if new_type == 'int':
            self.data[header] = [int(x) for x in column]
        elif new_type == 'float':
            self.data[header] = [float(x) for x in column]
        elif new_type == 'str':
            self.data[header] = [str(x) for x in column]
        else:
            raise ValueError('Invalid data type')
    
    """The convert_type() method allows the user to convert the data type of a specific column to a new type.
        It first checks that the column name and new data type are valid,
        then checks whether the current data type of the column matches the new data type.
        If it does, no conversion is needed. Otherwise, it calls the _convert_column() method to perform the conversion 
        and updates the column's data type in the types dictionary."""
    def convert_type(self, header, new_type):
        if header not in self.headers:
            raise ValueError('Invalid header')
        if new_type not in ('int', 'float', 'str'):
            raise ValueError('Invalid data type')
        current_type = self.types[header]
        if new_type == current_type:
            return
        self._convert_column(header, new_type)
        self.types[header] = new_type
        
    def unique_values(self):
        unique_values_dict = {}
        for header in self.headers:
            unique_values_dict[header] = list(set(self.data[header]))
        return unique_values_dict
    
    def cat_to_num(self,column):
        unique_categories=list(set(self.data[column]))#get unique categorical values in column
        mapping= {category: i for i, category in enumerate(unique_categories)}  #create dict to map each unique categorical to a numerical value
        #Replace each categorical value with its corresponding numerical value
        for i in range(self.n_rows):
            self.data[column][i]=mapping[self.data[column][i]]
            
    def split_train_test(self, split_ratio):
        if not 0 <= split_ratio <= 1:
            raise ValueError('Invalid split ratio')
        n_rows = self.n_rows
        n_train = int(split_ratio * n_rows)
        train_indices = set(random.sample(range(n_rows), n_train))
        test_indices = set(range(n_rows)) - train_indices
        train_data = {header: [self.data[header][i] for i in train_indices] for header in self.headers}
        test_data = {header: [self.data[header][i] for i in test_indices] for header in self.headers}
        train_df = DataFrame(train_data)
        test_df = DataFrame(test_data)
        return train_df, test_df


In [5]:
# Convert to dictionary
data_dict = {}
for i, header in enumerate(headers):
    data_dict[header] = [row[i] for row in data]

In [6]:
#convert to dataframe
df = DataFrame(data_dict)

In [7]:
df

<__main__.DataFrame at 0x29c05abbf70>

In [8]:
#Shows top 5 rows
df.head()

Age	Sex	BP	Cholesterol	Na_to_K	Drug	
23	F	HIGH	HIGH	25.355	drugY	
47	M	LOW	HIGH	13.093	drugC	
47	M	LOW	HIGH	10.114	drugC	
28	F	NORMAL	HIGH	7.798	drugX	
61	F	LOW	HIGH	18.043	drugY	


In [9]:
#shape of dataframe
df.n_rows,df.n_cols

(200, 6)

In [10]:
# Print data types
print(df.types)

{'Age': 'str', 'Sex': 'str', 'BP': 'str', 'Cholesterol': 'str', 'Na_to_K': 'str', 'Drug': 'str'}


In [11]:
# Convert Age column to float
df.convert_type('Age', 'int')
df.convert_type('Na_to_K','float')

In [12]:
# Print data types
print(df.types)

{'Age': 'int', 'Sex': 'str', 'BP': 'str', 'Cholesterol': 'str', 'Na_to_K': 'float', 'Drug': 'str'}


In [13]:
unique_values_dict = df.unique_values()
print(unique_values_dict)

{'Age': [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74], 'Sex': ['F', 'M'], 'BP': ['NORMAL', 'HIGH', 'LOW'], 'Cholesterol': ['NORMAL', 'HIGH'], 'Na_to_K': [6.683, 7.798, 8.607, 7.298, 10.114, 11.037, 11.767, 13.972, 13.093, 12.703, 15.171, 16.275, 15.376, 18.043, 19.368, 19.199, 20.942, 15.516, 19.128, 25.355, 25.974, 25.917, 22.697, 27.183, 30.568, 30.366, 31.876, 33.486, 31.686, 27.826, 29.875, 29.45, 29.271, 38.247, 35.639, 8.75, 8.107, 37.188, 9.381, 9.567, 9.445, 9.677, 9.945, 10.189, 10.832, 10.067, 11.198, 11.326, 11.424, 11.939, 11.009, 12.854, 12.006, 11.262, 11.686, 11.567, 13.884, 15.79, 15.015, 15.436, 16.594, 16.725, 16.85, 16.753, 16.347, 17.951, 17.211, 17.225, 18.457, 18.295, 18.348, 18.991, 19.796, 19.161, 7.845, 20.909, 21.036, 22.905, 22.963, 23.091, 23.003, 24.658, 25.475, 25.969, 25.786,

In [14]:
#converting all categorical columns to numerical
df.cat_to_num('Drug')
df.cat_to_num('Sex')
df.cat_to_num('BP')
df.cat_to_num('Cholesterol')

In [15]:
df.head()

Age	Sex	BP	Cholesterol	Na_to_K	Drug	
23	0	1	1	25.355	0	
47	1	2	1	13.093	1	
47	1	2	1	10.114	1	
28	0	0	1	7.798	2	
61	0	2	1	18.043	0	


In [30]:
# Partition the dataset into training and testing datasets

train_df, test_df = df.split_train_test(0.7)

In [45]:
#shape of train data
train_df.n_rows,train_df.n_cols

(140, 6)

In [46]:
#shape of test data
test_df.n_rows,test_df.n_cols

(60, 6)

In [38]:
target_col= 'Drug'
# Get all column names except for the target column
feature_cols = [col for col in train_df.headers if col != target_col]

# Create X and Y dataframes by selecting the appropriate columns
X_train = DataFrame({col: train_df.data[col] for col in feature_cols})
y_train = DataFrame({target_col: train_df.data[target_col]})
X_test = DataFrame({col: test_df.data[col] for col in feature_cols})
y_test = DataFrame({target_col: test_df.data[target_col]})

In [49]:
X_train.head()

Age	Sex	BP	Cholesterol	Na_to_K	
23	0	1	1	25.355	
47	1	2	1	10.114	
61	0	2	1	18.043	
22	0	0	1	8.607	
49	0	0	1	16.275	


In [51]:
y_train.head()

Drug	
0	
1	
0	
2	
0	
