CART regression algorithm.

In [None]:
import numpy as np
import pandas as pd 
class Node:
    """One vertex of a binary regression tree.

    An internal node carries a split (``feature``, ``threshold``) and two
    children; a leaf carries only ``value``.  ``value`` is keyword-only so
    that a leaf has to be created explicitly as ``Node(value=...)``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Split description; left at None for leaves.
        self.feature, self.threshold = feature, threshold
        # Child subtrees; left at None for leaves.
        self.left, self.right = left, right
        # Stored prediction; None marks an internal node.
        self.value = value

    def is_leaf_node(self):
        """Return True when this node stores a prediction value."""
        is_internal = self.value is None
        return not is_internal


# Decision Tree Regressor Class
class RegressionTree:
    """CART-style regression tree that minimises mean squared error.

    Every internal node sends samples with ``feature <= threshold`` to the
    left child and all remaining samples to the right child; a leaf predicts
    the mean target of the training samples that reached it.  Tree vertices
    are instances of the module-level ``Node`` class.

    Parameters
    ----------
    n_feats : int or None
        Stored (and capped to the number of columns in ``fit``).  It is not
        used when searching for splits: every column is always considered.
    max_depth : int
        A node at this depth is never split.
    min_samples_split : int
        Minimum number of samples a node must hold to be considered for
        splitting (same meaning as the scikit-learn parameter of that name).
    min_samples_leaf, min_weight_fraction_leaf, max_features, random_state,
    max_leaf_nodes, min_impurity_decrease, ccp_alpha
        Accepted so the constructor can be called like scikit-learn's
        ``DecisionTreeRegressor``; they are currently ignored.
    """

    def __init__(self, n_feats=None, max_depth=100, min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                 max_features=None, random_state=None, max_leaf_nodes=None,
                 min_impurity_decrease=0.0, ccp_alpha=0.0):
        self.root = None
        self.col = None  # training column labels, filled in by fit()
        self.n_feats = n_feats
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, Y):
        """Grow the tree on features ``X`` and targets ``Y``.

        ``X`` is normally a DataFrame; any other 2-D array-like is wrapped in
        one (columns are then labelled 0..k-1).  Returns ``self``.

        Raises
        ------
        ValueError
            If ``X`` has no rows or ``X`` and ``Y`` differ in length.
        """
        if isinstance(X, pd.DataFrame):
            targets = self._target_array(X, Y)
        else:
            # A freshly built frame has no index to align a Series against,
            # so the targets are taken positionally.
            X = pd.DataFrame(X)
            targets = np.asarray(Y, dtype=float).ravel()

        if X.shape[0] == 0:
            raise ValueError("Cannot fit a regression tree on an empty dataset.")
        if targets.shape[0] != X.shape[0]:
            raise ValueError("X and Y must contain the same number of samples.")

        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        self.col = list(X.columns)
        self.root = self.growTree(X, targets)
        return self

    def growTree(self, X, Y, depth=0):
        """Recursively build and return the subtree for the samples ``X``/``Y``."""
        y = self._target_array(X, Y)
        n_sample = X.shape[0]
        leaf_value = np.mean(y)

        # Stopping criteria: depth limit reached, or too few samples to split.
        if depth >= self.max_depth or n_sample < self.min_samples_split:
            return Node(value=leaf_value)

        best_feat, best_thresh = self.best_criteria(X, y, list(X.columns))
        if best_feat is None:
            # No split lowers the error (e.g. constant targets or constant
            # features): finish with a leaf instead of indexing with None.
            return Node(value=leaf_value)

        # Same rule as prediction: "<= threshold" goes left, the rest right.
        goes_left = X[best_feat].to_numpy() <= best_thresh
        goes_right = ~goes_left
        if not goes_left.any() or not goes_right.any():
            return Node(value=leaf_value)

        left = self.growTree(X[goes_left], y[goes_left], depth + 1)
        right = self.growTree(X[goes_right], y[goes_right], depth + 1)
        return Node(best_feat, best_thresh, left, right)

    # find out best criteria
    def best_criteria(self, X, Y, feats_idxs):
        """Return ``(feature, threshold)`` of the split with the lowest MSE.

        Candidate thresholds are the midpoints between consecutive *distinct*
        sorted values of each feature, so both children are always non-empty
        and "<= threshold" describes exactly the partition that was scored.
        Returns ``(None, None)`` when no candidate is strictly better than
        leaving the node unsplit.
        """
        y = np.asarray(Y, dtype=float).ravel()
        n = y.shape[0]
        best_feature = None
        best_thresh = None
        if n < 2:
            return (best_feature, best_thresh)

        # Work with centred targets: the squared error is unchanged and the
        # prefix-sum formula below loses less precision.
        centred = y - np.mean(y)
        best_mse = np.sum(centred ** 2) / n  # error of the unsplit node

        for feat in feats_idxs:
            values = np.asarray(X[feat], dtype=float)
            order = np.argsort(values, kind="stable")
            xs = values[order]
            ys = centred[order]

            # Positions i where a cut between xs[i] and xs[i + 1] separates
            # two different values.
            cut = np.flatnonzero(xs[:-1] < xs[1:])
            if cut.size == 0:
                continue
            lower, upper = xs[cut], xs[cut + 1]
            midpoints = (lower + upper) / 2.0
            # If rounding pushes the midpoint onto the upper value, fall back
            # to the lower one so "<= threshold" keeps the intended partition.
            thresholds = np.where(midpoints < upper, midpoints, lower)

            n_left = cut + 1
            n_right = n - n_left
            running_sum = np.cumsum(ys)
            running_sq = np.cumsum(ys ** 2)
            left_sum, left_sq = running_sum[cut], running_sq[cut]
            right_sum = running_sum[-1] - left_sum
            right_sq = running_sq[-1] - left_sq

            # Sum of squared deviations from each side's own mean:
            # sum(v^2) - (sum v)^2 / count.
            sse = (left_sq - left_sum ** 2 / n_left) + (right_sq - right_sum ** 2 / n_right)
            split_mse = np.maximum(sse, 0.0) / n  # clip tiny negative round-off

            pick = int(np.argmin(split_mse))
            if split_mse[pick] < best_mse:
                best_mse = split_mse[pick]
                best_feature = feat
                best_thresh = thresholds[pick]

        return (best_feature, best_thresh)

    def get_mse(self, y_true, y_hat):
        """Return the mean squared difference between ``y_true`` and ``y_hat``."""
        residual = np.asarray(y_true, dtype=float) - np.asarray(y_hat, dtype=float)
        return np.sum(residual ** 2) / len(y_true)

    def moving_average(self, x:np.array, window : int):
        """Return the mean of every ``window`` consecutive entries of ``x``."""
        return np.convolve(x, np.ones(window), 'valid') / window

    def predict(self, X):
        """Return an array with one prediction per row of ``X``.

        Columns are matched to the training columns by position.

        Raises
        ------
        RuntimeError
            If the tree has not been fitted yet.
        """
        if self.root is None:
            raise RuntimeError("RegressionTree.predict called before fit.")
        rows = np.asarray(X)
        positions = self._feature_positions()
        return np.array([self._descend(row, self.root, positions) for row in rows])

    def traverse_tree(self, x, node):
        """Return the leaf value reached by sample ``x`` starting at ``node``."""
        return self._descend(x, node, self._feature_positions())

    def _feature_positions(self):
        """Map each training column label to its first position in ``self.col``."""
        positions = {}
        for idx, name in enumerate(self.col):
            positions.setdefault(name, idx)
        return positions

    @staticmethod
    def _descend(x, node, positions):
        """Walk from ``node`` to a leaf following the split rules for ``x``."""
        while node.value is None:
            if x[positions[node.feature]] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    @staticmethod
    def _target_array(X, Y):
        """Return ``Y`` as a 1-D float array ordered like the rows of ``X``."""
        if (isinstance(Y, pd.Series) and isinstance(X, pd.DataFrame)
                and not Y.index.equals(X.index)):
            # Align by index label when the two indexes differ.
            Y = Y.reindex(X.index)
        return np.asarray(Y, dtype=float).ravel()

In [None]:
class Util:
    """Namespace for small evaluation helpers."""

    @staticmethod
    def getMSEError(y_test, yp):
        """Return the mean squared error between ``y_test`` and ``yp``.

        Both arguments may be lists, NumPy arrays or pandas Series; they are
        compared element by element in positional order.  The squared
        differences are summed and divided by ``len(y_test)``.

        Raises
        ------
        ValueError
            If ``y_test`` is empty.
        """
        actual = np.asarray(y_test, dtype=float)
        predicted = np.asarray(yp, dtype=float)
        n = len(actual)
        if n == 0:
            raise ValueError("getMSEError needs at least one sample.")
        return np.sum((actual - predicted) ** 2) / n

Regression models for life expectancy prediction.

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score,precision_score,recall_score,f1_score,mean_absolute_error, r2_score, mean_squared_error

data = pd.read_csv('LifeExpectancyData.csv')
#data.head()
data.isnull().sum()

# Replace missing values in these columns with the column mean.
# The result is assigned back to the frame: calling fillna(inplace=True) on
# data[col] is chained assignment, which newer pandas versions warn about and
# which leaves the frame unchanged when Copy-on-Write is enabled.
columns_to_impute = [
    'Life expectancy ',
    'Adult Mortality',
    'Alcohol',
    'Hepatitis B',
    ' BMI ',
    'Polio',
    'Total expenditure',
    'Diphtheria ',
    'GDP',
    'Population',
    ' thinness  1-19 years',
    ' thinness 5-9 years',
    'Income composition of resources',
    'Schooling',
]
for column in columns_to_impute:
    data[column] = data[column].fillna(data[column].mean())


# Label-encode the 'Country' and 'Status' columns as integer codes.
le = LabelEncoder()
data['Country'] = le.fit_transform(data['Country'])

le = LabelEncoder()
data['Status'] = le.fit_transform(data['Status'])

x = data.drop('Life expectancy ', axis=1)
y = data['Life expectancy ']
# No random_state is given, so the split (and every metric below) differs
# from run to run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Regression tree implemented in this notebook.
regression_tree = RegressionTree(max_depth = 15,min_samples_split = 20)
regression_tree.fit(X_train, y_train)
y_predict = regression_tree.predict(X_test)

print('Mean Squared Error :', round(mean_squared_error(y_test, y_predict),3))
print('Root Mean Squared Error :', round(np.sqrt(mean_squared_error(y_test, y_predict)),3))
print('Mean Absolute Squared Error :', round(mean_absolute_error(y_test, y_predict),3))
print("r2_score:", round(r2_score(y_test, y_predict),3))

# compare with scikit learn regressor
scikit_regressor = DecisionTreeRegressor(max_depth=15, min_samples_split=20)
scikit_regressor.fit(X_train, y_train)
yp = scikit_regressor.predict(X_test)
print('Mean Squared Error for DecisionTreeRegressorModel learn:', round(mean_squared_error(y_test, yp),3))
# Uses yp (the scikit-learn predictions), like the other three metrics here.
print('Root Mean Squared Error for DecisionTreeRegressorModel learn:', round(np.sqrt(mean_squared_error(y_test, yp)),3))
print('Mean Absolute Squared Error for DecisionTreeRegressorModel learn:', round(mean_absolute_error(y_test, yp),3))
print("r2_score for DecisionTreeRegressorModel learn:", round(r2_score(y_test, yp),2))

print('----------------------------------------------------')

RandomForestRegressorModel = RandomForestRegressor(n_estimators=100,max_depth=15, random_state=33)
RandomForestRegressorModel.fit(X_train, y_train)
y_random_pred = RandomForestRegressorModel.predict(X_test)


print('Mean Squared Error for RandomForestRegressorModel learn:', round(mean_squared_error(y_test, y_random_pred),3))
print('Root Mean Squared Error for RandomForestRegressorModel learn:', round(np.sqrt(mean_squared_error(y_test, y_random_pred)),3))
print('Mean Absolute Squared Error for RandomForestRegressorModel learn:', round(mean_absolute_error(y_test, y_random_pred),3))
print("r2_score of RandomForestRegressorModel:", round(r2_score(y_test, y_random_pred),2))
print('----------------------------------------------------')

#Linear regression
linearReg = LinearRegression()
linearReg.fit(X_train, y_train)
yp_linear = linearReg.predict(X_test)

print('Mean Squared Error for LinearRegression learn:', round(mean_squared_error(y_test, yp_linear),3))
print('Root Mean Squared Error for LinearRegression learn:', round(np.sqrt(mean_squared_error(y_test, yp_linear)),3))
print('Mean Absolute Squared Error for LinearRegression learn:', round(mean_absolute_error(y_test, yp_linear),3))
print("r2_score of LinearRegression:", round(r2_score(y_test, yp_linear),2))


Mean Squared Error : 6.645
Root Mean Squared Error : 2.578
Mean Absolute Squared Error : 1.736
r2_score: 0.927
Mean Squared Error for DecisionTreeRegressorModel learn: 6.591
Root Mean Squared Error for DecisionTreeRegressorModel learn: 2.578
Mean Absolute Squared Error for DecisionTreeRegressorModel learn: 1.718
r2_score for DecisionTreeRegressorModel learn: 0.93
----------------------------------------------------
Mean Squared Error for RandomForestRegressorModel learn: 4.07
Root Mean Squared Error for RandomForestRegressorModel learn: 2.017
Mean Absolute Squared Error for RandomForestRegressorModel learn: 1.242
r2_score of RandomForestRegressorModel: 0.96
----------------------------------------------------
Mean Squared Error for LinearRegression learn: 16.532
Root Mean Squared Error for LinearRegression learn: 4.066
Mean Absolute Squared Error for LinearRegression learn: 3.016
r2_score of LinearRegression: 0.82


Regression models for Heart Disease prediction

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression


heart_data = pd.read_csv("framingham.csv")
heart_data.head()
#heart_data.info()
#plt.figure(figsize=(20,10))
#sns.heatmap(heart_data.corr(),annot = True, cmap = 'coolwarm')

# Correlation of every column with the target, strongest first.
std_corr = heart_data.corr()
std_corr['TenYearCHD'].sort_values(ascending = False)

heart_data.isnull().sum()

# data cleaning
heart_data_X = heart_data.drop(columns = ['education','TenYearCHD','currentSmoker'])
y = heart_data['TenYearCHD']

# Missing cigsPerDay / BPMeds become 0; the remaining columns get the column
# mean.  totChol and heartRate were previously filled with the standard
# deviation, which is a measure of spread rather than a typical value and was
# inconsistent with how BMI and glucose are filled.
X = heart_data_X.fillna({'cigsPerDay': 0, 'BPMeds': 0 ,'totChol': heart_data_X['totChol'].mean(),
                 'BMI':heart_data_X['BMI'].mean(), 'glucose':heart_data_X['glucose'].mean(),
                 'heartRate' : heart_data_X['heartRate'].mean()})



X.isnull().sum()

# No random_state is given, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)


#scikit learn
tree_reg = DecisionTreeRegressor(max_depth=5,random_state = 33)
tree_reg.fit(X_train, y_train)
yp = tree_reg.predict(X_test)
finalData = pd.DataFrame({'Actual': y_test, 'Predicted': yp})
finalData.head()

print('Mean Squared Error for DecisionTreeRegressorModel learn:', round(mean_squared_error(y_test, yp),3))
print('Root Mean Squared Error for DecisionTreeRegressorModel learn:', round(np.sqrt(mean_squared_error(y_test, yp)),3))
print('Mean Absolute Squared Error for DecisionTreeRegressorModel learn:', round(mean_absolute_error(y_test, yp),3))
print("r2_score:", round(r2_score(y_test, yp),2))
print('----------------------------------------------------')

RandomForestRegressorModel = RandomForestRegressor(n_estimators=100,max_depth=5, random_state=33)
RandomForestRegressorModel.fit(X_train, y_train)
y_random_pred = RandomForestRegressorModel.predict(X_test)


print('Mean Squared Error for RandomForestRegressorModel learn:', round(mean_squared_error(y_test, y_random_pred),3))
print('Root Mean Squared Error for RandomForestRegressorModel learn:', round(np.sqrt(mean_squared_error(y_test, y_random_pred)),3))
print('Mean Absolute Squared Error for RandomForestRegressorModel learn:', round(mean_absolute_error(y_test, y_random_pred),3))
print("r2_score of RandomForestRegressorModel:", round(r2_score(y_test, y_random_pred),2))
print('----------------------------------------------------')


#Linear regression
linearReg = LinearRegression()
linearReg.fit(X_train, y_train)
yp_linear = linearReg.predict(X_test)

print('Mean Squared Error for LinearRegression learn:', round(mean_squared_error(y_test, yp_linear),3))
print('Root Mean Squared Error for LinearRegression learn:', round(np.sqrt(mean_squared_error(y_test, yp_linear)),3))
print('Mean Absolute Squared Error for LinearRegression learn:', round(mean_absolute_error(y_test, yp_linear),3))
print("r2_score of LinearRegression:", round(r2_score(y_test, yp_linear),2))

Mean Squared Error for DecisionTreeRegressorModel learn: 0.117
Root Mean Squared Error for DecisionTreeRegressorModel learn: 0.343
Mean Absolute Squared Error for DecisionTreeRegressorModel learn: 0.229
r2_score: 0.05
----------------------------------------------------
Mean Squared Error for RandomForestRegressorModel learn: 0.114
Root Mean Squared Error for RandomForestRegressorModel learn: 0.338
Mean Absolute Squared Error for RandomForestRegressorModel learn: 0.231
r2_score of RandomForestRegressorModel: 0.08
----------------------------------------------------
Mean Squared Error for LinearRegression learn: 0.111
Root Mean Squared Error for LinearRegression learn: 0.334
Mean Absolute Squared Error for LinearRegression learn: 0.231
r2_score of LinearRegression: 0.1


Regression models for Car Price Prediction 

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

car_data=pd.read_csv("CarPrice.csv")
#car_data.head()
encoder = LabelEncoder()

#data cleaning
# Label-encode each text column exactly once and remember its
# code -> original label mapping.  (Encoding 'fuelsystem' a second time, as
# the cell used to do, re-fits the encoder on the integer codes and turns the
# saved mapping into a useless 0->0, 1->1, ... table.)
encoded_columns = ['CarName', 'fueltype', 'aspiration', 'doornumber', 'carbody',
                   'drivewheel', 'enginelocation', 'fuelsystem', 'enginetype',
                   'cylindernumber']
label_maps = {}
for column in encoded_columns:
    car_data[column] = encoder.fit_transform(car_data[column])
    label_maps[column] = dict(enumerate(encoder.classes_))

car_name = label_maps['CarName']
fueltype = label_maps['fueltype']
aspiration = label_maps['aspiration']
doornumber = label_maps['doornumber']
carbody = label_maps['carbody']
drivewheel = label_maps['drivewheel']
enginelocation = label_maps['enginelocation']
fuelsystem = label_maps['fuelsystem']
enginetype = label_maps['enginetype']
cylindernumber = label_maps['cylindernumber']


x = car_data.drop('price', axis=1)
y = car_data['price']
car_data.isnull().sum()

# NOTE(review): an integer test_size is an absolute number of test rows
# (30 rows), not 30 % -- confirm this is what is intended.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=30,random_state=0)

# Scale features to [0, 1].  The scaler is fitted on the training rows only
# and then applied to the test rows, so no test-set statistics leak into
# training; previously the scaled matrix was computed from all rows and then
# never used by the models.
scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x.columns, index=x_train.index)
x_test = pd.DataFrame(scaler.transform(x_test), columns=x.columns, index=x_test.index)
# Scaled version of the full feature matrix, kept under its original name.
X = scaler.transform(x)


DecisionTreeRegressorModel = DecisionTreeRegressor( max_depth=15,random_state=33)
DecisionTreeRegressorModel.fit(x_train, y_train)
y_pred = DecisionTreeRegressorModel.predict(x_test)



print('Mean Squared Error for DecisionTreeRegressorModel learn:', round(mean_squared_error(y_test, y_pred),3))
print('Root Mean Squared Error for DecisionTreeRegressorModel learn:', round(np.sqrt(mean_squared_error(y_test, y_pred)),3))
print('Mean Absolute Squared Error for DecisionTreeRegressorModel learn:', round(mean_absolute_error(y_test, y_pred),3))
print("r2_score of DecisionTreeRegressorModel:", round(r2_score(y_test, y_pred),2))
print('----------------------------------------------------')


RandomForestRegressorModel = RandomForestRegressor(n_estimators=100,max_depth=15, random_state=33)
RandomForestRegressorModel.fit(x_train, y_train)
y_random_pred = RandomForestRegressorModel.predict(x_test)


print('Mean Squared Error for RandomForestRegressorModel learn:', round(mean_squared_error(y_test, y_random_pred),3))
print('Root Mean Squared Error for RandomForestRegressorModel learn:', round(np.sqrt(mean_squared_error(y_test, y_random_pred)),3))
print('Mean Absolute Squared Error for RandomForestRegressorModel learn:', round(mean_absolute_error(y_test, y_random_pred),3))
print("r2_score of RandomForestRegressorModel:", round(r2_score(y_test, y_random_pred),2))
print('----------------------------------------------------')

#Linear regression
linearReg = LinearRegression()
linearReg.fit(x_train, y_train)
yp_linear = linearReg.predict(x_test)

print('Mean Squared Error for LinearRegression learn:', round(mean_squared_error(y_test, yp_linear),3))
print('Root Mean Squared Error for LinearRegression learn:', round(np.sqrt(mean_squared_error(y_test, yp_linear)),3))
print('Mean Absolute Squared Error for LinearRegression learn:', round(mean_absolute_error(y_test, yp_linear),3))
print("r2_score of LinearRegression:", round(r2_score(y_test, yp_linear),2))


Mean Squared Error for DecisionTreeRegressorModel learn: 13316514.042
Root Mean Squared Error for DecisionTreeRegressorModel learn: 3649.18
Mean Absolute Squared Error for DecisionTreeRegressorModel learn: 2093.617
r2_score of DecisionTreeRegressorModel: 0.61
----------------------------------------------------
Mean Squared Error for RandomForestRegressorModel learn: 3851555.446
Root Mean Squared Error for RandomForestRegressorModel learn: 1962.538
Mean Absolute Squared Error for RandomForestRegressorModel learn: 1461.259
r2_score of RandomForestRegressorModel: 0.89
----------------------------------------------------
Mean Squared Error for LinearRegression learn: 11696841.711
Root Mean Squared Error for LinearRegression learn: 3420.065
Mean Absolute Squared Error for LinearRegression learn: 2493.259
r2_score of LinearRegression: 0.66


Regression models for Students marks prediction

In [None]:
# Load the student marks table and take a quick look at it.
student_data = pd.read_csv(r"Student_Marks.csv")
student_data.head()
#student_data.info()
student_data.isnull().sum()
student_data.columns

# Two predictor columns, one target column.
feature_columns = ['number_courses', 'time_study']
X = student_data[feature_columns]
y = student_data["Marks"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=22
)

# --- Decision tree ---
model = DecisionTreeRegressor(max_depth=15, random_state=33)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

tree_mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error for DecisionTreeRegressorModel learn:', round(tree_mse, 3))
print('Root Mean Squared Error for DecisionTreeRegressorModel learn:', round(np.sqrt(tree_mse), 3))
print('Mean Absolute Squared Error for DecisionTreeRegressorModel learn:', round(mean_absolute_error(y_test, y_pred), 3))
print("r2_score of DecisionTreeRegressorModel:", round(r2_score(y_test, y_pred), 2))
print('----------------------------------------------------')

# --- Random forest (reuses the name `model`) ---
model = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=33)
model.fit(X_train, y_train)
y_random_pred = model.predict(X_test)

forest_mse = mean_squared_error(y_test, y_random_pred)
print('Mean Squared Error for RandomForestRegressorModel learn:', round(forest_mse, 3))
print('Root Mean Squared Error for RandomForestRegressorModel learn:', round(np.sqrt(forest_mse), 3))
print('Mean Absolute Squared Error for RandomForestRegressorModel learn:', round(mean_absolute_error(y_test, y_random_pred), 3))
print("r2_score of RandomForestRegressorModel:", round(r2_score(y_test, y_random_pred), 2))
print('----------------------------------------------------')


# --- Linear regression ---
linearReg = LinearRegression()
linearReg.fit(X_train, y_train)
yp_linear = linearReg.predict(X_test)

linear_mse = mean_squared_error(y_test, yp_linear)
print('Mean Squared Error for LinearRegression learn:', round(linear_mse, 3))
print('Root Mean Squared Error for LinearRegression learn:', round(np.sqrt(linear_mse), 3))
print('Mean Absolute Squared Error for LinearRegression learn:', round(mean_absolute_error(y_test, yp_linear), 3))
print("r2_score of LinearRegression:", round(r2_score(y_test, yp_linear), 2))


Mean Squared Error for DecisionTreeRegressorModel learn: 5.729
Root Mean Squared Error for DecisionTreeRegressorModel learn: 2.394
Mean Absolute Squared Error for DecisionTreeRegressorModel learn: 1.492
r2_score of DecisionTreeRegressorModel: 0.97
----------------------------------------------------
Mean Squared Error for RandomForestRegressorModel learn: 3.507
Root Mean Squared Error for RandomForestRegressorModel learn: 1.873
Mean Absolute Squared Error for RandomForestRegressorModel learn: 1.276
r2_score of RandomForestRegressorModel: 0.98
----------------------------------------------------
Mean Squared Error for LinearRegression learn: 9.961
Root Mean Squared Error for LinearRegression learn: 3.156
Mean Absolute Squared Error for LinearRegression learn: 2.733
r2_score of LinearRegression: 0.96
