In [44]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import math

In [45]:
# Load the training set. Column names are supplied explicitly via `names`,
# so pandas treats the first line of the file as data, not as a header.
train_csv_path = r"C:\Users\berli\Downloads\p1_train.csv"
train_columns = ["bio1", "bio2", "health"]
df = pd.read_csv(train_csv_path, names=train_columns)

In [6]:
# Display the training frame (the notebook renders a truncated preview).
df

Unnamed: 0,bio1,bio2,health
0,-7.262173,9.572604,5.358725
1,3.462140,10.684524,-13.275822
2,-12.996801,-3.446163,31.815190
3,7.083537,-14.074146,1.995030
4,-15.216890,-18.630651,48.812452
...,...,...,...
9995,16.981033,-7.377274,-21.776089
9996,0.258434,-19.488764,22.098571
9997,17.588664,11.326009,-47.812604
9998,0.531756,-15.189405,20.791085


In [7]:
# Count missing values per column of the training frame.
train_null_counts = df.isna().sum()
train_null_counts

bio1      0
bio2      0
health    0
dtype: int64

In [8]:
# Load the test set. Column names are supplied explicitly via `names`,
# so pandas treats the first line of the file as data, not as a header.
test_csv_path = r"C:\Users\berli\Downloads\p1_test.csv"
test_columns = ["bio1", "bio2", "health"]
df1 = pd.read_csv(test_csv_path, names=test_columns)

In [9]:
# Display the test frame (the notebook renders a truncated preview).
df1

Unnamed: 0,bio1,bio2,health
0,15.893003,11.712829,-37.567929
1,-2.722421,-4.540615,11.475211
2,-14.587111,18.592864,11.468441
3,0.224331,16.433306,-15.159309
4,-12.215286,11.742682,17.392223
...,...,...,...
4995,-14.424996,7.055352,21.402087
4996,-12.249485,12.226991,14.971488
4997,-19.242759,-18.218209,56.687193
4998,-13.561869,2.773022,26.627790


In [10]:
# Count missing values per column of the test frame.
test_null_counts = df1.isna().sum()
test_null_counts

bio1      0
bio2      0
health    0
dtype: int64

In [13]:
# Per-column skewness of the test frame (values near 0 indicate symmetry).
test_skewness = df1.skew()
test_skewness

bio1      0.009497
bio2     -0.001184
health   -0.003122
dtype: float64

In [14]:
# Per-column skewness of the training frame (values near 0 indicate symmetry).
train_skewness = df.skew()
train_skewness

bio1      0.019314
bio2     -0.020060
health   -0.018327
dtype: float64

In [16]:
#removing outliers in training dataset
# Keep only the rows whose every column lies inside the Tukey fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. All fences are computed once, from the frame
# as it is before any row is dropped, so the outcome does not depend on the
# order in which the columns are visited.
cns=list(df.columns)
q1=df[cns].quantile(0.25)
q3=df[cns].quantile(0.75)

iqr=q3-q1
lb=q1-(1.5*iqr)
ub=q3+(1.5*iqr)

# ge/le line the per-column bounds up with the frame's columns; all(axis=1)
# then requires a row to be within range in every column.
in_range=df[cns].ge(lb,axis="columns") & df[cns].le(ub,axis="columns")
df=df[in_range.all(axis=1)]
df

Unnamed: 0,bio1,bio2,health
0,-7.262173,9.572604,5.358725
1,3.462140,10.684524,-13.275822
2,-12.996801,-3.446163,31.815190
3,7.083537,-14.074146,1.995030
4,-15.216890,-18.630651,48.812452
...,...,...,...
9995,16.981033,-7.377274,-21.776089
9996,0.258434,-19.488764,22.098571
9997,17.588664,11.326009,-47.812604
9998,0.531756,-15.189405,20.791085


In [17]:
#removing outliers in testing dataset
# Keep only the rows whose every column lies inside the Tukey fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. All fences are computed once, from the frame
# as it is before any row is dropped, so the outcome does not depend on the
# order in which the columns are visited.
# NOTE(review): the fences come from the test frame's own quartiles, so the
# test metrics reported further down describe the filtered test set only.
# Confirm this is intended.
cns=list(df1.columns)
q1=df1[cns].quantile(0.25)
q3=df1[cns].quantile(0.75)

iqr=q3-q1
lb=q1-(1.5*iqr)
ub=q3+(1.5*iqr)

# ge/le line the per-column bounds up with the frame's columns; all(axis=1)
# then requires a row to be within range in every column.
in_range=df1[cns].ge(lb,axis="columns") & df1[cns].le(ub,axis="columns")
df1=df1[in_range.all(axis=1)]

df1

Unnamed: 0,bio1,bio2,health
0,15.893003,11.712829,-37.567929
1,-2.722421,-4.540615,11.475211
2,-14.587111,18.592864,11.468441
3,0.224331,16.433306,-15.159309
4,-12.215286,11.742682,17.392223
...,...,...,...
4995,-14.424996,7.055352,21.402087
4996,-12.249485,12.226991,14.971488
4997,-19.242759,-18.218209,56.687193
4998,-13.561869,2.773022,26.627790


In [26]:
# Ordinary least squares: fit on the training frame, then print
# MSE / RMSE / MAE / R^2 for the training frame and for the test frame.
X_train=df.drop(columns=["health"])    # features: every column except the target
y_train=df['health']
X_test=df1.drop(columns=["health"])
y_test=df1['health']

model=LinearRegression()
model.fit(X_train,y_train)
train_pred=model.predict(X_train)
test_pred=model.predict(X_test)

print("TRAINING METRICS")
train_mse=mean_squared_error(y_train,train_pred)    # computed once, reused for the RMSE
print(F"Train MSE={train_mse} ")
print(F"Train RMSE={math.sqrt(train_mse)} ")
print(F"Train MAE={mean_absolute_error(y_train,train_pred)} ")
print(F"Train r2 score={r2_score(y_train,train_pred)} ")

print("TESTING METRICS")
test_mse=mean_squared_error(y_test,test_pred)
print(F"Test MSE={test_mse} ")
print(F"Test RMSE={math.sqrt(test_mse)} ")
print(F"Test MAE={mean_absolute_error(y_test,test_pred)} ")
print(F"Test r2 score={r2_score(y_test,test_pred)} ")

TRAINING METRICS
Train MSE=5.059684615643717 
Train RMSE=2.249374272024048 
Train MAE=1.791753209393991 
Train r2 score=0.9924282831120388 
TESTING METRICS
Test MSE=5.046436003951251 
Test RMSE=2.2464273867524076 
Test MAE=1.7990800530982607 
Test r2 score=0.9926719199876596 


In [27]:
# Linear-kernel support vector regression: fit on the training frame, then
# print MSE / RMSE / MAE / R^2 for the training frame and for the test frame.
target="health"
X_train,y_train=df.drop(columns=target),df[target]    # train data df
X_test,y_test=df1.drop(columns=target),df1[target]    # test data df1

model=SVR(kernel="linear").fit(X_train,y_train)    # fit() returns the estimator itself
train_pred,test_pred=model.predict(X_train),model.predict(X_test)

print("TRAINING METRICS")
mse_on_train=mean_squared_error(y_train,train_pred)
print(F"Train MSE={mse_on_train} ")
print(F"Train RMSE={math.sqrt(mse_on_train)} ")
print(F"Train MAE={mean_absolute_error(y_train,train_pred)} ")
print(F"Train r2 score={r2_score(y_train,train_pred)} ")

print("TESTING METRICS")
mse_on_test=mean_squared_error(y_test,test_pred)
print(F"Test MSE={mse_on_test} ")
print(F"Test RMSE={math.sqrt(mse_on_test)} ")
print(F"Test MAE={mean_absolute_error(y_test,test_pred)} ")
print(F"Test r2 score={r2_score(y_test,test_pred)} ")

TRAINING METRICS
Train MSE=5.060298380530224 
Train RMSE=2.249510698025289 
Train MAE=1.791678554147199 
Train r2 score=0.9924273646251549 
TESTING METRICS
Test MSE=5.04549990210172 
Test RMSE=2.2462190236265296 
Test MAE=1.798755289654836 
Test r2 score=0.9926732793290339 


In [52]:
# Predicting the test and train data using various regression models as a function
def models(models, random_state=42):
    """Fit one named regressor and print its train/test metrics.

    The regressor is trained on the notebook-level frame ``df`` and scored on
    both ``df`` and ``df1``; in each frame ``health`` is the target and every
    other column is a feature.

    Parameters
    ----------
    models : str
        Name of the regressor to fit. One of 'Linear Regression', 'SVR',
        'Lasso', 'Ridge', 'Decision Tree Regression',
        'Random Forest Regression', 'Gradient Boosting Regression'.
        The parameter keeps its historical name, which shadows this function
        inside the body.
    random_state : int or None, default 42
        Seed handed to the tree-based regressors (decision tree, random
        forest, gradient boosting) so repeated runs print the same metrics.
        Pass ``None`` to leave them unseeded.

    Returns
    -------
    None
        MSE, RMSE, MAE and R^2 are printed for the training and test frames.

    Raises
    ------
    ValueError
        If ``models`` is not one of the supported names.
    """
    # Zero-argument factories: only the requested estimator is constructed.
    factories = {
        'Linear Regression': lambda: LinearRegression(),
        'SVR': lambda: SVR(kernel="linear"),
        'Lasso': lambda: Lasso(),
        'Ridge': lambda: Ridge(),
        'Decision Tree Regression':
            lambda: DecisionTreeRegressor(random_state=random_state),
        'Random Forest Regression':
            lambda: RandomForestRegressor(random_state=random_state),
        'Gradient Boosting Regression':
            lambda: GradientBoostingRegressor(random_state=random_state),
    }
    # Reject unknown names up front; previously they fell through every `if`
    # and failed later with an UnboundLocalError on `model`.
    if models not in factories:
        raise ValueError(
            f"Unknown model name {models!r}; expected one of: "
            + ", ".join(factories)
        )
    model = factories[models]()

    X_train, y_train = df.drop(["health"], axis=1), df['health']    # train data df
    X_test, y_test = df1.drop(["health"], axis=1), df1['health']    # test data df1

    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    print("TRAINING METRICS")
    train_mse = mean_squared_error(y_train, train_pred)
    print(F"Train MSE={train_mse} ")
    print(F"Train RMSE={math.sqrt(train_mse)} ")
    print(F"Train MAE={mean_absolute_error(y_train,train_pred)} ")
    print(F"Train r2 score={r2_score(y_train,train_pred)} ")

    print("TESTING METRICS")
    test_mse = mean_squared_error(y_test, test_pred)
    print(F"Test MSE={test_mse} ")
    print(F"Test RMSE={math.sqrt(test_mse)} ")
    print(F"Test MAE={mean_absolute_error(y_test,test_pred)}")
    print(F"Test r2 score={r2_score(y_test,test_pred)} ")

In [53]:
#single model
# Fit only the linear-kernel SVR and print its train/test metrics.
models("SVR")

TRAINING METRICS
Train MSE=5.060298380530224 
Train RMSE=2.249510698025289 
Train MAE=1.791678554147199 
Train r2 score=0.9924273646251549 
TESTING METRICS
Test MSE=5.04549990210172 
Test RMSE=2.2462190236265296 
Test MAE=1.798755289654836
Test r2 score=0.9926732793290339 


In [54]:
# Run every supported regressor in turn: print its name, then its train/test
# metrics, then a separator line.
modelss=[
    'Linear Regression',
    'SVR',
    'Lasso',
    'Ridge',
    'Decision Tree Regression',
    'Random Forest Regression',
    'Gradient Boosting Regression',
]
separator="------------------------------------------------------------"
for model in modelss:    # note: rebinds the notebook-level name `model` to a string
    print(model)
    models(model)
    print(separator)

Linear Regression
TRAINING METRICS
Train MSE=5.059684615643717 
Train RMSE=2.249374272024048 
Train MAE=1.791753209393991 
Train r2 score=0.9924282831120388 
TESTING METRICS
Test MSE=5.046436003951251 
Test RMSE=2.2464273867524076 
Test MAE=1.7990800530982607
Test r2 score=0.9926719199876596 
------------------------------------------------------------
SVR
TRAINING METRICS
Train MSE=5.060298380530224 
Train RMSE=2.249510698025289 
Train MAE=1.791678554147199 
Train r2 score=0.9924273646251549 
TESTING METRICS
Test MSE=5.04549990210172 
Test RMSE=2.2462190236265296 
Test MAE=1.798755289654836
Test r2 score=0.9926732793290339 
------------------------------------------------------------
Lasso
TRAINING METRICS
Train MSE=5.0748076147488135 
Train RMSE=2.2527333652140933 
Train MAE=1.7948522738174506 
Train r2 score=0.9924056518461756 
TESTING METRICS
Test MSE=5.063739326257445 
Test RMSE=2.2502753889818563 
Test MAE=1.80248097756545
Test r2 score=0.992646793318018 
------------------------