In [1]:
from  DataCleaning import  cleanData
X,y,features,target=cleanData('Regression')

**Train test split**

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,\
                                                random_state=0)

In [4]:
def splitDataset(x,y):
    X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)
    return X_train,X_test,y_train,y_test

In [5]:
from sklearn.preprocessing import StandardScaler
def standarScalerData(data):
    scaler= StandardScaler()
    scaleData = scaler.fit_transform(data)
    return scaleData

### Feature Selection

<b>Feature Selection</b>

Feature selection is a process where you automatically select those features in your data that contribute most to the prediction variable or output in which you are interested.

Having irrelevant features in your data can decrease the accuracy of many models, especially linear algorithms like linear and logistic regression.

Three benefits of performing feature selection before modeling your data are:

- <b>Reduces Overfitting:</b> Less redundant data means less opportunity to make decisions based on noise.
- <b>Improves Accuracy:</b> Less misleading data means modeling accuracy improves.
- <b>Reduces Training Time:</b> Less data means that algorithms train faster.

##### RFE

Recursive Feature Elimination is the process of iteratively finding the most relevant features from the parameters of a learnt ML model. The model used for RFE could vary based on the problem at hand and the dataset. Popular models that could be used include Linear Regression, Logistic Regression, Decision Trees, Random Forests and so on.


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE


# Import some data to play with
X_values = X.values
y_values = y.values
# Create a LinearRegression
estimator = LinearRegression()
# Use RFE to select the most important features
selector = RFE(estimator, n_features_to_select=6, step=2)
selector = selector.fit(X_values, y_values)
# Print the selected features
importantFeatures=[]
print("\033[1m"+'Most important features selected By RFE:'+"\033[0m")
for i in range(0,len(selector.support_)):
    if selector.support_[i]==True:
        print(X[features].columns[i])
        importantFeatures.append(X[features].columns[i])


[1mMost important features selected By RFE:[0m
Throughput
Successability
Compliance
Documentation
WsRF
Class


In [7]:
print(importantFeatures)

['Throughput', 'Successability', 'Compliance', 'Documentation', 'WsRF', 'Class']


In [8]:
x_with_feature_selection=X[importantFeatures]

## Linear Regression

![alt text](images/linear-regression.png "Linear Regression")


###  Get Regression Model Summary from Statsmodels

In [9]:
import statsmodels.api as sm
#add constant to predictor variables
x = sm.add_constant(X_train)
#fit linear regression mode
model = sm.OLS(y_train, X_train).fit()
print_model = model.summary()
#View model summary
print(print_model)

                                 OLS Regression Results                                
Dep. Variable:            Reliability   R-squared (uncentered):                   0.982
Model:                            OLS   Adj. R-squared (uncentered):              0.981
Method:                 Least Squares   F-statistic:                              1303.
Date:                Sun, 09 Jun 2024   Prob (F-statistic):                   7.92e-185
Time:                        12:30:34   Log-Likelihood:                         -822.40
No. Observations:                 228   AIC:                                      1663.
Df Residuals:                     219   BIC:                                      1694.
Df Model:                           9                                                  
Covariance Type:            nonrobust                                                  
                     coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------

-Jarque-Bera (JB):Statistic value
-Prob(JB)::p-value of Jarque-Bera test
Interpretation:
If p-value<significant_Value then the residuals are not normally distibuted.
If p-value>significant_Value the residuals are normally distibuted

#### with feature selection variable

In [10]:
X_train,X_test,y_train,y_test=splitDataset(x_with_feature_selection,y)
X_train

Unnamed: 0,Throughput,Successability,Compliance,Documentation,WsRF,Class
27,15.0,71,89.0,41,84,1
68,9.3,57,100.0,92,76,2
320,8.0,33,89.0,7,55,4
257,1.7,40,67.0,96,62,3
202,1.0,49,78.0,93,67,3
...,...,...,...,...,...,...
307,0.2,50,78.0,40,56,4
229,18.9,29,78.0,4,64,3
142,10.1,51,100.0,3,71,2
69,5.2,90,100.0,11,76,2


In [11]:
import statsmodels.api as sm
#add constant to predictor variables
x = sm.add_constant(X_train)
#fit linear regression mode
model = sm.OLS(y_train, X_train).fit()
print_model = model.summary()
#View model summary
print(print_model)

                                 OLS Regression Results                                
Dep. Variable:            Reliability   R-squared (uncentered):                   0.980
Model:                            OLS   Adj. R-squared (uncentered):              0.979
Method:                 Least Squares   F-statistic:                              1773.
Date:                Sun, 09 Jun 2024   Prob (F-statistic):                   1.81e-184
Time:                        12:30:40   Log-Likelihood:                         -834.83
No. Observations:                 228   AIC:                                      1682.
Df Residuals:                     222   BIC:                                      1702.
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                     coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------

In [12]:
y_pred=model.predict(X_test)
y_pred

78     79.453847
218    31.713114
117    71.531350
249    74.369244
225    68.674007
67     74.921619
136    64.374484
291    53.251974
21     78.429940
43     73.503119
334    51.836191
231    54.453368
18     75.221877
106    86.147927
87     86.907518
228    62.402458
271    36.865012
206    32.406359
240    57.236427
162    77.014963
183    48.522051
285    45.726052
126    58.806442
355    54.929743
131    76.174747
135    64.369839
262    29.244406
274    64.686683
152    46.400532
368    37.436441
255    53.752936
12     82.570881
9      85.247655
237    75.395163
115    63.991620
356    32.358406
122    75.719498
128    84.087782
11     80.873783
88     84.381134
97     56.298556
191    56.179553
216    59.745409
245    62.812056
294    48.825147
150    61.181569
189    48.413605
250    64.986706
98     86.381328
344    47.446442
83     73.777998
353    49.903774
345    57.595262
266    57.107816
101    95.950480
177    70.921209
308    53.083654
dtype: float64

In [13]:
from sklearn.metrics import mean_squared_error, r2_score
def evaluate_model_reg(model,y_test,y_pred):
    #  Mean Squared Error (MSE)
    mse = mean_squared_error(y_test, y_pred)
    #R-squared
    r2 = r2_score(y_test, y_pred)
    # root mean squared error
    rmse=mean_squared_error(y_test, y_pred, squared = False)
    # define variables for adjusted r2 score
    n = len(y_test)
    k = len(X_test.columns)
    # calculate adjusted r2 score
    adj_r2_score = 1-(((1-r2)*(n-1))/(n-k-1))
    # Print the evaluation metrics
    print("Model:",model)
    print("Root Mean Squared Error (RMSE)",rmse)
    print("Mean Squared Error (MSE):", mse)
    print("R-squared (R2):", r2)
    print("Adjusted-R-squared",adj_r2_score)

In [14]:
evaluate_model_reg(model,y_test, y_pred)

Model: <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x289ed2150>
Root Mean Squared Error (RMSE) 7.228992900363941
Mean Squared Error (MSE): 52.25833835351226
R-squared (R2): 0.8323819381226429
Adjusted-R-squared 0.8122677706973601


## Ridge

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [16]:
from sklearn.linear_model import Ridge

#### Grid search

In [17]:
from sklearn.model_selection import GridSearchCV
# Define the parameter grid to search over
param_grid = {'alpha': [0.1,0.5, 1.0,5.0,10.0]}

# Create a Ridge Regression model
ridgeModel = Ridge()

# Use GridSearchCV to perform cross-validation and find the best set of hyperparameters
grid_search = GridSearchCV(ridgeModel, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best set of hyperparameters found
print("Best set of hyperparameters: ", grid_search.best_params_)



Best set of hyperparameters:  {'alpha': 10.0}


In [18]:

# Create a Ridge Regression model with a regularization strength of 1.0
ridgeModel = Ridge(alpha=10.0)

# Train the model on the training data
ridgeModel.fit(X_train, y_train)

# Use the model to make predictions on the test data
y_pred = ridgeModel.predict(X_test)


In [19]:
evaluate_model_reg(ridgeModel,y_test,y_pred)

Model: Ridge(alpha=10.0)
Root Mean Squared Error (RMSE) 6.608890530878862
Mean Squared Error (MSE): 43.67743404914029
R-squared (R2): 0.8599050969901165
Adjusted-R-squared 0.8330784134350324


## Lasso

In [20]:
from sklearn.linear_model import Lasso

In [21]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)


## Grid Search

In [22]:
# Define the parameter grid to search over
param_grid = {'alpha': [0.1,0.5, 1.0,5.0, 10.0], 'max_iter': [1000, 5000, 10000]}

# Create a Lasso Regression model
lassoModelodel = Lasso()

# Use GridSearchCV to perform cross-validation and find the best set of hyperparameters
grid_search = GridSearchCV(lassoModelodel, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best set of hyperparameters found
print("Best set of hyperparameters: ", grid_search.best_params_)

Best set of hyperparameters:  {'alpha': 0.5, 'max_iter': 1000}


In [23]:
# Create a Lasso Regression model with a regularization strength of 1.0
lassoModel = Lasso(alpha=0.5,max_iter=1000)
# Train the model on the training data
lassoModel.fit(X_train, y_train)
# Use the model to make predictions on the test data
y_pred = lassoModel.predict(X_test)

In [24]:
evaluate_model_reg(lassoModel,y_test,y_pred)

Model: Lasso(alpha=0.5)
Root Mean Squared Error (RMSE) 6.6619277451966425
Mean Squared Error (MSE): 44.38128128222082
R-squared (R2): 0.857647514510769
Adjusted-R-squared 0.8303885279277248


## Elastic Net

In [25]:
from sklearn.linear_model import ElasticNet

In [26]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)


In [27]:
# Define the parameter grid to search over
param_grid = {'alpha': [0.1, 1.0, 10.0], 'l1_ratio': [0.1, 0.5, 0.9], 'max_iter': [1000, 5000, 10000]}

# Create an Elastic Net Regression model
elasticNetModel = ElasticNet()

# Use GridSearchCV to perform cross-validation and find the best set of hyperparameters
grid_search = GridSearchCV(elasticNetModel, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best set of hyperparameters found
print("Best set of hyperparameters: ", grid_search.best_params_)


Best set of hyperparameters:  {'alpha': 1.0, 'l1_ratio': 0.5, 'max_iter': 1000}


In [28]:
# Create an Elastic Net Regression model with a regularization strength of 1.0 and a L1 ratio of 0.5
elasticNetModel = ElasticNet(alpha=1.0, l1_ratio=1.0,max_iter=1000)

# Train the model on the training data
elasticNetModel.fit(X_train, y_train)

# Use the model to make predictions on the test data
y_pred = elasticNetModel.predict(X_test)

In [29]:
evaluate_model_reg(elasticNetModel,y_test,y_pred)

Model: ElasticNet(l1_ratio=1.0)
Root Mean Squared Error (RMSE) 6.704574159228645
Mean Squared Error (MSE): 44.9513146565965
R-squared (R2): 0.8558191385533876
Adjusted-R-squared 0.828210037425313


## DecisionTreeRegressor

![alt text](images/DT.png "Decision Tree")

In [30]:
X_train_Tree,X_test_tree,y_train_tree,y_test_tree=splitDataset(X,y)

In [31]:
# import the regressor
from sklearn.tree import DecisionTreeRegressor
# create a regressor object
regressor = DecisionTreeRegressor(random_state = 0)
# fit the regressor with X and Y data
regressor.fit(X_train_Tree, y_train_tree)

In [32]:
y_pred_tree=regressor.predict(X_test_tree)
y_pred_tree

array([78.7, 29.4, 70.2, 65.1, 61.9, 79. , 55.4, 80.9, 85.5, 79. , 19.1,
       63.8, 82.5, 81.4, 86.6, 64.6, 29.4, 27.7, 77.1, 82. , 61.9, 54.3,
       55. , 40.2, 60.2, 55.4, 23.1, 45.6, 50.6, 40.2, 77.1, 84.1, 89.2,
       61.9, 75.7, 15.9, 94.4, 78.5, 82.5, 86.6, 62.8, 64. , 77.1, 61.2,
       62.2, 62.6, 56.4, 71.4, 86.6, 11.9, 78.7, 50.5, 69.7, 66.5, 82.4,
       53. , 43.9])

In [33]:
evaluate_model_reg(regressor,y_test_tree, y_pred_tree)

Model: DecisionTreeRegressor(random_state=0)
Root Mean Squared Error (RMSE) 14.209318662135257
Mean Squared Error (MSE): 201.90473684210525
R-squared (R2): 0.3523927139742874
Adjusted-R-squared 0.22838280813957645


### Grid Search

![alt text](images/grid-search.png "Grid Search")

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
# set up the hyperparameters to tune
params = {'max_depth': [3, 5, 7, 9,11,13,15,17],
          'min_samples_split': [2, 5, 7,9,11],
          'min_samples_leaf': [1, 2, 4,6,7,9,11]}
# perform a grid search with cross-validation
grid_search = GridSearchCV(regressor, param_grid=params, cv=5)
grid_search.fit(X, y)
# print the best hyperparameters and score
print("Best hyperparameters: ", grid_search.best_params_)

Best hyperparameters:  {'max_depth': 7, 'min_samples_leaf': 9, 'min_samples_split': 2}


In [36]:
# import the regressor
from sklearn.tree import DecisionTreeRegressor
# create a regressor object
regressor = DecisionTreeRegressor(random_state = 0,max_depth=5,min_samples_leaf=7,min_samples_split=2)
# fit the regressor with X and Y data
regressor.fit(X_train_Tree, y_train_tree)

In [37]:
y_pred_tree=regressor.predict(X_test_tree)
y_pred_tree

array([83.86153846, 31.83846154, 71.325     , 71.325     , 71.325     ,
       55.8       , 65.5       , 60.51764706, 83.86153846, 55.8       ,
       46.87407407, 60.51764706, 83.86153846, 71.325     , 90.15714286,
       71.325     , 31.83846154, 31.83846154, 60.51764706, 83.86153846,
       71.325     , 46.87407407, 60.51764706, 46.87407407, 65.5       ,
       65.5       , 27.25555556, 48.36      , 45.3625    , 46.87407407,
       60.51764706, 83.86153846, 90.15714286, 71.325     , 71.325     ,
       46.87407407, 83.28      , 71.325     , 83.86153846, 90.15714286,
       60.51764706, 71.325     , 60.51764706, 71.325     , 46.87407407,
       65.5       , 71.325     , 83.28      , 80.28571429, 46.87407407,
       83.86153846, 46.87407407, 75.67692308, 60.51764706, 71.325     ,
       57.2       , 48.36      ])

In [38]:
evaluate_model_reg(regressor,y_test_tree, y_pred_tree)

Model: DecisionTreeRegressor(max_depth=5, min_samples_leaf=7, random_state=0)
Root Mean Squared Error (RMSE) 12.226912264302376
Mean Squared Error (MSE): 149.49738351894786
R-squared (R2): 0.5204887397745244
Adjusted-R-squared 0.42866743462496515


### with feature selection

In [39]:
X_train_Tree,X_test_tree,y_train_tree,y_test_tree=splitDataset(x_with_feature_selection,y)

In [40]:
# import the regressor
from sklearn.tree import DecisionTreeRegressor
# create a regressor object
regressor = DecisionTreeRegressor(random_state = 0,max_depth=5,min_samples_leaf=7,min_samples_split=2)
# fit the regressor with X and Y data
regressor.fit(X_train_Tree, y_train_tree)

In [41]:
y_pred_tree=regressor.predict(X_test_tree)
y_pred_tree

array([72.44166667, 21.75      , 66.17777778, 71.575     , 80.3       ,
       75.73333333, 65.67692308, 31.58571429, 82.65555556, 82.68888889,
       54.88421053, 61.50555556, 82.65555556, 75.73333333, 82.68888889,
       61.50555556, 47.5125    , 21.75      , 61.50555556, 83.84285714,
       40.58888889, 31.58571429, 66.17777778, 33.83      , 65.67692308,
       65.67692308, 21.75      , 71.575     , 52.00909091, 33.83      ,
       61.50555556, 88.7       , 88.7       , 80.3       , 66.17777778,
       33.83      , 74.7       , 77.625     , 82.65555556, 75.73333333,
       66.17777778, 47.5125    , 61.50555556, 61.50555556, 54.88421053,
       52.00909091, 40.58888889, 71.575     , 75.73333333, 33.83      ,
       72.44166667, 33.83      , 33.83      , 61.50555556, 74.7       ,
       61.50555556, 54.88421053])

In [42]:
evaluate_model_reg(regressor,y_test_tree, y_pred_tree)


Model: DecisionTreeRegressor(max_depth=5, min_samples_leaf=7, random_state=0)
Root Mean Squared Error (RMSE) 10.711323640686151
Mean Squared Error (MSE): 114.73245413552205
R-squared (R2): 0.6319968793011491
Adjusted-R-squared 0.5615281966141351


## KNN Regressor

![alt text](images/knn.png "K Nearset Neighbors")

In [43]:
from sklearn.neighbors import KNeighborsRegressor

In [44]:
X_train_knn,X_test_knn,y_train_knn,y_test_knn=splitDataset(X,y)

In [45]:
# Create a KNN regressor with k=5
knn = KNeighborsRegressor(n_neighbors=5)

# Fit the regressor to the training data
knn.fit(X_train_knn, y_train_knn)
# Predict the output values for the test data
y_pred_knn = knn.predict(X_test_knn)

In [46]:
evaluate_model_reg(regressor,y_test_knn, y_pred_knn)


Model: DecisionTreeRegressor(max_depth=5, min_samples_leaf=7, random_state=0)
Root Mean Squared Error (RMSE) 11.311946797562177
Mean Squared Error (MSE): 127.96014035087721
R-squared (R2): 0.5895692171060567
Adjusted-R-squared 0.5109760884667909


## Grid search

In [47]:
# Define the parameter grid for the grid search
param_grid = {'n_neighbors': [1,3, 5, 7, 9,11,13]}

# Create a KNN regressor
knn = KNeighborsRegressor()

# Perform the grid search using 5-fold cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X_train_knn, y_train_knn)

print("Best hyperparameters:", grid_search.best_params_)

Best hyperparameters: {'n_neighbors': 7}


In [48]:
# Create a KNN regressor with k=5
knn = KNeighborsRegressor(n_neighbors=5)

# Fit the regressor to the training data
knn.fit(X_train_knn, y_train_knn)
# Predict the output values for the test data
y_pred_knn = knn.predict(X_test_knn)

In [49]:
evaluate_model_reg(regressor,y_test_knn, y_pred_knn)

Model: DecisionTreeRegressor(max_depth=5, min_samples_leaf=7, random_state=0)
Root Mean Squared Error (RMSE) 11.311946797562177
Mean Squared Error (MSE): 127.96014035087721
R-squared (R2): 0.5895692171060567
Adjusted-R-squared 0.5109760884667909


### With feature selectiopn

In [50]:
X_train_knn,X_test_knn,y_train_knn,y_test_knn=splitDataset(x_with_feature_selection,y)

### Grid Search

In [51]:
# Define the parameter grid for the grid search
param_grid = {'n_neighbors': [1,3, 5, 7, 9,11,13]}

# Create a KNN regressor
knn = KNeighborsRegressor()

# Perform the grid search using 5-fold cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X_train_knn, y_train_knn)

print("Best hyperparameters:", grid_search.best_params_)

Best hyperparameters: {'n_neighbors': 5}


In [52]:
# Create a KNN regressor with k=5
knn = KNeighborsRegressor(n_neighbors=5)

# Fit the regressor to the training data
knn.fit(X_train_knn, y_train_knn)
# Predict the output values for the test data
y_pred_knn = knn.predict(X_test_knn)

In [53]:
evaluate_model_reg(regressor,y_test_knn, y_pred_knn)

Model: DecisionTreeRegressor(max_depth=5, min_samples_leaf=7, random_state=0)
Root Mean Squared Error (RMSE) 10.94348672747338
Mean Squared Error (MSE): 119.75990175438601
R-squared (R2): 0.6158713947829972
Adjusted-R-squared 0.5423148533584647
