In [None]:
import numpy as np
import pandas as pd

In [None]:
class LinearRegression:
    def __init__(self, lr=0.01, n_iter=1000):
        # hyperparameters initialization
        self.lr = lr
        self.n_iter = n_iter
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # parameter initialization
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iter):
            # prediction
            y_pred = self.predict(X)

            # y = Wx+b

            # compute gradients
            dW = np.dot(X.T, (y_pred - y)) / n_samples
            db = np.sum(y_pred - y) / n_samples

            # update parameters
            self.weights -= self.lr * dW
            self.bias -= self.lr * db

    def predict(self, X):
        return np.dot(X, self.weights) + self.bias


In [None]:
class RidgeRegression:
    def __init__(self, alpha =1, lr=0.01, n_iter=1000):
        # hyperparameters initialization
        self.alpha = alpha
        self.lr = lr
        self.n_iter = n_iter
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # parameter initialization
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iter):
            # prediction
            y_pred = self.predict(X)

            #cost = MSE + alpha*W**2

            # compute gradients
            dW = (-(2*np.dot(X.T,(y-y_pred)))+(2*self.alpha*self.weights)) / n_samples
            db = -2*np.sum(y_pred - y) / n_samples

            # update parameters
            self.weights -= self.lr * dW
            self.bias -= self.lr * db

    def predict(self, X):
        return np.dot(X, self.weights) + self.bias

In [None]:
class LassoRegression:
    def __init__(self, alpha =1, lr=0.01, n_iter=1000):
        # hyperparameters initialization
        self.alpha = alpha
        self.lr = lr
        self.n_iter = n_iter
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # parameter initialization
        self.weights = np.zeros(n_features)
        self.bias = 0

        # cost = MSE + alpha*|W|

        for _ in range(self.n_iter):
            # prediction
            y_pred = self.predict(X)

            # compute gradients
            dW = (-(2*np.dot(X.T,(y-y_pred)))+(self.alpha)) / n_samples
            db = -2*np.sum(y_pred - y) / n_samples

            # update parameters
            self.weights -= self.lr * dW
            self.bias -= self.lr * db

    def predict(self, X):
        return np.dot(X, self.weights) + self.bias

# Data Preprocessing

In [None]:
data_path = "./EPL_Soccer_MLR_LR.csv"
df = pd.read_csv(data_path)
print("ACTUAL DF SHAPE : ", df.shape)

ACTUAL DF SHAPE :  (217, 13)


In [None]:
df.head()

Unnamed: 0,PlayerName,Club,DistanceCovered(InKms),Goals,MinutestoGoalRatio,ShotsPerGame,AgentCharges,BMI,Cost,PreviousClubCost,Height,Weight,Score
0,"Braund, Mr. Owen Harris",MUN,3.96,7.5,37.5,12.3,60.0,20.56,109.1,63.32,195.9,78.9,19.75
1,"Allen, Mr. William Henry",MUN,4.41,8.3,38.2,12.7,68.0,20.67,102.8,58.55,189.7,74.4,21.3
2,"Moran, Mr. James",MUN,4.14,5.0,36.4,11.6,21.0,21.86,104.6,55.36,177.8,69.1,19.88
3,"McCarthy, Mr. Timothy J",MUN,4.11,5.3,37.3,12.6,69.0,21.88,126.4,57.18,185.0,74.9,23.66
4,"Palsson, Master. Gosta Leonard",MUN,4.45,6.8,41.5,14.0,29.0,18.96,80.3,53.2,184.6,64.6,17.64


In [None]:
#dropping Null and removing categorical columns
df.dropna(axis=0, how='all', thresh=None, subset=None, inplace=True)
new_df = df.select_dtypes(['number'])


In [None]:
# Finding correlated features
X = new_df.iloc[:,:-1] # independent features
y = new_df.iloc[:,-1] # dependent feature

print("org shape of X : ", X.shape)

correlated_features = set()
correlation_matrix = X.corr()

org shape of X :  (202, 10)


In [None]:
correlation_matrix

Unnamed: 0,DistanceCovered(InKms),Goals,MinutestoGoalRatio,ShotsPerGame,AgentCharges,BMI,Cost,PreviousClubCost,Height,Weight
DistanceCovered(InKms),1.0,0.147098,0.924964,0.8888,0.250865,0.299471,-0.403004,0.550975,0.358854,0.403743
Goals,0.147098,1.0,0.153333,0.134721,0.131973,0.177032,0.137131,0.102734,0.076958,0.155844
MinutestoGoalRatio,0.924964,0.153333,1.0,0.950757,0.25824,0.320527,-0.449135,0.583375,0.371192,0.423699
ShotsPerGame,0.8888,0.134721,0.950757,1.0,0.308391,0.382524,-0.435429,0.610986,0.352322,0.455255
AgentCharges,0.250865,0.131973,0.25824,0.308391,1.0,0.302556,-0.108243,0.317581,0.123255,0.273686
BMI,0.299471,0.177032,0.320527,0.382524,0.302556,1.0,0.321116,0.713858,0.337097,0.845955
Cost,-0.403004,0.137131,-0.449135,-0.435429,-0.108243,0.321116,1.0,-0.207749,-0.071253,0.154227
PreviousClubCost,0.550975,0.102734,0.583375,0.610986,0.317581,0.713858,-0.207749,1.0,0.802119,0.930904
Height,0.358854,0.076958,0.371192,0.352322,0.123255,0.337097,-0.071253,0.802119,1.0,0.780906
Weight,0.403743,0.155844,0.423699,0.455255,0.273686,0.845955,0.154227,0.930904,0.780906,1.0


In [None]:


for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8: #positively or negatively correlated
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

print("Correlated Features : ", correlated_features)

Correlated Features :  {'Height', 'MinutestoGoalRatio', 'Weight', 'ShotsPerGame'}


In [None]:
# Dropping Correlated features from X
X.drop(columns=correlated_features, axis=1, inplace=True)
print("Shape of X after dropping correlated features : ", X.shape)

Shape of X after dropping correlated features :  (202, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [None]:
# Creating Train and Test data

def shuffle_data(X, y, seed=None):
    """ Random shuffle of the samples in X and y """
    if seed:
        np.random.seed(seed)
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    try:
        return X[idx], y[idx]
    except:
        return X.iloc[idx], y.iloc[idx]


def train_test_split(X, y, test_size=0.5, shuffle=True, seed=None):
    """ Split the data into train and test sets """
    if shuffle:
        X, y = shuffle_data(X, y, seed)
    # Split the training data from test data in the ratio specified in test_size
    split_i = len(y) - int(len(y) // (1 / test_size))
    X_train, X_test = X[:split_i], X[split_i:]
    y_train, y_test = y[:split_i], y[split_i:]

    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, seed=42)

# Model Creation

#### _*Linear Model*_

In [None]:
linear_model = LinearRegression(lr=.00001, n_iter=100)
linear_model.fit(X_train, y_train)
linear_predict = linear_model.predict(X_test)

#### _*Lasso Model*_

In [None]:
lasso_model = LassoRegression(alpha=0.03, lr=.00001, n_iter=100)
lasso_model.fit(X_train, y_train)
lasso_predict = lasso_model.predict(X_test)

#### _*Ridge Model*_

In [None]:
ridge_model = RidgeRegression(alpha=0.03, lr=.00001, n_iter=100)
ridge_model.fit(X_train, y_train)
ridge_predict = ridge_model.predict(X_test)

# Metrics

#### _*MSE*_

In [None]:
def mean_squared_error(y_true, y_pred):
        return np.mean((y_true - y_pred) ** 2)

#### _*R2 Score*_

In [None]:
def r2_score(y_true, y_pred):
    corr_matrix = np.corrcoef(y_true, y_pred)
    print(corr_matrix)
    corr = corr_matrix[0, 1]
    return corr ** 2

In [None]:
print("MSE of Linear Model : ", mean_squared_error(y_test, linear_predict))
print("MSE of Lasso Model : ", mean_squared_error(y_test, lasso_predict))
print("MSE of Ridge Model : ", mean_squared_error(y_test, ridge_predict))

MSE of Linear Model :  4.288101722760345
MSE of Lasso Model :  2.849692797681988
MSE of Ridge Model :  2.8496948261295376


In [None]:
print("R2 Score of Linear Model : ", r2_score(y_test, linear_predict))
print("R2 Score of Lasso Model : ", r2_score(y_test, lasso_predict))
print("R2 Score of Ridge Model : ", r2_score(y_test, ridge_predict))

[[1.         0.96521229]
 [0.96521229 1.        ]]
R2 Score of Linear Model :  0.9316347693382256
[[1.         0.97366924]
 [0.97366924 1.        ]]
R2 Score of Lasso Model :  0.9480317900859903
[[1.         0.97366923]
 [0.97366923 1.        ]]
R2 Score of Ridge Model :  0.9480317788420329


In [None]:
pwd

'C:\\Users\\Goutam Kelam\\Videos\\Regression\\regression'