In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #plotting graphs
from ipywidgets import interact, widgets, HBox, VBox #for interactive elements

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/salary-dataset/Linear Regression Data.csv


In [2]:
df = pd.read_csv('/kaggle/input/salary-dataset/Linear Regression Data.csv') #convert the dataset into a Pandas dataframe

In [3]:
df #the dataset is very small, so it looks pretty clean; might not need any preprocessing

Unnamed: 0,YearsExperience,Salary
0,1.1,39343
1,1.3,46205
2,1.5,37731
3,2.0,43525
4,2.2,39891
5,2.9,56642
6,3.0,60150
7,3.2,54445
8,3.2,64445
9,3.7,57189


In [4]:
df.info() #no null values; data type is correct

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearsExperience  30 non-null     float64
 1   Salary           30 non-null     int64  
dtypes: float64(1), int64(1)
memory usage: 608.0 bytes


In [5]:
df.describe() 

Unnamed: 0,YearsExperience,Salary
count,30.0,30.0
mean,5.313333,76003.0
std,2.837888,27414.429785
min,1.1,37731.0
25%,3.2,56720.75
50%,4.7,65237.0
75%,7.7,100544.75
max,10.5,122391.0


When a feature's input value is large, the model is more likely to associate a smaller value for its parameter and vice-versa. Having huge differences in parameter ranges acorss various features causes Gradient Descent to run slower, which is why we use **'Feature Sclaing'**. It makes the relative sizes of different features similar so there isn't much variation in their parameter values.

Now since we only have one feature here, this is not really needed, but when we'll go to multiple linear regression, this concept will become extremely important. One benefit of **normalization** (a method to scale features) here will be that the values of the cost function will be reasonable, instead of being very large numbers. (Feel free to test it out by using non-normalized dataframe once and a normalized one afterwards.)

There are various ways to normalize values; we'll see all of them in this notebook.

In [6]:
# Normalization functions
def no_normalization(df):
    return df

def min_max_normalization(df):
    return df / df.max()

def mean_normalization(df):
    return (df - df.mean()) / (df.max() - df.min())

def z_score_normalization(df):
    return (df - df.mean()) / df.std()

In [7]:
# Plotting function
def plot_data(normalization_type):
    if normalization_type == 0:
        normalized_df = no_normalization(df)
        title = "No Normalization"
    elif normalization_type == 1:
        normalized_df = min_max_normalization(df)
        title = "Min-Max Normalization"
    elif normalization_type == 2:
        normalized_df = mean_normalization(df)
        title = "Mean Normalization"
    elif normalization_type == 3:
        normalized_df = z_score_normalization(df)
        title = "Z-Score Normalization"
    
    plt.figure(figsize=(8, 5))
    plt.scatter(normalized_df['YearsExperience'], normalized_df['Salary'], color='orange', alpha=0.6, edgecolor='r', s=70)
    plt.title(title, fontsize=14)
    plt.xlabel('Years of Experience', fontsize=12)
    plt.ylabel('Salary', fontsize=12)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    plt.box(True)  # Remove the top and right spines
    plt.show()

# Interactive widget
interact(plot_data, normalization_type=widgets.IntSlider(min=0, max=3, step=1, value=0, description='Normalization:'))

interactive(children=(IntSlider(value=0, description='Normalization:', max=3), Output()), _dom_classes=('widge…

<function __main__.plot_data(normalization_type)>

In [8]:
#let us go with z-score normalization for now, any other kind of normalization will also work just as well here
df2 = z_score_normalization(df)

In [9]:
#now we'll plot the points and try to fit a line by choosing different values for parameter 'w' and bias 'b'
def plot_data(w=0, b=1):
    plt.figure(figsize=(8, 5))
    plt.scatter(df2['YearsExperience'], df2['Salary'], color='yellow', alpha=0.6, edgecolor='#ffA500', s=70, label='Data Points')
    
    # Plot the line wx + b
    x = np.linspace(df2['YearsExperience'].min(), df2['YearsExperience'].max(), 100)
    y = w * x + b
    plt.plot(x, y, color='orange', linewidth=2, label=f'y = {w}x + {b}')
    
    plt.title('Interactive Plot', fontsize=14)
    plt.xlabel('Years of Experience', fontsize=12)
    plt.ylabel('Salary', fontsize=12)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    plt.box(False)  # Remove the top and right spines
    plt.legend()
    plt.show()

# Interactive widget
interact(plot_data, w=widgets.FloatSlider(min=-0.5, max=1.5, step=0.1, value=0, description='Slope (w):'),
                     b=widgets.FloatSlider(min=-2.5, max=1.5, step=0.1, value=1, description='Intercept (b):'))

interactive(children=(FloatSlider(value=0.0, description='Slope (w):', max=1.5, min=-0.5), FloatSlider(value=1…

<function __main__.plot_data(w=0, b=1)>

We find that the line where w = 1 and b = 0 fits our data well. But this took a lot of trial and error. Previously, I had both w and b values from 0 to 100 to find the optimum value. Hit-and-trial is not a feasible option when your dataset is very large and you have no idea about the possible range of your parameters. 

This is where **Gradient Descent** comes into play. It utilises the concept of a **Cost Function**. Cost functions are named so because the more our predctions are far away from the true values, the more it will *'cost'* us. So, the goal of Gradient Descent is to reduce the cost function and keep reducing it until the value converges and reaches a minima. It does so by chaging the values of the parameters. The amount by which these values change is defined by a **Learning Rate**, which is generally kept between 0 and 1. 

I'll first define the cost function and then the function which keeps iterating a specified number of times to reduce the cost function. Then you can play around with the number of iterations and the learning rate to find an optimal number via another interactive graph.

In [10]:
def cost_func(w, b, df2, J):
    m = len(df2)
    J += ((df2['YearsExperience']*w + b) - df2['Salary'])**2
    J = J/(2*m)
    return sum(J)

In [11]:
def iterate(num_iter, df2, alpha):
    J, w, b = 0, 0, 0
    Js = []
    ws = []
    bs = []
    m = len(df2)
    
    for i in range(num_iter):
        Js.append(cost_func(w, b, df2, J))

        dJ_dw = sum(((df2['YearsExperience']*w + b) - df2['Salary'])*df2['YearsExperience'])/m
        dJ_db = sum((df2['YearsExperience']*w + b) - df2['Salary'])/m

        w = w - (alpha*dJ_dw)
        b = b - (alpha*dJ_db)
        ws.append(w)
        bs.append(b)
    return Js, ws, bs

In [12]:
def plot_data(num_iterations, alpha):
    # Generate example data for the given number of iterations
    Js, ws, bs = iterate(num_iterations, df2, alpha)
    
    # Create a list of iteration numbers (indices)
    iterations = list(range(num_iterations))
    
    # Plot the data
    plt.figure(figsize=(10, 5))
    plt.plot(iterations, Js, linestyle='-', color='r', label='Js values')
    
    # Add titles and labels
    plt.title(f'Cost Function vs. Number of Iterations ({num_iterations} Iterations)', fontsize=13)
    plt.xlabel('Number of Iterations', fontsize=12)
    plt.ylabel('Cost Function Value', fontsize=12)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    plt.legend()
    plt.grid(True)  # Optional: Adds grid lines for better readability
    
    # Show the plot
    plt.show()

# Interactive widget to adjust the number of iterations
# interact(plot_data, num_iterations=widgets.IntSlider(min=0, max=400, step=25, value=200, description='Iterations:'),
#         alpha=widgets.FloatSlider(min=0.001, max=2.2, step=0.1, value=0.01, description='Learning Rate:'))

iterations_slider = widgets.IntSlider(min=0, max=1000, step=50, value=200, description='Iterations:')
learning_rate_dropdown = widgets.Dropdown(
    options=[0.001, 0.01, 0.1, 1, 10],
    value=0.01,
    description='Learning Rate:'
)

# Arrange widgets in a horizontal box
controls = HBox([iterations_slider, learning_rate_dropdown])

# Interactive widget
interact(plot_data, num_iterations=iterations_slider, alpha=learning_rate_dropdown)


interactive(children=(IntSlider(value=200, description='Iterations:', max=1000, step=50), Dropdown(description…

<function __main__.plot_data(num_iterations, alpha)>

We see that the required number of iterations decrease as the learning rate (or step size) increases. But when we set the learning rate to 10, our cost function keeps on increasing instead of decreason. So, we need to be very careful about the value we choose for our learning rate.

Now, we'll choose an optimal value for the number of iterations needed and a learning rate, then we'll print the parameter values we get after these iterations.

In [13]:
Js, ws, bs = iterate(100, df2, 0.1)

In [14]:
print(f"Squared Error Cost Function Value: {Js[-1]}")
print(f"Final Value of the Parameter w: {ws[-1]}")
print(f"Final Value of the Parameter b: {bs[-1]}")

Squared Error Cost Function Value: 0.020804279834930255
Final Value of the Parameter w: 0.9782040131287187
Final Value of the Parameter b: 5.736152293896644e-17


We can see that these values are very close to the values we found by hit-and-trial method above where w was equal to 1 and b was equal to 0.

This was the simplest form of linear regression! But wait! These values of w and b are the normalized values! We need to **convert them back to an unnormalized version** before we can start predicting. We use the same values that we used to normalize them i.e., standard deviation and the mean or original columns. 

Note that this is the reason why we did not normalize the original dataset and created a duplicate one instead. 

In [15]:
mu_x = df['YearsExperience'].mean()
sigma_x = df['YearsExperience'].std()
mu_y = df['Salary'].mean()
sigma_y = df['Salary'].std()

# Your final normalized parameters
w_normalized = 0.978
b_normalized = 0

# Transform the parameters back to the original scale
w_new = w_normalized * (sigma_y / sigma_x)
b_new = b_normalized * sigma_y + mu_y - w_normalized * (sigma_y * mu_x / sigma_x)

print(f"Adjusted Value of w: {w_new}")
print(f"Adjusted Value of b: {b_new}")

# To make predictions with the adjusted parameters:
def predict(years_experience):
    return w_new * years_experience + b_new

Adjusted Value of w: 9447.628250227892
Adjusted Value of b: 25804.601897122455


In [16]:
#try it out by uncommenting the lines ahead!
# x = float(input())
# predicted_salary = predict(x)
# print(f"Predicted Salary for {x} years of experience: {predicted_salary}")

And that was it! Thank you for reading my notebook so far!