# Simple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd

## Importing the dataset

In [2]:
# Import the file-transfer helper from the google.colab package
# NOTE(review): this import presumably only resolves inside a Google Colab runtime — confirm before running elsewhere
from google.colab import files

# Upload file(s) into the session; the result is kept in `uploaded`,
# which a later cell indexes by file name to get at the file content
uploaded = files.upload()

Saving Salary_Data.csv to Salary_Data.csv


In [3]:
# Import necessary libraries
import io

# 'uploaded' is a dictionary keyed by uploaded file name; each value is the
# file content that io.BytesIO wraps below.
# Prefer the expected file name, but fall back to the sole uploaded file when
# it arrived under a different name, instead of failing on a hard-coded key.
EXPECTED_CSV_NAME = 'Salary_Data.csv'
if EXPECTED_CSV_NAME in uploaded:
    csv_name = EXPECTED_CSV_NAME
elif len(uploaded) == 1:
    csv_name = next(iter(uploaded))
else:
    # Nothing was uploaded, or there are several candidates: refuse to guess
    # and report what is actually available.
    raise KeyError(
        f"{EXPECTED_CSV_NAME!r} not found among uploaded files: {sorted(uploaded)}"
    )

# Read the CSV straight from the in-memory upload via io.BytesIO
dataset = pd.read_csv(io.BytesIO(uploaded[csv_name]), delimiter=",")

# Display the first 10 rows of the dataset
dataset.head(10)


Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0
5,2.9,56642.0
6,3.0,60150.0
7,3.2,54445.0
8,3.2,64445.0
9,3.7,57189.0


In [4]:
# Split the dataset positionally into model inputs and the value to predict

# Every column except the last one is treated as a feature
feature_frame = dataset.iloc[:, :-1]

# The last column is the target
target_series = dataset.iloc[:, -1]

# Work with plain NumPy arrays from here on
X = feature_frame.values  # Independent Variable (IV), kept 2-D
y = target_series.values  # Dependent Variable (DV), 1-D

## Splitting the dataset into the Training set and Test set

In [5]:
# Splitting utility from scikit-learn
from sklearn.model_selection import train_test_split

# Partition features and labels into a training part and a testing part:
#   X_train / y_train -> used to fit the model
#   X_test  / y_test  -> held back to evaluate it
# test_size is the proportion of the dataset placed in the test split;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Inspect the training features returned by the split
print(X_train)

[[ 9.6]
 [ 4. ]
 [ 5.3]
 [ 7.9]
 [ 2.9]
 [ 5.1]
 [ 3.2]
 [ 4.5]
 [ 8.2]
 [ 6.8]
 [ 1.3]
 [10.5]
 [ 3. ]
 [ 2.2]
 [ 5.9]
 [ 6. ]
 [ 3.7]
 [ 3.2]
 [ 9. ]
 [ 2. ]
 [ 1.1]
 [ 7.1]
 [ 4.9]
 [ 4. ]]


In [7]:
# Inspect the training targets returned by the split
print(y_train)

[112635.  55794.  83088. 101302.  56642.  66029.  64445.  61111. 113812.
  91738.  46205. 121872.  60150.  39891.  81363.  93940.  57189.  54445.
 105582.  43525.  39343.  98273.  67938.  56957.]


In [8]:
# Inspect the held-out test features
print(X_test)

[[ 1.5]
 [10.3]
 [ 4.1]
 [ 3.9]
 [ 9.5]
 [ 8.7]]


In [9]:
# Inspect the held-out test targets
print(y_test)

[ 37731. 122391.  57081.  63218. 116969. 109431.]


## Training the Simple Linear Regression model on the Training set

In [10]:
# Import the LinearRegression class from scikit-learn
from sklearn.linear_model import LinearRegression

In [11]:
# Create an instance of the LinearRegression class with its default settings
# (no arguments are passed); it is not fitted yet at this point
my_regressor = LinearRegression()

In [12]:
# Train the linear regression model using the training data only;
# the test split is not passed here and is kept for evaluation
my_regressor.fit(X_train, y_train)

## Predicting the Test set results

In [13]:
# Use the trained linear regression model to make predictions on the test data
# y_pred holds one predicted value per row of X_test
y_pred = my_regressor.predict(X_test)

In [14]:
# Wrap the predictions in a single-column DataFrame for tabular display
y_pred_df = pd.DataFrame({"Predicted_Salaries": y_pred})

# Show at most the first 10 predicted salaries
y_pred_df.head(10)

Unnamed: 0,Predicted_Salaries
0,40748.961841
1,122699.622956
2,64961.65717
3,63099.142145
4,115249.562855
5,107799.502753


In [15]:
# Show the raw predicted values as an array
print(y_pred)

[ 40748.96184072 122699.62295594  64961.65717022  63099.14214487
 115249.56285456 107799.50275317]


In [16]:
# Raw view of the true salaries (y_test)
print(y_test)

# Wrap the true salaries in a single-column DataFrame
y_test_df = pd.DataFrame({"Real_Salaries": y_test})

# Show at most the first 10 true salaries
y_test_df.head(10)

[ 37731. 122391.  57081.  63218. 116969. 109431.]


Unnamed: 0,Real_Salaries
0,37731.0
1,122391.0
2,57081.0
3,63218.0
4,116969.0
5,109431.0


In [17]:
# Put predicted and true salaries side by side.
# Rows are aligned on the index of both frames, keeping every row of
# y_pred_df (left join).
compared_df = y_pred_df.join(y_test_df, how="left")

# Display the resulting DataFrame for comparison
compared_df

Unnamed: 0,Predicted_Salaries,Real_Salaries
0,40748.961841,37731.0
1,122699.622956,122391.0
2,64961.65717,57081.0
3,63099.142145,63218.0
4,115249.562855,116969.0
5,107799.502753,109431.0


## Model metrics

In [18]:
# Metric helper from scikit-learn
from sklearn.metrics import r2_score

# R-squared (coefficient of determination) of the predictions against the
# true test targets; arguments are named so the two cannot be swapped
r2_score_result = r2_score(y_true=y_test, y_pred=y_pred)

# Display the R-squared score
r2_score_result

0.988169515729126