# Simple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd

## Importing the dataset

In [2]:
# Colab-only helper for uploading files from the browser
# (this cell requires the Google Colab runtime)
from google.colab import files

# Prompt for a file upload and keep the result; the next cell reads the entry
# stored under the key 'Position_Salaries.csv' from `uploaded`
uploaded = files.upload()

Saving Position_Salaries.csv to Position_Salaries.csv


In [3]:
# io.BytesIO lets pandas read the in-memory upload as if it were a file
import io

# `uploaded` maps each uploaded file name to its raw content;
# pull out the payload of the salaries CSV
csv_bytes = uploaded['Position_Salaries.csv']

# Parse the comma-separated content into a DataFrame
dataset = pd.read_csv(io.BytesIO(csv_bytes), sep=",")

# Preview up to the first 10 rows
dataset.head(10)


Unnamed: 0,Position,Level,Salary
0,Business Analyst,1,43000
1,Business Analyst,1,45000
2,Business Analyst,1,46000
3,Junior Consultant,2,50000
4,Junior Consultant,2,53000
5,Junior Consultant,2,55000
6,Senior Consultant,3,60000
7,Senior Consultant,3,63000
8,Senior Consultant,3,65000
9,Manager,4,80000


In [4]:
# Alternative when the CSV is available on disk instead of via upload:
# dataset = pd.read_csv('Position_Salaries.csv')

# Split the dataset into the feature matrix (X) and the target vector (y).

# Features: every row, every column except the first (Position) and the
# last (the target). The slice keeps X two-dimensional.
feature_columns = dataset.iloc[:, 1:-1]

# Target: every row of the last column only.
target_column = dataset.iloc[:, -1]

X = feature_columns.to_numpy()  # Independent Variable (IV)
y = target_column.to_numpy()  # Dependent Variable (DV)

## Splitting the dataset into the Training set and Test set

In [5]:
# scikit-learn helper that splits arrays into training and testing subsets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (test_size=0.2) and train on the rest.
# random_state=0 makes the split reproducible from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)

# Unpack in the order the helper returns them:
# training features, testing features, training labels, testing labels
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Inspect the training features
print(X_train)

[[10]
 [ 4]
 [ 6]
 [ 8]
 [ 2]
 [ 6]
 [ 3]
 [ 5]
 [ 8]
 [ 7]
 [ 1]
 [10]
 [ 3]
 [ 2]
 [ 7]
 [ 7]
 [ 4]
 [ 3]
 [ 9]
 [ 2]
 [ 1]
 [ 8]
 [ 6]
 [ 5]]


In [7]:
# Inspect the training labels
print(y_train)

[1000000   85000  155000  330000   55000  152000   65000  115000  350000
  250000   45000 1050000   60000   53000  200000  230000   80000   63000
  520000   50000   43000  300000  150000  110000]


In [8]:
# Inspect the testing features
print(X_test)

[[ 1]
 [10]
 [ 5]
 [ 4]
 [ 9]
 [ 9]]


In [9]:
# Inspect the testing labels
print(y_test)

[  46000 1030000  113000   83000  550000  500000]


## Training the Simple Linear Regression model on the Training set

In [10]:
# Import the LinearRegression class from scikit-learn
from sklearn.linear_model import LinearRegression

In [11]:
# Create a LinearRegression estimator with no arguments (library defaults);
# it is not trained yet at this point
my_regressor = LinearRegression()

In [12]:
# Train the linear regression model on the training features (X_train)
# and their corresponding labels (y_train)
my_regressor.fit(X_train, y_train)

## Predicting the Test set results

In [13]:
# Use the trained model to predict a value for each row of the testing
# features; the results are compared with y_test in later cells
y_pred = my_regressor.predict(X_test)

In [14]:
# Wrap the predictions in a single-column DataFrame for tabular display
y_pred_df = pd.DataFrame({"Predicted_Salaries": y_pred})

# Show up to the first 10 predicted salaries
y_pred_df.head(10)

Unnamed: 0,Predicted_Salaries
0,-108815.133877
1,600923.399302
2,206624.214203
3,127764.377183
4,522063.562282
5,522063.562282


In [15]:
# Print the raw array of predicted values
print(y_pred)

[-108815.1338766   600923.39930151  206624.21420256  127764.37718277
  522063.56228172  522063.56228172]


In [16]:
# Show the raw array of true salaries first
print(y_test)

# Wrap the true salaries in a single-column DataFrame for tabular display
y_test_df = pd.DataFrame({"Real_Salaries": y_test})

# Show up to the first 10 true salaries
y_test_df.head(10)

[  46000 1030000  113000   83000  550000  500000]


Unnamed: 0,Real_Salaries
0,46000
1,1030000
2,113000
3,83000
4,550000
5,500000


In [17]:
# Put predicted and true salaries side by side.
# Both frames share the same row index, so an index-based left join keeps
# every prediction row and attaches the matching true salary to it.
compared_df = y_pred_df.join(y_test_df, how="left")

# Show the combined table for comparison
compared_df

Unnamed: 0,Predicted_Salaries,Real_Salaries
0,-108815.133877,46000
1,600923.399302,1030000
2,206624.214203,113000
3,127764.377183,83000
4,522063.562282,550000
5,522063.562282,500000


## Model metrics

In [18]:
# Metric function for the coefficient of determination (R-squared)
from sklearn.metrics import r2_score

# Score the predictions against the true test labels
r2_score_result = r2_score(y_true=y_test, y_pred=y_pred)

# Show the resulting score
r2_score_result

0.7011634582019748

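The R-squared score (about 0.70) is only moderate, and the individual predictions are poor: the model predicts a negative salary for Level 1 and underestimates the Level 10 salary by more than 400,000. This shows that the structure of the dataset is not well suited to simple linear regression.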