# Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd

## Importing the dataset

In [2]:
# Colab-only helper that opens a browser file picker; this cell cannot run
# outside Google Colab because of this import.
from google.colab import files

# Ask the user to pick the training CSV. The result is kept in `uploaded`;
# a later cell looks the file up in it by name ('50_Startups.csv') and wraps
# the stored value in io.BytesIO to read it.
uploaded = files.upload()

Saving 50_Startups.csv to 50_Startups.csv


In [3]:
# io provides the in-memory byte stream, pandas does the CSV parsing
import io
import pandas as pd

# Pull the raw bytes of the uploaded training file out of `uploaded`
# (looked up by its file name) ...
csv_bytes = uploaded['50_Startups.csv']

# ... and parse them as a comma-separated table via an in-memory stream
dataset = pd.read_csv(io.BytesIO(csv_bytes), sep=",")

# Preview the first 10 rows
dataset.head(10)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [4]:
# Split the loaded table into model inputs and the value to predict.
# (When working from a local file instead of an upload, the table could be
# loaded with: dataset = pd.read_csv('50_Startups.csv'))

# Target: every row of the last column
y = dataset.iloc[:, -1].to_numpy()

# Features: every row of every column except the last one
X = dataset.iloc[:, :-1].to_numpy()

## Encoding categorical data

In [5]:
# Encoding tools from scikit-learn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One (name, transformer, columns) entry: one-hot encode the categorical
# column at index 3 of X.
state_encoder = ('encoder', OneHotEncoder(), [3])

# remainder='passthrough' keeps the columns that are not encoded in the
# output instead of dropping them.
converted_column = ColumnTransformer(
    transformers=[state_encoder],
    remainder='passthrough',
)

# Fit the transformer on X and replace X with its encoded form.
# NOTE: this overwrites X, so the cell is meant to be run once per kernel.
X = np.array(converted_column.fit_transform(X))

In [6]:
# Inspect the feature matrix after one-hot encoding (plain-text dump of the whole array)
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

In [7]:
# Avoid the dummy variable trap: keep every column of X except the first one.
# NOTE: X is overwritten, so re-running this cell drops one more column.
X = X[:, 1:].copy()

In [8]:
# Inspect the feature matrix again after its first column was removed
print(X)

[[0.0 1.0 165349.2 136897.8 471784.1]
 [0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 1.0 144372.41 118671.85 383199.62]
 [1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 1.0 131876.9 99814.71 362861.36]
 [0.0 0.0 134615.46 147198.87 127716.82]
 [1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 1.0 120542.52 148718.95 311613.29]
 [0.0 0.0 123334.88 108679.17 304981.62]
 [1.0 0.0 101913.08 110594.11 229160.95]
 [0.0 0.0 100671.96 91790.61 249744.55]
 [1.0 0.0 93863.75 127320.38 249839.44]
 [0.0 0.0 91992.39 135495.07 252664.93]
 [1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 1.0 114523.61 122616.84 261776.23]
 [0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 1.0 94657.16 145077.58 282574.31]
 [1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 1.0 86419.7 153514.11 0.0]
 [0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 1.0 78389.47 153773.43 299737.29]
 [1.0 0.0 73994.56 122782.75 303319.26]
 [1.0 0.0 67532.53 105751.03 304768.73]
 [0.0 1.0 77044.01 99281.34 140574.81]
 [0

## Splitting the dataset into the Training set and Test set

In [9]:
# Splitting helper from scikit-learn
from sklearn.model_selection import train_test_split

# Partition features and target into a training part and a test part:
#   X_train / y_train -> used to fit the model
#   X_test  / y_test  -> held back for evaluation
# test_size=0.2 puts 20% of the rows in the test part; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Multiple Linear Regression model on the Training set

In [10]:
# Ordinary least-squares regressor from scikit-learn
from sklearn.linear_model import LinearRegression

# Instantiate the model with its default settings
my_regressor = LinearRegression()

# Fit it on the training features and training targets
my_regressor.fit(X=X_train, y=y_train)

## Predicting the Test set results

In [11]:
# Predict the target for the held-back test features
y_pred = my_regressor.predict(X=X_test)

# Wrap the predictions in a single-column DataFrame for later comparison
y_pred_df = pd.DataFrame({"Predicted_Result": y_pred})

In [12]:
# Single-column DataFrame holding the true target values of the test set
y_test_df = pd.DataFrame({"Real_Result": y_test})

# Put predictions and true values side by side: an index-aligned left join
# that keeps every row of the predictions frame
compared_df = y_pred_df.join(y_test_df, how="left")

# Show the comparison table
compared_df


Unnamed: 0,Predicted_Result,Real_Result
0,103015.201598,103282.38
1,132582.277608,144259.4
2,132447.738452,146121.95
3,71976.098513,77798.83
4,178537.482211,191050.39
5,116161.242302,105008.31
6,67851.692097,81229.06
7,98791.733747,97483.56
8,113969.43533,110352.25
9,167921.065696,166187.94


## Model metrics

In [13]:
# R-squared metric from scikit-learn
from sklearn.metrics import r2_score

# Coefficient of determination on the test set: the share of the variance in
# the true values that the predictions account for
r2_score_result = r2_score(y_true=y_test, y_pred=y_pred)

# Show the score
r2_score_result

0.9347068473282987

In [14]:
# Upload the file with the new, unlabeled startups from the local drive (Colab-only).
from google.colab import files

# NOTE: this rebinds `uploaded`, so the result of the earlier upload is no
# longer reachable through this name after the cell has run.
uploaded = files.upload()

Saving 10_Startups_New.csv to 10_Startups_New.csv


In [15]:
# pandas does the CSV parsing (the `io` module is expected to be imported already)
import pandas as pd

# Raw bytes of the newly uploaded file, looked up in `uploaded` by file name
new_csv_bytes = uploaded['10_Startups_New.csv']

# Parse them as a comma-separated table via an in-memory stream
startups_New_dataset = pd.read_csv(io.BytesIO(new_csv_bytes), sep=",")

# Preview the first 10 rows of the new data
startups_New_dataset.head(10)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,155349.2,136897.8,571784.1,New York
1,162597.7,151377.59,443898.53,California
2,183441.51,101145.55,417934.54,Florida
3,134372.41,138671.85,393199.62,New York
4,142107.34,91391.77,366168.42,Florida
5,151876.8,99814.71,362861.36,New York
6,134615.46,147198.87,127716.82,California
7,130298.13,145530.06,323876.68,Florida
8,120542.52,148718.95,311613.29,New York
9,123334.88,118679.17,314981.62,California


In [16]:
# (When working from a local file instead of an upload, the table could be
# loaded with: startups_New_dataset = pd.read_csv('10_Startups_New.csv'))

# Convert the complete new table (all rows, all columns) to a NumPy array
to_be_predicted_data_numpy = startups_New_dataset.to_numpy()

# Show the array that will be fed to the model after encoding
to_be_predicted_data_numpy

array([[155349.2, 136897.8, 571784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [183441.51, 101145.55, 417934.54, 'Florida'],
       [134372.41, 138671.85, 393199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [151876.8, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [130298.13, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 118679.17, 314981.62, 'California']], dtype=object)

In [17]:
# Plain-text view of the same array, before any encoding is applied
print(to_be_predicted_data_numpy)

[[155349.2 136897.8 571784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [183441.51 101145.55 417934.54 'Florida']
 [134372.41 138671.85 393199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [151876.8 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 118679.17 314981.62 'California']]


In [18]:
# Encode the new rows with the SAME transformer that was fitted on the
# training features (`converted_column`), using transform() only.
#
# Why not fit a fresh encoder here: a newly fitted OneHotEncoder derives its
# dummy columns from whatever categories occur in the new file. If a state is
# missing, or an unseen one appears, the columns shift or the feature count
# changes, and the regressor's coefficients are applied to the wrong columns.
# Reusing the fitted transformer guarantees the training-time column layout
# (and raises on categories that were never seen during training).
#
# The input is taken from the untouched DataFrame rather than from
# `to_be_predicted_data_numpy`, so re-running this cell always starts from the
# raw rows instead of re-encoding data that is already encoded.
to_be_predicted_data_numpy = np.array(
    converted_column.transform(startups_New_dataset.to_numpy())
)

# Display the transformed NumPy array
to_be_predicted_data_numpy

array([[0.0, 0.0, 1.0, 155349.2, 136897.8, 571784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 183441.51, 101145.55, 417934.54],
       [0.0, 0.0, 1.0, 134372.41, 138671.85, 393199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 151876.8, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 118679.17, 314981.62]], dtype=object)

In [19]:
# Keep every column except the first one, mirroring the column removal that
# was applied to the training features X.
# NOTE: the array is overwritten, so re-running this cell drops one more column.
to_be_predicted_data_numpy = to_be_predicted_data_numpy[:, 1:].copy()

In [20]:
# Plain-text view of the new data after encoding and after its first column
# was removed; this is the array passed to the regressor's predict() next.
print(to_be_predicted_data_numpy)

[[0.0 1.0 155349.2 136897.8 571784.1]
 [0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 0.0 183441.51 101145.55 417934.54]
 [0.0 1.0 134372.41 138671.85 393199.62]
 [1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 1.0 151876.8 99814.71 362861.36]
 [0.0 0.0 134615.46 147198.87 127716.82]
 [1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 1.0 120542.52 148718.95 311613.29]
 [0.0 0.0 123334.88 118679.17 314981.62]]


In [21]:
# Run the regressor fitted earlier (`my_regressor`) on the prepared new rows
profit_prediction_numpy = my_regressor.predict(X=to_be_predicted_data_numpy)

# Show the predicted profit for each new row
profit_prediction_numpy

array([188845.90603365, 189547.28196891, 202107.59825117, 166141.40364839,
       167921.0656955 , 177291.98914863, 156191.10124357, 159019.04337113,
       152787.95464276, 153383.85301359])

In [22]:
# Wrap the predicted profits in a single-column DataFrame
profit_prediction_df = pd.DataFrame({"Predicted_Profit": profit_prediction_numpy})

# Preview the first 10 predictions
profit_prediction_df.head(10)

Unnamed: 0,Predicted_Profit
0,188845.906034
1,189547.281969
2,202107.598251
3,166141.403648
4,167921.065696
5,177291.989149
6,156191.101244
7,159019.043371
8,152787.954643
9,153383.853014


In [23]:
# Attach the predicted profit to the new startups table: an index-aligned
# left join that keeps every row of startups_New_dataset
startups_df = startups_New_dataset.join(profit_prediction_df, how="left")

# Show the combined table
startups_df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Predicted_Profit
0,155349.2,136897.8,571784.1,New York,188845.906034
1,162597.7,151377.59,443898.53,California,189547.281969
2,183441.51,101145.55,417934.54,Florida,202107.598251
3,134372.41,138671.85,393199.62,New York,166141.403648
4,142107.34,91391.77,366168.42,Florida,167921.065696
5,151876.8,99814.71,362861.36,New York,177291.989149
6,134615.46,147198.87,127716.82,California,156191.101244
7,130298.13,145530.06,323876.68,Florida,159019.043371
8,120542.52,148718.95,311613.29,New York,152787.954643
9,123334.88,118679.17,314981.62,California,153383.853014
