In [None]:
## Importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
## Importing Dataset
# Read the startup data, then separate the last column (target) from all
# preceding columns (features) as NumPy arrays.
df = pd.read_csv("50_Startups.csv")
feature_frame, target_series = df.iloc[:, :-1], df.iloc[:, -1]
X = feature_frame.values
y = target_series.values

In [None]:
# Preview the raw feature matrix (every column of df except the last one).
print(X)

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [73994.56 122782.75 303319.26 'Florida']
 [67532

In [None]:
# Preview the target vector (the last column of df).
print(y)

[192261.83 191792.06 191050.39 182901.99 166187.94 156991.12 156122.51
 155752.6  152211.77 149759.96 146121.95 144259.4  141585.52 134307.35
 132602.65 129917.04 126992.93 125370.37 124266.9  122776.86 118474.03
 111313.02 110352.25 108733.99 108552.04 107404.34 105733.54 105008.31
 103282.38 101004.64  99937.59  97483.56  97427.84  96778.92  96712.8
  96479.51  90708.19  89949.14  81229.06  81005.76  78239.91  77798.83
  71498.49  69758.98  65200.33  64926.08  49490.75  42559.73  35673.41
  14681.4 ]


In [None]:
# ISSUE_1 - use mean to handle missing data, and apply

# Report the number of missing values per column. The result is printed
# explicitly: Jupyter only auto-displays a bare expression when it is the last
# statement of the cell, so the original check produced no visible output.
print(df.isnull().sum())

# Replace missing numeric entries with the mean of their column.
# numeric_only=True restricts the means to numeric columns, because some
# columns hold non-numeric data for which a mean is undefined.
df = df.fillna(df.mean(numeric_only=True))


In [None]:
# ISSUE_2 - Encoding categorical column, using one hot encoding
from sklearn.preprocessing import OneHotEncoder

# Guard so the cell can be re-run safely: once 'State' has been replaced by its
# indicator columns, encoding it a second time would raise a KeyError.
if 'State' in df.columns:
    # sparse_output=False makes the encoder return a dense array.
    encoder = OneHotEncoder(sparse_output=False)
    encoded_data = encoder.fit_transform(df[['State']])

    # get_feature_names_out generates meaningful names for the new columns.
    # index=df.index keeps the encoded rows paired with their source rows,
    # because pd.concat(axis=1) aligns on index labels rather than position.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(['State']),
        index=df.index,
    )

    # Swap the original 'State' column for its indicator columns.
    # NOTE(review): every category is kept, so the indicator columns always sum
    # to 1; consider OneHotEncoder(drop='first') if that collinearity with the
    # regression intercept matters for interpreting the coefficients.
    df = pd.concat([df.drop(columns='State'), encoded_df], axis=1)

df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,191792.06,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,191050.39,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,182901.99,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,166187.94,0.0,1.0,0.0


In [None]:
# ISSUE_3 - split data into train and test with 80/20 ratio

from sklearn.model_selection import train_test_split

# Rebuild X and y from the preprocessed frame. The arrays created when the
# dataset was loaded predate the imputation and one-hot-encoding steps, so X
# still contained the raw 'State' strings, which a linear model cannot fit.
# The target is selected by name because 'Profit' is no longer the last column
# once the encoded state columns have been appended.
X = df.drop(columns='Profit').values
y = df['Profit'].values

# Splitting in 80:20 ratio; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [None]:
# Training features; the split cell names this X_train (train_X was never defined).
print(X_train)

In [None]:
# Test features; the split cell names this X_test (test_X was never defined).
print(X_test)

In [None]:
# Training targets; the split cell names this y_train (train_y was never defined).
print(y_train)

In [None]:
# Test targets; the split cell names this y_test (test_y was never defined).
print(y_test)

In [None]:
# ISSUE_4 - Training on Train set - use linear regression

from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split. The variable names match
# the ones produced by train_test_split (X_train / y_train); the previous
# train_X / train_y were never defined and raised a NameError.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# ISSUE_5 - Testing on test set - with trained linear regression

# Predict on the held-out features. X_test is the name produced by the split
# cell, and the printed variable is the one assigned on the line above
# (the previous test_X and y_pred were never defined).
y_pred_test = model.predict(X_test)
print("Predictions on test set:", y_pred_test)

In [None]:
# ISSUE_6 - Measuring the performance - r2

# Importing the required library
from sklearn.metrics import r2_score

# Calculating the R2 score of the test-set predictions against the true test
# targets. The predictions are stored in y_pred_test; y_pred was never defined.
r2 = r2_score(y_test, y_pred_test)
print("R2 Score :", r2)


In [None]:
# ISSUE_7 - print coefficient/weight of the trained model

# Show the fitted weights followed by the intercept term, one per line.
for label, value in (("Coefficients:", model.coef_), ("Intercept:", model.intercept_)):
    print(label, value)

In [None]:
# ISSUE_8 - now predict values based on trained linear regression model

# Predictions on the training features, complementing the test-set predictions,
# so fitted values can be compared with actual values on both splits.
y_pred_train = model.predict(X_train)
print("Predictions on training set:", y_pred_train)



In [None]:
# ISSUE_9 - plot predicted and actual values

# Importing the required library :
import seaborn as sns


def plot_predicted_vs_actual(actual, predicted, split_name):
    """Scatter predicted profit against actual profit for one data split.

    A dashed red diagonal (y = x) spanning the range of the actual values is
    drawn as a reference: points on the line are perfect predictions.

    Parameters
    ----------
    actual : array-like exposing .min() / .max()
        True profit values for the split.
    predicted : array-like
        Model predictions, in the same order as ``actual``.
    split_name : str
        Upper-case split label used in the printed banner and the plot title.
    """
    print(f"~ For {split_name} SET : ")
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=actual, y=predicted)
    lowest, highest = actual.min(), actual.max()
    plt.plot([lowest, highest], [lowest, highest], color='red', linestyle='--')  # Diagonal line
    plt.xlabel('Actual Profit')
    plt.ylabel('Predicted Profit')
    plt.title(f'Predicted vs Actual Profit ({split_name} SET)')
    plt.show()
    print()


# (i) Plotting Predicted vs Actual Values for the Testing Dataset :
plot_predicted_vs_actual(y_test, y_pred_test, "TESTING")

# (ii) Plotting Predicted vs Actual Values for the Training Dataset :
# The training predictions are computed here so that this cell does not rely
# on y_pred_train having been defined by some other cell.
y_pred_train = model.predict(X_train)
plot_predicted_vs_actual(y_train, y_pred_train, "TRAINING")


In [None]:
# ISSUE_10 - Use classification_report to compute metrics like Precision, Recall, and F1-Score

# Importing the required library :
from sklearn.metrics import classification_report

# classification_report only accepts discrete class labels; passing the raw
# continuous profits raises a ValueError. Profits are therefore bucketed into
# three ordered classes before the report is computed.
PROFIT_CLASSES = np.array(["Low", "Medium", "High"])

# Tercile cut points taken from the actual test profits. The same cut points
# are applied to the predictions so both sides share one class definition.
profit_cut_points = np.quantile(y_test, [1 / 3, 2 / 3])

# np.digitize returns 0, 1 or 2 depending on which side of the two cut points
# a value falls; values beyond the observed range land in the outer classes.
y_test_class = PROFIT_CLASSES[np.digitize(y_test, profit_cut_points)]
y_pred_class = PROFIT_CLASSES[np.digitize(y_pred_test, profit_cut_points)]

# Displaying the Classification Report :
print("~ CLASSIFICATION REPORT : ")
# labels fixes the row order; zero_division=0 avoids warnings when a class is
# never predicted on this small test split.
print(classification_report(y_test_class, y_pred_class, labels=list(PROFIT_CLASSES), zero_division=0))
