### Multiple Linear Regression Model

In [43]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression

In [44]:
#loading the dataset
df = pd.read_csv('50_Startups.csv')
print(f'DataSet:\n {df.head(1)}')
print(f'\nShape of the dataset: {df.shape}')
missing_data = df.isnull().sum()
print(f'\nMissing Values Each column:\n {missing_data}')
print(f'\nData Types of dataset:\n {df.dtypes}')

#spliting the feature and target variable
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

# Categorial Data Encoder
ct = ColumnTransformer(transformers=[('encode',OneHotEncoder(),[3])],remainder='passthrough')
X = np.array(ct.fit_transform(X))

#print(f'\nPrinting the final X variable:\n {X}')



DataSet:
    R&D Spend  Administration  Marketing Spend     State     Profit
0   165349.2        136897.8         471784.1  New York  192261.83

Shape of the dataset: (50, 5)

Missing Values Each column:
 R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

Data Types of dataset:
 R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object


In [45]:
#spliting the train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=43)

In [57]:
#training the linear regression model
ln = LinearRegression()
ln.fit(X_train,y_train)

print(f'Intercept: {ln.intercept_:.4f}\n')


feature_name = ['California','New York','Florida','R&D Spend','Admin Spend','Marketing Spend']

for name, coef in zip(feature_name, ln.coef_):
    print(f'Coefficient for {name} is {coef:.4f}\n')

Intercept: 48910.1980

Coefficient for California is 755.3891

Coefficient for New York is 459.2199

Coefficient for Florida is -1214.6090

Coefficient for R&D Spend is 0.8380

Coefficient for Admin Spend is -0.0461

Coefficient for Marketing Spend is 0.0289



In [59]:
#making the prediction 

pred_y = ln.predict(X_test)

pred_y

array([113037.65136733,  73351.2522634 ,  99718.18194483, 150087.80079663,
       116959.44011873,  42863.73306412,  83215.7407567 , 159383.56771857,
       127820.13481382,  61932.16135862])

In [68]:
#comparing the results
np.set_printoptions(precision=2)

print(np.concatenate((pred_y.reshape(len(pred_y),1),y_test.reshape(len(y_test),1)),1))

[[113037.65 122776.86]
 [ 73351.25  90708.19]
 [ 99718.18 103282.38]
 [150087.8  132602.65]
 [116959.44 118474.03]
 [ 42863.73  64926.08]
 [ 83215.74  81005.76]
 [159383.57 156122.51]
 [127820.13 134307.35]
 [ 61932.16  65200.33]]


In [71]:
#evaluate the model
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

r2 = r2_score(y_test,pred_y)
mae = mean_absolute_error(y_test,pred_y)
mse = mean_squared_error(y_test,pred_y)

print(f'R2 score of the model: {r2:.2f}')
print(f'Mean Absolute Error: {mae:.2f}')
print(f'Mean Squared Error: {mse:.2f}')

R2 score of the model: 0.85
Mean Absolute Error: 8694.89
Mean Squared Error: 127187401.29
