In [1]:
# Importing the libraries

import os  # For setting the working directory
import numpy as np   # For numpy calculations
import matplotlib.pyplot as plt   # For data visualization
import pandas as pd   # For DataFrame processing
import statsmodels.api as sm  # For getting the statistics of the final model
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split


In [2]:
# =============================================================================
# Importing the dataset
# =============================================================================

# The data directory used to be hard-coded to a single machine's D: drive.
# It can now be overridden with the MLP_DATA_DIR environment variable; when the
# variable is unset the original location is used, so existing setups keep
# working unchanged.
DATA_DIR = os.environ.get('MLP_DATA_DIR', 'D:\\MLP_Session_26_JULY\\MLP_10_11_08_21')

# The working directory is still switched, so any relative path used elsewhere
# in the notebook keeps resolving against the data directory.
os.chdir(DATA_DIR)
dataset = pd.read_csv('50_Startups_data.csv')

In [3]:
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,14681.4
1,162597.7,151377.59,443898.53,California,35673.41
2,153441.51,101145.55,407934.54,Florida,42559.73
3,144372.41,118671.85,383199.62,New York,49490.75
4,142107.34,91391.77,366168.42,Florida,64926.08


In [4]:
# Missing values

dataset.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [5]:
# For Viewing the columns

dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [6]:
# For checking the data type of the variables

dataset.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [7]:
# For checking the number of rows and columns
 
dataset.shape

(50, 5)

In [8]:
# For getting information about the data
 
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [9]:
# =============================================================================
# Creating the Independent and Dependent Data Sets
# =============================================================================

# .copy() gives X its own data. Without it X is a slice of `dataset`, and the
# in-place column assignment performed on it later (X['State'] = ...) raises
# pandas' SettingWithCopyWarning and leaves it ambiguous whether `dataset`
# itself gets modified.
X = dataset.loc[:, 'R&D Spend':'State'].copy()  # Feature data
y = dataset.loc[:, 'Profit']  # Dependent data


# Alternative: positional selection
#X = dataset.iloc[:,:-1] #Feature Data
#Y = dataset.iloc[:,-1 ] # Dependent Data


# Alternative: boolean-mask selection on the column names
# X = dataset.iloc[:, dataset.columns != 'Profit']
# Y = dataset.iloc[:, dataset.columns == 'Profit']

In [10]:
dataset.State.unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [11]:
# =============================================================================
#  Label encoding vs. one-hot encoding of categorical data
# =============================================================================

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# LabelEncoder replaces every distinct label with an integer code in the
# range 0 .. n_classes-1.
labelencoder = LabelEncoder()
labelencoder.fit(X['State'])
X['State'] = labelencoder.transform(X['State'])
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,2
1,162597.7,151377.59,443898.53,0
2,153441.51,101145.55,407934.54,1
3,144372.41,118671.85,383199.62,2
4,142107.34,91391.77,366168.42,1


In [12]:
# OneHotEncoder expands a categorical column into one 0/1 indicator column
# per category.

onehotencoder = OneHotEncoder(handle_unknown='ignore')
onehotencoder.fit(X[['State']])
enc = onehotencoder.transform(X[['State']]).toarray()
enc_df = pd.DataFrame(data=enc)

In [13]:
enc_df.head()

Unnamed: 0,0,1,2
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0


In [14]:
X = X.join(enc_df)

In [15]:
X

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,0,1,2
0,165349.2,136897.8,471784.1,2,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,0,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,1,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,2,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,1,0.0,1.0,0.0
5,131876.9,99814.71,362861.36,2,0.0,0.0,1.0
6,134615.46,147198.87,127716.82,0,1.0,0.0,0.0
7,130298.13,145530.06,323876.68,1,0.0,1.0,0.0
8,120542.52,148718.95,311613.29,2,0.0,0.0,1.0
9,123334.88,108679.17,304981.62,0,1.0,0.0,0.0


In [16]:
X = pd.DataFrame(X)

In [17]:
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,0,1,2
0,165349.2,136897.8,471784.1,2,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,0,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,1,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,2,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,1,0.0,1.0,0.0


In [18]:
# =============================================================================
# Creating the Independent and Dependent Data Sets
# =============================================================================

# Independent variables ---> features, dimensions, inputs
# Both objects are rebuilt straight from `dataset`.

y = dataset['Profit']  # Dependent data
X = dataset.loc[:, 'R&D Spend':'State']  # Feature data



In [19]:
# Dummy variable creation: one indicator column per State value


X = pd.get_dummies(data=X, columns=['State'])

In [20]:
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,0,1
1,162597.7,151377.59,443898.53,1,0,0
2,153441.51,101145.55,407934.54,0,1,0
3,144372.41,118671.85,383199.62,0,0,1
4,142107.34,91391.77,366168.42,0,1,0


In [21]:
# =============================================================================
# Creating the Independent and Dependent Data Sets
# =============================================================================

# Both objects are rebuilt straight from `dataset`.
y = dataset['Profit']  # Dependent data
X = dataset.loc[:, 'R&D Spend':'State']  # Feature data



In [22]:
# Dummy variable creation, leaving out the first State level

X = pd.get_dummies(data=X, columns=['State'], drop_first=True)

In [23]:
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [24]:
# =============================================================================
# Splitting the dataset into the Training set and Test set
# =============================================================================
from sklearn.model_selection import train_test_split

# 20% of the rows are held out for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [25]:
y_train.head()

33    126992.93
35    132602.65
26    108733.99
34    129917.04
18     97483.56
Name: Profit, dtype: float64

In [26]:
X_train.shape

(40, 5)

In [27]:
X_test.shape

(10, 5)

In [28]:
y_train.shape

(40,)

In [29]:
y_test.shape

(10,)

In [30]:
# Wrap both target vectors in DataFrames.
y_train, y_test = pd.DataFrame(data=y_train), pd.DataFrame(data=y_test)


In [31]:
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,14681.4
1,162597.7,151377.59,443898.53,California,35673.41
2,153441.51,101145.55,407934.54,Florida,42559.73
3,144372.41,118671.85,383199.62,New York,49490.75
4,142107.34,91391.77,366168.42,Florida,64926.08


In [32]:
# Feature scaling -- useful when features have different units

from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training rows only; the same fitted transform is
# then applied to both the training and the test rows.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)



In [33]:
X_train

array([[-0.35006454, -0.78547109,  0.1011968 ,  1.73205081, -0.73379939],
       [-0.55530319, -1.48117426,  0.02734979, -0.57735027,  1.36277029],
       [ 0.07935762,  0.80133381, -0.55152132,  1.73205081, -0.73379939],
       [-0.54638238,  1.32505817,  0.07011684, -0.57735027, -0.73379939],
       [ 0.43485371, -0.35598663,  0.75148516,  1.73205081, -0.73379939],
       [ 1.26943143,  0.85518519,  0.98603118,  1.73205081, -0.73379939],
       [ 1.04525007,  1.28077047,  0.4404    ,  1.73205081, -0.73379939],
       [-1.529843  ,  0.02942065, -1.6218751 , -0.57735027,  1.36277029],
       [-1.53976251, -2.76767264, -1.6372965 , -0.57735027,  1.36277029],
       [-0.13115188,  1.14497701, -0.76949991, -0.57735027,  1.36277029],
       [ 0.92791613, -0.02992062,  0.48303162, -0.57735027,  1.36277029],
       [-0.20932933, -0.2993768 , -0.89915412,  1.73205081, -0.73379939],
       [-0.17870828,  0.2251352 , -1.26401642, -0.57735027, -0.73379939],
       [ 0.1374709 , -0.06929437,  0.5

In [34]:
X_test

array([[-0.1214952 ,  2.2889053 , -0.68032287,  1.73205081, -0.73379939],
       [ 0.6280306 , -1.22069499,  0.38557774, -0.57735027, -0.73379939],
       [ 0.65490061, -0.49434195,  0.21885524,  1.73205081, -0.73379939],
       [-0.94762148, -1.49417936, -0.30512104,  1.73205081, -0.73379939],
       [ 1.77048111, -0.85932667,  1.6668808 ,  1.73205081, -0.73379939],
       [ 0.00961775,  0.17279112,  1.22341229, -0.57735027,  1.36277029],
       [-1.11353109, -2.21896176, -0.13669119, -0.57735027,  1.36277029],
       [-0.22790703,  1.13222416, -0.92274884, -0.57735027,  1.36277029],
       [ 0.05047007, -0.02351175,  0.81952074,  1.73205081, -0.73379939],
       [ 1.52509853, -1.23610162,  1.32858469,  1.73205081, -0.73379939]])

In [35]:
from sklearn.model_selection import train_test_split

# Split X and y again: 20% of the rows held out, random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [36]:
# Wrap both target vectors in DataFrames.
y_train, y_test = pd.DataFrame(data=y_train), pd.DataFrame(data=y_test)


In [37]:
# Feature scaling -- useful when features have different units

from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training rows only; the same fitted transform is
# then applied to both the training and the test rows.
sc = MinMaxScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)



In [38]:
X_train

array([[0.33561668, 0.48655174, 0.45494286, 1.        , 0.        ],
       [0.2782839 , 0.3173015 , 0.43561799, 0.        , 1.        ],
       [0.45557444, 0.87258866, 0.28413435, 1.        , 0.        ],
       [0.2807759 , 1.        , 0.44680961, 0.        , 0.        ],
       [0.55488118, 0.59103645, 0.62511553, 1.        , 0.        ],
       [0.7880179 , 0.88568959, 0.68649342, 1.        , 0.        ],
       [0.72539353, 0.98922572, 0.54370828, 1.        , 0.        ],
       [0.0060492 , 0.6847981 , 0.0040356 , 0.        , 1.        ],
       [0.00327821, 0.00432296, 0.        , 0.        , 1.        ],
       [0.39676926, 0.95618996, 0.22709197, 0.        , 1.        ],
       [0.69261666, 0.67036159, 0.55486446, 0.        , 1.        ],
       [0.37493063, 0.60480846, 0.19316302, 1.        , 0.        ],
       [0.38348453, 0.73241142, 0.09768292, 0.        , 0.        ],
       [0.47180821, 0.66078277, 0.56031151, 0.        , 0.        ],
       [0.14297577, 0.42201072, 0.

In [39]:
X_train = pd.DataFrame(X_train)

In [40]:
X_train.head()

Unnamed: 0,0,1,2,3,4
0,0.335617,0.486552,0.454943,1.0,0.0
1,0.278284,0.317301,0.435618,0.0,1.0
2,0.455574,0.872589,0.284134,1.0,0.0
3,0.280776,1.0,0.44681,0.0,0.0
4,0.554881,0.591036,0.625116,1.0,0.0


In [41]:
X_test = pd.DataFrame(X_test)

In [42]:
# USING SKLEARN PACKAGE

# Fit a multiple linear regression on the training set

from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)


LinearRegression()

In [43]:
# Predict the test set and hold the predictions in a DataFrame

y_pred = pd.DataFrame(regressor.predict(X_test))


In [44]:
y_pred.head()

Unnamed: 0,0
0,119090.04529
1,92835.714481
2,92174.105499
3,154057.278222
4,46171.17269


In [45]:
regressor.score(X_train,y_train) # R square for the Train data (read the value from the cell output)


0.953444874625409

In [46]:
regressor.score(X_test,y_test)  # R square for the Test data (read the value from the cell output)


0.9418012076435293

In [47]:
regressor.coef_


array([[-128913.5470078 ,   -7497.15809519,  -17139.766198  ,
            883.02562988,   -1681.02256964]])

In [48]:
regressor.intercept_


array([183251.11427892])

In [49]:
X_train.head()

Unnamed: 0,0,1,2,3,4
0,0.335617,0.486552,0.454943,1.0,0.0
1,0.278284,0.317301,0.435618,0.0,1.0
2,0.455574,0.872589,0.284134,1.0,0.0
3,0.280776,1.0,0.44681,0.0,0.0
4,0.554881,0.591036,0.625116,1.0,0.0


In [64]:
from sklearn.model_selection import train_test_split

# Split X and y again: 20% of the rows held out, random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [65]:
# Feature scaling -- useful when features have different units

from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training rows only; the same fitted transform is
# then applied to both the training and the test rows.
sc = MinMaxScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [66]:
# USING STATSMODELS PACKAGE

import statsmodels.api as sm  # For getting the statistics of the final model

# Adding the intercept term to the model: a constant column is added to both
# design matrices.
#
# has_constant='add' forces the column to be added. With the default ('skip'),
# add_constant returns the matrix unchanged whenever it already contains a
# non-zero constant column. That can happen in a small test split (e.g. every
# test row having the same dummy value), and it would leave X_test with one
# column fewer than X_train, so prediction would fail or be misaligned.
X_train = sm.add_constant(X_train, has_constant='add')
X_test = sm.add_constant(X_test, has_constant='add')


In [67]:
# Fit an ordinary-least-squares model on the training data
# (endog = target, exog = design matrix)

m = sm.OLS(endog=y_train, exog=X_train)
model = m.fit()


In [68]:
# Use the fitted model to predict the test data set
predictions = model.predict(exog=X_test)


In [69]:
#Printing the Model Statistics
model.summary()


0,1,2,3
Dep. Variable:,Profit,R-squared:,0.953
Model:,OLS,Adj. R-squared:,0.947
Method:,Least Squares,F-statistic:,139.3
Date:,"Sat, 23 Oct 2021",Prob (F-statistic):,1.24e-21
Time:,22:30:45,Log-Likelihood:,-420.35
No. Observations:,40,AIC:,852.7
Df Residuals:,34,BIC:,862.8
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.833e+05,5328.564,34.390,0.000,1.72e+05,1.94e+05
x1,-1.289e+05,8951.155,-14.402,0.000,-1.47e+05,-1.11e+05
x2,-7497.1581,6942.860,-1.080,0.288,-2.16e+04,6612.431
x3,-1.714e+04,8997.011,-1.905,0.065,-3.54e+04,1144.359
x4,883.0256,3963.845,0.223,0.825,-7172.477,8938.529
x5,-1681.0226,3594.225,-0.468,0.643,-8985.367,5623.322

0,1,2,3
Omnibus:,2.908,Durbin-Watson:,2.197
Prob(Omnibus):,0.234,Jarque-Bera (JB):,2.34
Skew:,0.592,Prob(JB):,0.31
Kurtosis:,2.964,Cond. No.,11.9


In [71]:
X_train=pd.DataFrame(X_train)

In [72]:
# Variance inflation factor (VIF) for every column of X_train

from statsmodels.stats.outliers_influence import variance_inflation_factor

# One row per column: the VIF value next to the column label it belongs to.
vif = pd.DataFrame(
    {
        "VIF": [
            variance_inflation_factor(X_train.values, col_pos)
            for col_pos in range(X_train.shape[1])
        ],
        "features": X_train.columns,
    }
)


In [73]:
vif

Unnamed: 0,VIF,features
0,12.282476,0
1,2.704653,1
2,1.234107,2
3,2.397892,3
4,1.274384,4
5,1.271328,5


In [75]:
# MAPE (Mean Absolute Percentage Error)

import numpy as np

def mean_abs_perc_error(y_test,y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Computes ``mean(|y_test - y_pred| / |y_test|) * 100`` over all values.

    Parameters
    ----------
    y_test : array-like
        Observed target values (list, ndarray, Series or one-column DataFrame).
    y_pred : array-like
        Predicted values, one per observed value.

    Returns
    -------
    float
        The MAPE in percent. A zero in ``y_test`` makes the result ``inf`` or
        ``nan``, because the error is divided by the observed value.

    Raises
    ------
    ValueError
        Raised by NumPy when the flattened inputs have different lengths
        (a single-value input is still broadcast).
    """
    # Flatten both inputs to 1-D so a column-shaped target of shape (n, 1),
    # e.g. a one-column DataFrame, is paired element by element with a flat
    # (n,) prediction vector. Without this NumPy broadcasts the two into an
    # (n, n) matrix and the function silently returns a wrong figure.
    y_test = np.asarray(y_test, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.mean(np.abs((y_test - y_pred)/y_test))*100

mean_abs_perc_error(y_test, predictions)

7.481007567244112