In [1]:
# Importing the libraries

import os  # For setting the working directory
import numpy as np   # For numpy calculations
import matplotlib.pyplot as plt   # For data visualization
import pandas as pd   # For DataFrame processing
import statsmodels.api as sm  # For getting the statistics of the final model
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split


In [13]:
# =============================================================================
# Importing the dataset
# =============================================================================

# Directory that holds the CSV. The default is a machine-specific absolute
# path; set the STARTUPS_DATA_DIR environment variable to point the notebook
# at another location without editing this cell.
DATA_DIR = os.environ.get('STARTUPS_DATA_DIR', 'D:\\MLP_Session_26_JULY\\MLP_10_11_08_21')

# The working directory is switched to DATA_DIR so the relative file name
# below (and any other relative path used later) resolves against it.
os.chdir(DATA_DIR)
dataset = pd.read_csv('50_Startups_data.csv')

In [14]:
# Preview the first five rows of the loaded data (head() defaults to 5 rows)
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,14681.4
1,162597.7,151377.59,443898.53,California,35673.41
2,153441.51,101145.55,407934.54,Florida,42559.73
3,144372.41,118671.85,383199.62,New York,49490.75
4,142107.34,91391.77,366168.42,Florida,64926.08


In [15]:
# Count the missing entries in each column

dataset.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [16]:
# List the column labels of the dataset

dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [17]:
# Show the dtype of every column (State is the only non-numeric one)

dataset.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [18]:
# Dimensions of the dataset as a (rows, columns) tuple
 
dataset.shape

(50, 5)

In [19]:
# Summary of the frame: index range, per-column non-null counts and dtypes, memory usage
 
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [23]:
# =============================================================================
# Creating the Independent and Dependent Data Sets
# =============================================================================

# Dependent variable: the Profit column.
y = dataset['Profit']

# Feature data: every column from 'R&D Spend' through 'State'
# (label-based slices include both end points).
X = dataset.loc[:, 'R&D Spend':'State']

# Alternative ways to build the same split, kept for reference:
#   positional   -> X = dataset.iloc[:, :-1] ; Y = dataset.iloc[:, -1]
#   boolean mask -> X = dataset.iloc[:, dataset.columns != 'Profit']
#                   Y = dataset.iloc[:, dataset.columns == 'Profit']

In [24]:
# Dummy variable creation
#
# One-hot encode the State column. drop_first=True leaves out the first
# category, so only the remaining states get an indicator column.
# dtype=int is requested explicitly: pandas >= 2.0 returns boolean dummies by
# default, and bool columns combined with the float features turn into an
# object array that the statsmodels steps further down cannot use.
X_dummy = pd.get_dummies(dataset['State'], prefix='State', drop_first=True, dtype=int)

In [25]:
# Preview the first five rows of the State indicator columns
X_dummy.head()

Unnamed: 0,State_Florida,State_New York
0,0,1
1,0,0
2,1,0
3,0,1
4,1,0


In [27]:
# Keep only the three numeric spend columns, in their original order
numeric_columns = ['R&D Spend', 'Administration', 'Marketing Spend']
X_numeric = dataset[numeric_columns]

In [28]:
# Feature Scaling -- useful when the features are measured on different scales

from sklearn.preprocessing import MinMaxScaler

# Learn each column's minimum and maximum, then rescale the column with them.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed later in the notebook, so test rows influence the scaling --
# confirm this is intended.
sc = MinMaxScaler()
sc.fit(X_numeric)
X_numeric = sc.transform(X_numeric)



In [30]:
# Inspect the scaled features; the scaler returned a plain NumPy array, so column labels are gone
X_numeric

array([[1.        , 0.65174393, 1.        ],
       [0.98335946, 0.76197173, 0.94089337],
       [0.92798459, 0.37957895, 0.8646636 ],
       [0.87313643, 0.51299839, 0.81223513],
       [0.85943772, 0.30532804, 0.77613557],
       [0.797566  , 0.3694479 , 0.76912588],
       [0.81412828, 0.73016111, 0.27071031],
       [0.7880179 , 0.71745725, 0.68649342],
       [0.72901786, 0.74173276, 0.66049977],
       [0.74590551, 0.43692884, 0.64644319],
       [0.61635061, 0.45150637, 0.48573267],
       [0.60884455, 0.30836422, 0.52936195],
       [0.56766982, 0.57883556, 0.52956308],
       [0.55635219, 0.64106561, 0.53555202],
       [0.72539353, 0.8013272 , 0.54370828],
       [0.69261666, 0.54302973, 0.55486446],
       [0.47180821, 0.53527036, 0.56031151],
       [0.57246821, 0.71401273, 0.59894835],
       [0.55488118, 0.47877201, 0.62511553],
       [0.52264964, 0.77823604, 0.        ],
       [0.46116861, 0.47642362, 0.63305328],
       [0.47408436, 0.78021012, 0.63532724],
       [0.

In [32]:
# Wrap the scaled array in a DataFrame again; no labels are supplied here,
# so the columns receive the default integer labels.
X_numeric = pd.DataFrame(data=X_numeric)

In [33]:
# Place the State dummy columns next to the scaled numeric features.
# Concatenating along the column axis aligns the two frames on their row index.
feature_parts = [X_numeric, X_dummy]
data = pd.concat(objs=feature_parts, axis='columns')
data.head()

Unnamed: 0,0,1,2,State_Florida,State_New York
0,1.0,0.651744,1.0,0,1
1,0.983359,0.761972,0.940893,0,0
2,0.927985,0.379579,0.864664,1,0
3,0.873136,0.512998,0.812235,0,1
4,0.859438,0.305328,0.776136,1,0


In [34]:
# Check the column labels of the combined feature frame
data.columns

Index([0, 1, 2, 'State_Florida', 'State_New York'], dtype='object')

In [39]:
# Give the positional columns 0, 1 and 2 their descriptive names back

scaled_column_names = {
    0: 'R&D Spend',
    1: 'Administration',
    2: 'Marketing Spend',
}
data = data.rename(columns=scaled_column_names)

In [40]:
# Verify the column labels after the rename
data.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State_Florida',
       'State_New York'],
      dtype='object')

In [41]:
# Preview the first five rows of the final feature frame
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,1.0,0.651744,1.0,0,1
1,0.983359,0.761972,0.940893,0,0
2,0.927985,0.379579,0.864664,1,0
3,0.873136,0.512998,0.812235,0,1
4,0.859438,0.305328,0.776136,1,0


In [42]:
# =============================================================================
# Splitting the dataset into the Training set and Test set
# =============================================================================

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
split_parts = train_test_split(
    data,
    y,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [43]:
# Preview the first five training feature rows (the index shows the original row numbers)
X_train.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
33,0.335617,0.394134,0.454943,1,0
35,0.278284,0.257032,0.435618,0,1
26,0.455574,0.706845,0.284134,1,0
34,0.280776,0.810055,0.44681,0,0
18,0.554881,0.478772,0.625116,1,0


In [44]:
# Preview the first five training targets
y_train.head()

33    126992.93
35    132602.65
26    108733.99
34    129917.04
18     97483.56
Name: Profit, dtype: float64

In [45]:
# USING STATSMODELS PACKAGE
# (`sm` is statsmodels.api, imported in the first cell of the notebook.)


def _with_intercept(features):
    """Return `features` with a 'const' column of ones for the intercept.

    `sm.add_constant` defaults to has_constant='skip', which adds nothing when
    the frame already holds a non-zero constant column -- for instance a dummy
    variable that is all ones in a small test split. The train and test design
    matrices would then have different columns. has_constant='add' always adds
    the intercept column.

    A frame that already has a 'const' column is returned unchanged, so
    re-running the cell does not add a duplicate column.
    """
    if 'const' in features.columns:
        return features
    return sm.add_constant(features, has_constant='add')


# Adding Intercept term to the model
X_train = _with_intercept(X_train)
X_test = _with_intercept(X_test)

In [46]:
# Fit an ordinary least squares regression of the training targets on the training features

m = sm.OLS(endog=y_train, exog=X_train)
model = m.fit()


In [47]:
# Use the fitted model to predict the target for the test features
predictions = model.predict(exog=X_test)


In [48]:
# Display the fitted model's summary tables (fit statistics, coefficient estimates, residual diagnostics)
model.summary()


0,1,2,3
Dep. Variable:,Profit,R-squared:,0.953
Model:,OLS,Adj. R-squared:,0.947
Method:,Least Squares,F-statistic:,139.3
Date:,"Thu, 21 Oct 2021",Prob (F-statistic):,1.24e-21
Time:,18:45:45,Log-Likelihood:,-420.35
No. Observations:,40,AIC:,852.7
Df Residuals:,34,BIC:,862.8
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.833e+05,5328.564,34.390,0.000,1.72e+05,1.94e+05
R&D Spend,-1.289e+05,8951.155,-14.402,0.000,-1.47e+05,-1.11e+05
Administration,-9255.1227,8570.851,-1.080,0.288,-2.67e+04,8162.941
Marketing Spend,-1.714e+04,8997.011,-1.905,0.065,-3.54e+04,1144.359
State_Florida,883.0256,3963.845,0.223,0.825,-7172.477,8938.529
State_New York,-1681.0226,3594.225,-0.468,0.643,-8985.367,5623.322

0,1,2,3
Omnibus:,2.908,Durbin-Watson:,2.197
Prob(Omnibus):,0.234,Jarque-Bera (JB):,2.34
Skew:,0.592,Prob(JB):,0.31
Kurtosis:,2.964,Cond. No.,11.8


In [49]:
# Variance Inflation Factor (VIF) for every column of the training design matrix

from statsmodels.stats.outliers_influence import variance_inflation_factor

design_matrix = X_train.values
column_count = design_matrix.shape[1]

# One VIF per column position, paired with the matching column label.
vif = pd.DataFrame(
    {
        "VIF": [
            variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(column_count)
        ],
        "features": X_train.columns,
    }
)


In [50]:
# Display the VIF table (one row per design-matrix column, including the intercept)
vif

Unnamed: 0,VIF,features
0,12.282476,const
1,2.704653,R&D Spend
2,1.234107,Administration
3,2.397892,Marketing Spend
4,1.274384,State_Florida
5,1.271328,State_New York


In [51]:
# MAPE (Mean Absolute Percentage Error)

import numpy as np

def mean_abs_perc_error(y_test,y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_test : array-like
        Actual target values. None of them may be zero, because each error is
        divided by the corresponding actual value.
    y_pred : array-like
        Predicted values, with the same number of elements as `y_test`.

    Returns
    -------
    float
        mean(|(actual - predicted) / actual|) * 100.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in number of elements, or `y_test`
        contains a zero.
    """
    # Flatten both inputs so an (n,) vector and an (n, 1) column are compared
    # element by element instead of being broadcast into an n x n matrix.
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.size == 0:
        raise ValueError("MAPE is undefined for empty input")
    if actual.size != predicted.size:
        raise ValueError(
            "y_test and y_pred must have the same number of elements "
            f"(got {actual.size} and {predicted.size})"
        )
    if np.any(actual == 0):
        raise ValueError("MAPE is undefined when y_test contains zeros")

    return np.mean(np.abs((actual - predicted) / actual)) * 100

# MAPE (in percent) of the model's predictions against the test-set targets
mean_abs_perc_error(y_test, predictions)

7.48100756724408