In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the 50-startups dataset; the path is relative to the notebook's
# working directory.
csv_path = "downloads/Multiple-Linear-Regression-master/50_Startups.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Dimensions of the loaded data as (rows, columns).
df.shape

(50, 5)

In [4]:
# Column dtypes and non-null counts: a quick check for missing values
# and for non-numeric columns that will need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [6]:
# Number of distinct values per column; a very low count points to a
# categorical column.
df.nunique()

R&D Spend          49
Administration     50
Marketing Spend    48
State               3
Profit             50
dtype: int64

In [7]:
# Distinct category labels present in the State column.
df['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [8]:
# Group the rows by State so each state's subset can be inspected separately.
state_grp = df.groupby("State")

In [9]:
# Walk the groups and print each state's label followed by its rows.
for state_name, state_rows in state_grp:
    print(state_name, state_rows, sep="\n")

California
    R&D Spend  Administration  Marketing Spend       State     Profit
1   162597.70       151377.59        443898.53  California  191792.06
6   134615.46       147198.87        127716.82  California  156122.51
9   123334.88       108679.17        304981.62  California  149759.96
11  100671.96        91790.61        249744.55  California  144259.40
13   91992.39       135495.07        252664.93  California  134307.35
16   78013.11       121597.55        264346.06  California  126992.93
20   76253.86       113867.30        298664.47  California  118474.03
25   64664.71       139553.16        137962.62  California  107404.34
32   63408.86       129219.61         46085.25  California   97427.84
34   46426.07       157693.92        210797.67  California   96712.80
37   44069.95        51283.14        197029.42  California   89949.14
39   38558.51        82982.09        174999.30  California   81005.76
40   28754.33       118546.05        172795.67  California   78239.91
42   2364

In [10]:
# Pull the California rows out of the grouped data and preview them.
df1 = state_grp.get_group("California")
df1.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
1,162597.7,151377.59,443898.53,California,191792.06
6,134615.46,147198.87,127716.82,California,156122.51
9,123334.88,108679.17,304981.62,California,149759.96
11,100671.96,91790.61,249744.55,California,144259.4
13,91992.39,135495.07,252664.93,California,134307.35


In [11]:
# (rows, columns) of the California subset.
df1.shape

(17, 5)

In [12]:
# Split the frame into features and target by column NAME rather than by
# position, so a reordered CSV cannot silently change which column is
# treated as the target.
x = df.drop(columns="Profit")  # independent features: every column except Profit
y = df["Profit"]               # dependent variable: profit

In [13]:
# Preview the feature matrix.
x.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [14]:
# Preview the target series.
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [15]:
# One-hot encode the State column.
# drop_first=True drops one category to avoid the dummy-variable trap
# (perfect collinearity between the dummy columns and the intercept).
# dtype=int keeps the dummies as 0/1 integers on every pandas version;
# pandas >= 2.0 would otherwise return boolean columns.
states = pd.get_dummies(x['State'], drop_first=True, dtype=int)

In [16]:
# Preview the dummy-encoded State columns.
states.head()

Unnamed: 0,Florida,New York
0,0,1
1,0,0
2,1,0
3,0,1
4,1,0


In [17]:
# Remove the raw categorical column now that it has been one-hot encoded.
# errors='ignore' makes this cell safe to re-run: once 'State' is gone, a
# second execution is a no-op instead of raising KeyError.
x = x.drop(columns='State', errors='ignore')

In [18]:
# Preview the features after removing the State column.
x.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,165349.2,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


In [19]:
# Append the State dummy columns to the numeric features.
# Any dummy columns already present in x are dropped first, so re-running
# this cell does not add duplicate 'Florida' / 'New York' columns.
x = pd.concat(
    [x.drop(columns=states.columns, errors='ignore'), states],
    axis=1,
)

In [20]:
# Preview the final feature matrix (numeric columns plus State dummies).
x.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [23]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares model, fitted on the training split.
regressor = LinearRegression()
regressor.fit(X=x_train, y=y_train)

LinearRegression()

In [27]:
# Predict profit for the held-out test features.
y_pred = regressor.predict(x_test)

In [25]:
# Predicted values for the test set.
y_pred

array([103015.20159796, 132582.27760816, 132447.73845174,  71976.09851258,
       178537.48221055, 116161.24230165,  67851.69209676,  98791.73374687,
       113969.43533012, 167921.0656955 ])

In [26]:
# Actual target values for the same test rows, for side-by-side comparison.
y_test

28    103282.38
11    144259.40
10    146121.95
41     77798.83
2     191050.39
27    105008.31
38     81229.06
31     97483.56
22    110352.25
4     166187.94
Name: Profit, dtype: float64

In [29]:
# Compare the predicted values with the actual test-set values
# using the R² (coefficient of determination) metric.


In [32]:
from sklearn.metrics import r2_score

# R² of the predictions against the true test targets; keyword arguments
# make the (y_true, y_pred) order explicit.
score = r2_score(y_true=y_test, y_pred=y_pred)

In [33]:
# Display the R² score on the test set.
score

0.9347068473282423

In [None]:
# The R² score (~0.93) is close to 1, so the model explains most of the variance in profit on the test set.