In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the 50-startups CSV into a DataFrame and preview its first five rows
startup = pd.read_csv(filepath_or_buffer='./Dataset/50_Startups.csv')
startup.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Pairwise Pearson correlation between the numeric columns.
# `State` holds strings, so restrict to numeric columns explicitly: recent
# pandas versions raise on non-numeric data instead of silently dropping it.
startup.select_dtypes(include='number').corr()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
R&D Spend,1.0,0.241955,0.724248,0.9729
Administration,0.241955,1.0,-0.032154,0.200717
Marketing Spend,0.724248,-0.032154,1.0,0.747766
Profit,0.9729,0.200717,0.747766,1.0


In [4]:
# Per-column dtypes; State is the only non-numeric (object) column
startup.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [5]:
# (number of rows, number of columns) of the loaded dataset
startup.shape

(50, 5)

In [6]:
# Number of missing values in each column
startup.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [7]:
### Categorical Features: convert string columns into numeric ones.
### State is a nominal feature, so it is one-hot encoded.
# Every column except the last one (Profit) is a feature; copy the slice so
# later edits to `df` never act on a view of `startup`.
df = startup.iloc[:, :-1].copy()
# dtype=float keeps the dummy columns numeric. Newer pandas versions emit
# boolean dummies, which would turn `.values` into an object array when mixed
# with the float spend columns.
x = pd.get_dummies(df, dtype=float).values
# Target: the last column (Profit).
y = startup.iloc[:, -1].values

In [8]:
## Splitting the dataset into train and test sets
from sklearn.model_selection import train_test_split
# An integer test_size is an absolute number of test rows (5 here), not a fraction.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=5
)

In [9]:
## Fit a multiple linear regression on the training split
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
# Fitted slopes of the linear model (one coefficient per feature column)
lr.coef_

array([ 7.78395354e-01,  5.70695437e-03,  3.21344322e-02,  1.24850276e+02,
       -1.02835632e+03,  9.03506043e+02])

In [11]:
# Fitted intercept (constant term) of the linear model
lr.intercept_

46250.73722073354

In [12]:
## Reproduce the test-set predictions by hand: X_test . coefficients + intercept
# Column vector, so the product has one row per test sample
coef = lr.coef_.reshape(-1, 1)
x_test.dot(coef) + lr.intercept_

array([[101475.55240111],
       [133287.417583  ],
       [132546.1614474 ],
       [ 72702.7136192 ],
       [178346.51727008]])

In [13]:
## Predictions for the training and test splits using the predict() method
y_pred_train, y_pred_test = (lr.predict(split) for split in (x_train, x_test))

In [14]:
# Actual target values of the test split, for comparison with the predictions
y_test

array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39])

In [15]:
# Model evaluation: R^2 on the training split, then on the test split
from sklearn.metrics import r2_score
print(
    *(r2_score(actual, predicted)
      for actual, predicted in ((y_train, y_pred_train), (y_test, y_pred_test))),
    sep='\n',
)

0.9489916431016199
0.9348094371060999


In [16]:
# Same R^2 figures, this time through the estimator's own score() method
print(
    *(lr.score(features, target)
      for features, target in ((x_train, y_train), (x_test, y_test))),
    sep='\n',
)

0.9489916431016199
0.9348094371060998


In [17]:
# Prediction for an arbitrary spend profile, once per one-hot state pattern.
# NOTE(review): the last three entries must follow the dummy-column order
# produced by get_dummies for State — confirm the order before interpreting.
lr.predict([[10000, 30000, 50000, *state]
            for state in ([1, 0, 0], [0, 1, 0], [0, 0, 1])])

array([55937.47127899, 54784.26468341, 56716.12704531])

#### If we want to improve accuracy, we can use feature selection.
- Administration has only about a 0.20 correlation with Profit (see the correlation matrix above), so we drop the Administration column.

In [18]:
# Rebuild the feature frame from `startup` without Administration (and without
# the last column, Profit) instead of deleting from `df` in place: the cell can
# then be re-run without raising KeyError on the already-removed column.
df = startup.iloc[:, :-1].drop(columns=['Administration'])
df.head()

Unnamed: 0,R&D Spend,Marketing Spend,State
0,165349.2,471784.1,New York
1,162597.7,443898.53,California
2,153441.51,407934.54,Florida
3,144372.41,383199.62,New York
4,142107.34,366168.42,Florida


In [19]:
# One-hot encode State in the reduced feature frame. dtype=float keeps the
# dummy columns numeric: newer pandas versions emit boolean dummies, which
# would turn `.values` into an object array when mixed with float columns.
x = pd.get_dummies(df, dtype=float).values
# Target: the last column of the original frame (Profit).
y = startup.iloc[:, -1].values

In [20]:
# Same split settings as before (5 test rows, seed 0), applied to the reduced features
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=5
)

In [21]:
# Refit the linear regression on the current training split
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [22]:
# Test-split predictions from the refitted model
y_pred = lr.predict(x_test)

In [23]:
# R^2 of the refitted model on the test split
from sklearn.metrics import r2_score
r2_score(y_true=y_test, y_pred=y_pred)

0.9365755604265402

Dropping the Administration column has not improved our model accuracy significantly (test R² moved only from about 0.935 to 0.937).