In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression

In [2]:
dataset = pd.read_csv('dataset/50_startups.csv')
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
dataset.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [4]:
dataset['State'].value_counts()

New York      17
California    17
Florida       16
Name: State, dtype: int64

In [5]:
states = pd.get_dummies(dataset['State'])
states.head()

Unnamed: 0,California,Florida,New York
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0


In [6]:
dataset = dataset.drop(['State'], axis=1)
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
0,165349.2,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [7]:
dataset = pd.concat([dataset, states], axis=1)
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,California,Florida,New York
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,383199.62,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0


In [8]:
dataset = dataset.drop(['New York'], axis=1)
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,California,Florida
0,165349.2,136897.8,471784.1,192261.83,0,0
1,162597.7,151377.59,443898.53,191792.06,1,0
2,153441.51,101145.55,407934.54,191050.39,0,1
3,144372.41,118671.85,383199.62,182901.99,0,0
4,142107.34,91391.77,366168.42,166187.94,0,1


In [9]:
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'Profit',
       'California', 'Florida'],
      dtype='object')

In [10]:
dataset = dataset[['R&D Spend', 'Administration', 'Marketing Spend','California', 'Florida', 'Profit']]
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,California,Florida,Profit
0,165349.2,136897.8,471784.1,0,0,192261.83
1,162597.7,151377.59,443898.53,1,0,191792.06
2,153441.51,101145.55,407934.54,0,1,191050.39
3,144372.41,118671.85,383199.62,0,0,182901.99
4,142107.34,91391.77,366168.42,0,1,166187.94


In [11]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

In [13]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [14]:
y_pred = regressor.predict(X_test)

In [18]:
regressor.coef_

array([ 7.73467193e-01,  3.28845975e-02,  3.66100259e-02, -6.99369053e+02,
       -1.65865321e+03])

In [20]:
p_values = f_regression(X, y)[1]

In [21]:
summary = pd.DataFrame(data=['R&D Spend', 'Administration', 'Marketing Spend','California', 'Florida'], columns=['Features'])
summary['coefficients'] = regressor.coef_
summary['p-values'] = p_values.round(3)
summary

Unnamed: 0,Features,coefficients,p-values
0,R&D Spend,0.773467,0.0
1,Administration,0.032885,0.162
2,Marketing Spend,0.03661,0.0
3,California,-699.369053,0.312
4,Florida,-1658.653213,0.421


In [None]:
# Here we can remove columns such as states and administration