## Multiple Linear Regression

The dataset contains information about startups. I build a multiple linear regression model to check whether profit depends linearly on the other variables, and to predict profit from the information in the dataset.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### Import the dataset

In [2]:
# Read the startups CSV (path is relative to the working directory) into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='50_Startups.csv')

In [3]:
# Preview the first five rows to check the column layout.
dataset.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Feature matrix: every column except the last one, as a NumPy array.
X = dataset.iloc[:, :-1].values

In [7]:
# Target vector: the last column (Profit in the preview above). Using -1
# instead of a hard-coded position keeps this in step with X, which is
# defined as "every column except the last".
y = dataset.iloc[:, -1].values

#### Encode the categorical data

In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [10]:
# Encoder used below to turn the text values in column 3 of X into integer codes.
labelencoder_X = LabelEncoder()

In [11]:
# Overwrite column 3 of X in place with the integer codes produced by the label encoder.
# presumably column 3 is the State column shown in the preview — TODO confirm
X[:, 3] = labelencoder_X.fit_transform(X[:, 3])

In [12]:
# `OneHotEncoder(categorical_features=...)` was deprecated in scikit-learn 0.20
# and removed in 0.22; choosing which column to encode is now the job of a
# ColumnTransformer. The one-hot columns for column 3 come first and the
# remaining columns are appended unchanged after them (remainder='passthrough'),
# which is the same column order the legacy encoder produced.
# sparse_threshold=1.0 keeps the stacked result sparse (and numeric), so the
# `.toarray()` call on the fitted output in the next cell keeps working.
from sklearn.compose import ColumnTransformer

onehotencoder = ColumnTransformer(
    transformers=[('state', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=1.0,
)

In [13]:
# Fit the encoder on X, transform it, and convert the result to a dense array.
# NOTE: X is overwritten with its encoded form, so re-running this cell
# would encode the already-encoded matrix again.
X = onehotencoder.fit_transform(X).toarray()

In [16]:
# Avoid the dummy variable trap by dropping the first column of X.
# presumably column 0 is one of the one-hot State indicators — TODO confirm
X = X[:, 1:]

#### Splitting the dataset into training set and test set

In [17]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; train_test_split now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

#### Fitting multiple linear regression to the training set

In [22]:
from sklearn.linear_model import LinearRegression

In [23]:
# Linear regression estimator with default settings; fitted in the next cell.
regressor = LinearRegression()

In [24]:
# Estimate the regression coefficients from the training split.
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

#### Predicting the test set results

In [46]:
# Predict the target for the held-out test rows with the fitted model.
y_pred = regressor.predict(X=X_test)

#### Building the optimal model using backward elimination

In [47]:
# The array-based `OLS` class used below is exposed by `statsmodels.api`;
# current releases of `statsmodels.formula.api` only provide the lowercase
# formula interface (`ols`), so `sm.OLS` would raise AttributeError there.
import statsmodels.api as sm

In [49]:
# Prepend a column of ones to X so the OLS fits below can estimate an
# intercept (statsmodels' OLS does not add a constant term by itself).
# The row count is read from X instead of being hard-coded to 50, so the
# cell works for a dataset of any size.
# NOTE: X is overwritten, so re-running this cell prepends another column of ones.
X = np.append(arr=np.ones((X.shape[0], 1), dtype=int), values=X, axis=1)

In [50]:
# Backward elimination, step 1: start from columns 0-5 of X.
# Column 0 is the column of ones prepended above.
# presumably columns 1-2 are State indicators and 3-5 the numeric spend columns — TODO confirm
X_opt = X[:, [0, 1, 2, 3, 4, 5]]

In [52]:
# Fit ordinary least squares of y on the selected columns.
# NOTE(review): the fitted result is not displayed in this cell; inspect it
# (e.g. regressor_OLS.summary()) before deciding which column to drop next.
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

In [56]:
# Step 2: same selection as the previous step with column 2 removed.
X_opt = X[:, [0, 1, 3, 4, 5]]

In [57]:
# Refit OLS on the reduced column set (columns 0, 1, 3, 4, 5 of X).
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

In [59]:
# Step 3: column 1 is removed as well, leaving columns 0, 3, 4 and 5.
X_opt = X[:, [0, 3, 4, 5]]

In [60]:
# Refit OLS on the reduced column set (columns 0, 3, 4, 5 of X).
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

In [61]:
# Step 4: column 4 is removed, leaving columns 0, 3 and 5.
X_opt = X[:, [0, 3, 5]]

In [62]:
# Refit OLS on the reduced column set (columns 0, 3, 5 of X).
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

In [64]:
# Step 5: column 5 is removed; only the column of ones (0) and column 3 remain.
X_opt = X[:, [0, 3]]

In [65]:
# Final refit: OLS of y on the column of ones plus column 3 of X.
# NOTE(review): the result is not displayed; call regressor_OLS.summary() to view it.
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()