In [1]:
# Libraries for data handling and plotting.
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to subsequent matplotlib figures.
sns.set()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# Silence every warning raised from here on; library warnings will not show up in cell output.
warnings.filterwarnings("ignore")

In [2]:
# Read the startup data from a CSV file given by relative path.
dataset = pd.read_csv(filepath_or_buffer='50_Startups.csv')
# Show the first five rows as a quick sanity check.
dataset.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [None]:
# Data Preprocessing
### 1) Handling missing values
### 2) Handling outliers - outlier treatment is only required when the tree is large, to avoid overfitting
### 3) Encoding - for tree-based models, always use the label-encoding method
### 4) Feature scaling - note that feature scaling is not needed for a decision tree (DT)
### 5) Imbalanced data - no need to worry about imbalanced data because this is a regression problem

In [3]:
# Count missing values per column.
dataset.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [5]:
# Print column dtypes, non-null counts and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [6]:
# Replace the State strings with integer category codes
# (in the recorded output: California -> 0, Florida -> 1, New York -> 2).
# NOTE(review): the codes are just labels, yet State is later passed to the
# linear model as an ordinary numeric feature - confirm this is intended.
dataset['State'] = dataset['State'].astype('category').cat.codes
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [7]:
# Separate the independent variables (features) from the dependent variable (target).
# Features: every column except the last one.
x = dataset.iloc[:, :-1]
# Target: selected with a list so it stays a single-column DataFrame.
y = dataset.loc[:, ['Profit']]

In [8]:
# Preview the first rows of the feature frame.
x.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,2
1,162597.7,151377.59,443898.53,0
2,153441.51,101145.55,407934.54,1
3,144372.41,118671.85,383199.62,2
4,142107.34,91391.77,366168.42,1


In [9]:
# Preview the first rows of the target frame.
y.head()

Unnamed: 0,Profit
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [10]:
# Preview the target frame again (same call as the preceding cell).
y.head()

Unnamed: 0,Profit
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [11]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1,
)

# Building DecisionTree Regressor Model

In [12]:
from sklearn.tree import DecisionTreeRegressor
# Fix the seed so the fitted tree, and every score derived from it, is reproducible:
# scikit-learn permutes the candidate features at each split, so ties between
# equally good splits can otherwise be broken differently from run to run.
dtregressor = DecisionTreeRegressor(random_state=1)
# Fit on the training split; the fitted estimator is the cell's displayed output.
dtregressor.fit(x_train, y_train)

In [13]:
# Predict on both the training and the test split with the dtregressor model.
y_pred_train, y_pred_test = (
    dtregressor.predict(features) for features in (x_train, x_test)
)

# Evaluate your model

In [14]:
from sklearn.metrics import r2_score

In [15]:
# r2_score is the coefficient of determination (R^2); the printed labels call it "Accuracy".
print("Training Accuracy :", r2_score(y_train, y_pred_train), end="\n\n")
print("Test Accuracy :", r2_score(y_test, y_pred_test))

Training Accuracy : 1.0

Test Accuracy : 0.901250196464153


In [16]:
# Using the cross-validation method (K-Fold method): 10 folds over the training split,
# scored with the estimator's default scorer.
# NOTE(review): 50 rows with test_size=0.25 leaves roughly 37 training rows, so each of
# the 10 validation folds holds only 3-4 rows - per-fold scores can swing widely.
from sklearn.model_selection import cross_val_score
Trainging_accuracy = cross_val_score(
    estimator=dtregressor,
    X=x_train,
    y=y_train,
    cv=10,
)
print(Trainging_accuracy)

[  0.40699815   0.80313477   0.67505641   0.8153543    0.97493126
   0.26438253   0.92982412   0.88695806 -21.22435468   0.92487939]


In [17]:
# Mean of the per-fold cross-validation scores.
# NOTE(review): in the recorded output one fold scores about -21.2, which alone drags this mean negative.
Trainging_accuracy.mean()

-1.4542835701348242

# Linear Regression Model

In [18]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline trained on the same training split as the tree model.
linear = LinearRegression()
linear.fit(X=x_train, y=y_train)

In [19]:
# Predict on both the training and the test split with the linear model.
y_pred_train_lr = linear.predict(X=x_train)
y_pred_test_lr = linear.predict(X=x_test)

In [20]:
# R^2 of the linear model on each split (labelled "Accuracy" in the printed output).
print("Training Accuracy :", r2_score(y_true=y_train, y_pred=y_pred_train_lr))
print()
print("Test Accuracy :", r2_score(y_true=y_test, y_pred=y_pred_test_lr))

Training Accuracy : 0.9419507593691141

Test Accuracy : 0.9616053937220065


# RandomForestRegressor Model

In [21]:
from sklearn.ensemble import RandomForestRegressor
# 500 trees; the seed is fixed so the bootstrap samples and feature draws - and
# therefore the reported scores - are reproducible across runs.
rf_regressor = RandomForestRegressor(n_estimators=500, random_state=1)
# y_train is a single-column DataFrame; flatten it to shape (n_samples,) because the
# forest expects a 1-D target and otherwise raises a column-vector DataConversionWarning.
rf_regressor.fit(x_train, y_train.values.ravel())

In [22]:
# Predict on both the training and the test split with the random forest model.
y_pred_train_rf, y_pred_test_rf = rf_regressor.predict(x_train), rf_regressor.predict(x_test)

In [23]:
# R^2 of the random forest on each split (labelled "Accuracy" in the printed output).
rf_train_r2 = r2_score(y_train, y_pred_train_rf)
rf_test_r2 = r2_score(y_test, y_pred_test_rf)
print("Training Accuracy :", rf_train_r2)
print()
print("Test Accuracy :", rf_test_r2)

Training Accuracy : 0.9860662822735389

Test Accuracy : 0.941600345067936
