In [125]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [126]:
# Read the startups CSV (expected in the working directory) into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='50_Startups.csv')
# Show the first five rows as the cell output.
dataset.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [127]:
# State - char variable (Encoding is required)

# Pre-processing
# part 1 : handling missing value
# part 2 : handling encoding part (label encoder, one hot encoder, dummy)
# part 3 : handling outlier
# part 4 : feature scaling - standardisation and normalisation
# part 5 : handling imbalance dataset

In [128]:
# part 1 : check missing data

In [129]:
# Per-column count of missing entries (isna is the same check as isnull).
dataset.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [130]:
# part 2 : Encoding
# Label encoder: cast State to a categorical dtype and, in the same chain,
# replace every label with its integer category code.
dataset['State'] = dataset['State'].astype('category').cat.codes

In [131]:
# Count how many rows carry each encoded State value.
dataset['State'].value_counts()

2    17
0    17
1    16
Name: State, dtype: int64

In [132]:
# Preview the first rows to confirm State now holds integer category codes.
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [133]:
# One Hot Encoder
# Expand State into one indicator column per encoded value (State_0, State_1, State_2).
# dtype=int pins the indicators to 0/1 integers on every pandas version;
# without it pandas >= 2.0 emits True/False columns instead.
dataset = pd.get_dummies(dataset, columns=['State'], dtype=int)

In [134]:
# Preview the frame after State has been expanded into indicator columns.
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_0,State_1,State_2
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,383199.62,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0


In [135]:
# Dummy Variables - n-1

In [136]:
# Keep n-1 indicator columns: State_0 becomes the implicit reference level.
dataset = dataset.drop(columns=['State_0'])

In [137]:
# Preview the frame after the State_0 indicator has been removed.
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_1,State_2
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [138]:
# Profit is the regression target; every other column is a predictor.
y = dataset['Profit']
x = dataset.drop(columns=['Profit'])

In [139]:
from sklearn.model_selection import train_test_split

# 75% of the rows go to training and the remainder to testing;
# the fixed seed makes the partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.75,
    random_state=501,
)

In [140]:
# Report the shape of each split, one per line, in the order
# x_train, x_test, y_train, y_test.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

(37, 5)
(13, 5)
(37,)
(13,)


In [141]:
from sklearn.metrics import r2_score 

## Linear Regression

In [142]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the training split. The fit call stays as the
# trailing expression so the fitted estimator remains the cell's displayed value.
lr_model = LinearRegression()
lr_model.fit(X=x_train, y=y_train)

In [143]:
# Predictions of the fitted linear model on the training and test splits.
y_predict_lr_train, y_predict_lr_test = lr_model.predict(x_train), lr_model.predict(x_test)

# for Linear regression
# R^2 on train, a blank gap, then R^2 on test (same layout as three separate prints).
print(
    r2_score(y_train, y_predict_lr_train),
    "\n",
    r2_score(y_test, y_predict_lr_test),
    sep="\n",
)

0.960389517551679


0.8242696380420218


### Grid Search

In [145]:

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
import numpy as np

# 15 log-spaced penalty strengths between 1e-5 and 1e8.
alpha_space = np.logspace(start=-5, stop=8, num=15)
param_grid = dict(alpha=alpha_space)
ridge = Ridge()

# Exhaustive 5-fold cross-validated search over every alpha. With no explicit
# scoring, the estimator's own score method is used.
# NOTE(review): the predictors are not scaled before Ridge is fit; confirm
# that is intended, since the penalty is sensitive to feature scale.
reg_cv = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=5)
reg_cv.fit(x_train, y_train)


In [146]:
# for Ridge regression
# Predict with the search object, which delegates to its refit best estimator.
y_predict_train_cv, y_predict_test_cv = reg_cv.predict(x_train), reg_cv.predict(x_test)

In [147]:
# R^2 on train, a blank gap, then R^2 on test for the grid-searched Ridge model.
print(
    r2_score(y_train, y_predict_train_cv),
    "\n",
    r2_score(y_test, y_predict_test_cv),
    sep="\n",
)

0.9594281838702149


0.8514673950311288


## RandomizedSearchCV

In [148]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Model
ridge = Ridge()

# Candidate penalties: 200 log-spaced alphas from 1e-5 to 1e8.
param_dist = dict(alpha=np.logspace(-5, 8, num=200))

# Randomized search: 100 parameter settings are sampled, each scored by
# 5-fold cross-validated R^2; the seed fixes which settings are drawn.
ridge_random_search = RandomizedSearchCV(
    estimator=ridge,
    param_distributions=param_dist,
    scoring='r2',      # or 'neg_mean_squared_error' for regression
    n_iter=100,
    cv=5,
    random_state=42,
)

# Fit the model
ridge_random_search.fit(x_train, y_train)


In [149]:
# for Ridge regression
# Predict on both splits with the randomized-search winner.
y_predict_train_rrs, y_predict_test_rrs = (
    ridge_random_search.predict(x_train),
    ridge_random_search.predict(x_test),
)

# R^2 on train, a blank gap, then R^2 on test.
print(
    r2_score(y_train, y_predict_train_rrs),
    "\n",
    r2_score(y_test, y_predict_test_rrs),
    sep="\n",
)

0.9594293429168641


0.8513793946797087


In [108]:
# Learned weights of the linear model (five values in the displayed output).
# NOTE(review): this cell's execution count (108) is lower than that of the
# cell that fits lr_model (142), so the displayed array may come from an
# earlier fit - re-run after fitting to confirm.
lr_model.coef_

array([ 8.15770421e-01,  1.84366878e-02,  2.95195307e-02, -3.11356703e+03,
        1.08176884e+02])

In [109]:
# Constant term of the fitted linear model.
# NOTE(review): executed as cell 109, before the fit in cell 142; the value
# shown may be stale - re-run after fitting to confirm.
lr_model.intercept_

42612.16966033258

In [110]:
# (R&D Spend * 8.15770421e-01) + (Administration * 1.84366878e-02) + (Marketing Spend * 2.95195307e-02) + (State_1 * -3.11356703e+03) + (State_2 * 1.08176884e+02) + 42612.16966033258 = Predicted Profit

# Decision Tree Regressor

In [111]:
from sklearn.tree import DecisionTreeRegressor

In [112]:
# Unconstrained decision tree used as the untuned baseline.
# random_state is fixed so repeated runs build the same tree; without a seed
# the fitted tree, and therefore its test score, can differ between runs.
# 42 matches the seed used by the tuned trees elsewhere in this notebook.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(x_train, y_train)

In [113]:
# Apply the fitted tree to the training split and then the test split.
y_predict_dt_train, y_predict_dt_test = map(dt_model.predict, (x_train, x_test))

In [114]:
# R^2 on train, a blank gap, then R^2 on test for the baseline tree.
print(
    r2_score(y_train, y_predict_dt_train),
    "\n",
    r2_score(y_test, y_predict_dt_test),
    sep="\n",
)

1.0


0.7832961377810379


# Random Forest Regressor

In [115]:
# from sklearn.ensemble import RandomForestRegressor

In [116]:
# rf_model = RandomForestRegressor()
# rf_model.fit(x_train, y_train)

In [117]:
# y_predict_rf_train = rf_model.predict(x_train)
# y_predict_rf_test = rf_model.predict(x_test)

In [118]:
# print(r2_score(y_train, y_predict_rf_train))
# print("\n")
# print(r2_score(y_test, y_predict_rf_test))

In [150]:
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Seeded base tree so every candidate evaluated by the search is repeatable.
dt = DecisionTreeRegressor(random_state=42)

# Candidate values; the exhaustive search tries every combination
# (6 * 4 * 4 * 3 * 2 = 576 settings).
param_grid = dict(
    max_depth=[3, 5, 10, 15, 20, None],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 5, 10],
    max_features=['sqrt', 'log2', None],
    criterion=['squared_error', 'friedman_mse'],
)

# 5-fold cross-validated grid search scored by R^2 on all available cores;
# fit returns the search object itself, so construction and fitting are chained.
grid_search = GridSearchCV(
    dt,
    param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
).fit(x_train, y_train)

# Winning tree exposed by the search.
best_dt = grid_search.best_estimator_

# Predictions of the winning tree on both splits.
y_predict_train, y_predict_test = best_dt.predict(x_train), best_dt.predict(x_test)

# Report fit quality on each split and the chosen hyperparameters.
print("Train R2:", r2_score(y_train, y_predict_train))
print("Test R2 :", r2_score(y_test, y_predict_test))
print("Best Parameters:", grid_search.best_params_)


Train R2: 0.9895639678280409
Test R2 : 0.8538277614134092
Best Parameters: {'criterion': 'squared_error', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5}


In [120]:
# y_predict_dt_cv_train = dt_cv.predict(x_train)
# y_predict_dt_cv_test = dt_cv.predict(x_test)

# print(r2_score(y_train, y_predict_dt_cv_train))
# print("\n")
# print(r2_score(y_test, y_predict_dt_cv_test))

In [151]:
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
import numpy as np

# Seeded base tree so every sampled candidate is repeatable.
dt = DecisionTreeRegressor(random_state=42)

# Discrete candidate values that the randomized search samples from.
param_dist = dict(
    max_depth=[3, 5, 10, 15, 20, None],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 5, 10],
    max_features=['sqrt', 'log2', None],
    criterion=['squared_error', 'friedman_mse'],
)

# Evaluate 30 sampled combinations with 5-fold cross-validated R^2 on all
# available cores; the seed fixes which combinations are drawn. fit returns
# the search object itself, so construction and fitting are chained.
random_search = RandomizedSearchCV(
    dt,
    param_dist,
    n_iter=30,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    random_state=42,
).fit(x_train, y_train)

# Winning tree exposed by the search.
best_dt_random = random_search.best_estimator_

# Predictions of the winning tree on both splits.
y_predict_train, y_predict_test = (
    best_dt_random.predict(x_train),
    best_dt_random.predict(x_test),
)

# Report fit quality on each split and the chosen hyperparameters.
print("Train R2:", r2_score(y_train, y_predict_train))
print("Test R2 :", r2_score(y_test, y_predict_test))
print("Best Parameters:", random_search.best_params_)


Train R2: 0.9869025842270561
Test R2 : 0.8256137396825216
Best Parameters: {'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': None, 'criterion': 'friedman_mse'}


Better Model Accuracy: Finding the right settings for your model can make it perform much better overall.

Avoids Overfitting or Underfitting: Tuning helps strike the right balance — your model won't just memorize the training data or perform poorly on new data.

Generalizes Well: A well-tuned model is more likely to work well on real-world, unseen data — not just on the data it was trained and tuned on.

Efficient Use of Resources: You can save time and computing power by avoiding trial-and-error or overcomplicated models.

Easier to Understand: With the right parameters, your model may become simpler and more interpretable, making it easier to explain to others.