In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
import pandas as pd
df = pd.read_csv('50_Startups.csv')

In [3]:
df.head()

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     object 
 4   PROFIT  50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [6]:
s = df.isna().sum()
s

RND       0
ADMIN     0
MKT       0
STATE     0
PROFIT    0
dtype: int64

## Seperate X and Y features(Profit)

In [7]:
X = df.drop(columns=['PROFIT'])
Y = df[['PROFIT']]

In [8]:
X.head()

Unnamed: 0,RND,ADMIN,MKT,STATE
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [10]:
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


## Seperate Cat and Con for X

In [11]:
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes!='object'])

In [12]:
cat

['STATE']

In [13]:
con

['RND', 'ADMIN', 'MKT']

## Creating a preprocessing Pipeline

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [15]:
num_pipe = Pipeline(steps=[('impute', SimpleImputer(strategy='mean')),
                           ('scaler', StandardScaler())])

cat_pipe = Pipeline(steps=[('impute', SimpleImputer(strategy='most_frequent')),
                           ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

pre = ColumnTransformer([('num', num_pipe, con),
                         ('cat', cat_pipe, cat)]).set_output(transform='pandas')

In [18]:
x_pre = pre.fit_transform(X)
x_pre.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
0,2.016411,0.560753,2.153943,0.0,0.0,1.0
1,1.95586,1.082807,1.9236,1.0,0.0,0.0
2,1.754364,-0.728257,1.626528,0.0,1.0,0.0
3,1.554784,-0.096365,1.42221,0.0,0.0,1.0
4,1.504937,-1.079919,1.281528,0.0,1.0,0.0


## Apply train and test Split

In [19]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x_pre, Y, test_size=0.2, random_state=21)


In [20]:
xtrain.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
30,-0.258074,-0.205629,-0.990357,0.0,1.0,0.0
21,0.102724,1.169186,0.732788,0.0,0.0,1.0
19,0.279442,1.159837,-1.743127,0.0,0.0,1.0
11,0.593085,-1.06554,0.319834,1.0,0.0,0.0
22,0.006007,0.05185,0.762376,0.0,1.0,0.0


In [21]:
xtest.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
7,1.245057,0.87198,0.932186,0.0,1.0,0.0
44,-1.134305,1.206419,-1.509074,1.0,0.0,0.0
43,-1.281134,0.217682,-1.449605,0.0,0.0,1.0
25,-0.199312,0.656489,-0.603517,1.0,0.0,0.0
14,1.017181,1.269199,0.375742,0.0,1.0,0.0


In [22]:
ytrain.head()

Unnamed: 0,PROFIT
30,99937.59
21,111313.02
19,122776.86
11,144259.4
22,110352.25


In [23]:
ytest.head()

Unnamed: 0,PROFIT
7,155752.6
44,65200.33
43,69758.98
25,107404.34
14,132602.65


In [24]:
xtrain.shape

(40, 6)

In [25]:
xtest.shape

(10, 6)

## Create a KNN Regression Model

In [37]:
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor(n_neighbors=10)
model.fit(xtrain, ytrain)

In [38]:
model.score(xtrain, ytrain)

0.7313615687197589

In [39]:
model.score(xtest, ytest)

0.7531046602989124

## Hyperparameter Tuning with GridSearchCV

In [29]:
params = {"n_neighbors":[2,3,4,5,6,7,8,9,10]}

In [40]:
from sklearn.model_selection import GridSearchCV
kr = KNeighborsRegressor()
gscv = GridSearchCV(kr, param_grid=params, cv=5, scoring='neg_mean_absolute_error')
gscv.fit(xtrain, ytrain)

In [41]:
gscv.best_score_

-16171.048625

In [42]:
best_kr = gscv.best_estimator_
best_kr

In [43]:
best_kr.score(xtrain, ytrain)

0.9106191272125944

In [44]:
best_kr.score(xtest, ytest)

0.8275361964808616

In [45]:
ypred_tr = best_kr.predict(xtrain)
ypred_ts = best_kr.predict(xtest)

In [46]:
ypred_tr[0:5]

array([[ 98358.255],
       [131762.395],
       [111890.75 ],
       [147009.68 ],
       [117309.575]])

In [47]:
ytrain.head()

Unnamed: 0,PROFIT
30,99937.59
21,111313.02
19,122776.86
11,144259.4
22,110352.25


In [48]:
ypred_ts[0:5]

array([[132926.21],
       [ 56054.62],
       [ 39803.74],
       [ 97070.32],
       [123659.53]])

In [49]:
ytest.head()

Unnamed: 0,PROFIT
7,155752.6
44,65200.33
43,69758.98
25,107404.34
14,132602.65
