# ADVERTISING DATA

The dataset contains statistics about the sales of a product in 200 different markets, together with advertising budgets in each of these markets for different media channels: TV, radio and newspaper.
The sales are in thousands of units and the budget is in thousands of dollars.
To predict the sales of the product in a particular market based on the advertising budgets for different media channels (TV, radio and newspaper) in that market using K-Nearest Neighbors (KNN) regression.

In [13]:
#importing the necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.impute import SimpleImputer
from sklearn import set_config


#reading the dataset
df = pd.read_csv("Advertising.csv")

#dropping the zeroth index as it is of no use in the dataset
df.drop(df.columns[[0]], axis=1, inplace=True)

#splitting data into independent and dependant or target variables
X = df.drop(['Sales'], axis=1)
y = df['Sales']


#splitting the data into train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


#preprocessing pipeline
preprocessor = Pipeline(steps=[
    ('imputation_median', SimpleImputer(missing_values='NaN', strategy='median')),
    ('scaler', StandardScaler())
])


#define the machine learning model pipeline
knn = KNeighborsRegressor()
knn_pipeline = Pipeline([
    ('imputation_median', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor())
])



#defining the parameter grid 
from sklearn.model_selection import GridSearchCV
param_grid = {
    'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21]
}
# Define the grid search object
grid_search = GridSearchCV(knn_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')


#fit the model
grid_search.fit(X_train, y_train)


#finding best parameter
print('Best parameters: %s' % grid_search.best_params_)

Best parameters: {'knn__n_neighbors': 4}


In [14]:
#fitting the model
knn_pipeline.fit(X_train,y_train)

In [15]:
y_pred = knn_pipeline.predict(X_test)
y_pred

array([17.8 , 21.62, 19.94, 11.  , 22.3 , 13.16, 22.34,  7.8 , 12.66,
       14.86,  9.14,  7.74, 13.6 ,  7.92, 10.48, 12.2 ,  8.66, 16.26,
       11.44, 18.56, 20.66, 11.62,  7.74, 22.48,  9.9 ,  7.9 , 21.74,
       12.92, 10.7 ,  7.92, 11.98, 10.26, 21.7 , 11.62, 14.62, 20.24,
        7.74, 20.2 , 11.  ,  6.76,  9.84, 12.36, 10.36,  8.66, 12.66,
        7.44, 10.66, 14.22, 10.74, 11.94, 14.4 , 11.36, 12.12, 10.44,
        9.06, 10.98, 11.  , 25.3 ,  7.24, 11.92])

In [10]:
best_estimator = grid_search.best_estimator_
scores = cross_val_score(best_estimator, X_train, y_train, cv=5)
print('Model accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Model accuracy: 0.911 (0.038)
