In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
# Apply seaborn's "darkgrid" style to the plots drawn later in the notebook.
sns.set(style="darkgrid")

In [None]:
# Load the advertising dataset from the Kaggle input directory into a DataFrame.
data=pd.read_csv('/kaggle/input/advertisingcsv/Advertising.csv')
# Preview the first rows to check the columns that were read.
data.head()

# Understanding the Problem
* Input: Investment made on advertising modes: TV, Radio, Newspaper
* Output: Sales made by using the following investments on the advertising.
* Model Training: Models other than a linear model may work just as well, so we will evaluate three models: Linear Regression, Random Forest Regressor, and Gradient Boosting Regressor, and then choose the most suitable one.
* Metrics to be used: Mean Squared Error and $R^2$ Score

# EDA

In [None]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises a KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')
data.shape

In [None]:
# Count missing values in each column.
data.isna().sum()

In [None]:
# Summary statistics for every numeric column.
data.describe()

**The features span very different ranges, so scaling is needed.**

In [None]:
# Count rows that exactly repeat an earlier row.
data.duplicated().sum()

In [None]:
# Print column dtypes and non-null counts.
data.info()

**All columns are numeric, so one-hot encoding is NOT required.**

# Visualisation of Data

In [None]:
# Scatter plot with a fitted regression line of Sales against each
# advertising channel, one panel per channel on a 2x2 grid.
fig = plt.figure(figsize=(15, 8))
for panel, feature in enumerate(['TV', 'Radio', 'Newspaper'], start=1):
    ax = fig.add_subplot(2, 2, panel)
    sns.regplot(data=data, x=feature, y='Sales', ax=ax)

In [None]:
# Histogram (20 bins) with a KDE overlay for each advertising channel,
# one panel per channel on a 2x2 grid.
fig = plt.figure(figsize=(15, 8))
for panel, feature in enumerate(['TV', 'Radio', 'Newspaper'], start=1):
    ax = fig.add_subplot(2, 2, panel)
    sns.histplot(data=data, x=feature, bins=20, kde=True, ax=ax)

In [None]:
# Horizontal box plot for each channel and for the Sales target,
# filling all four panels of a 2x2 grid.
fig = plt.figure(figsize=(15, 8))
for panel, feature in enumerate(['TV', 'Radio', 'Newspaper', 'Sales'], start=1):
    ax = fig.add_subplot(2, 2, panel)
    sns.boxplot(data=data, x=feature, orient='h', ax=ax)

In [None]:
# Pairwise correlation matrix of all columns.
data.corr()

In [None]:
# Same correlation matrix drawn as a heatmap, with each coefficient written in its cell.
sns.heatmap(data.corr(), annot=True)

**Analysis**
* As TV and Radio advertising increase, Sales tend to increase as well; the relationship is strongest for TV.
* Newspaper investment in the 40-80 range seems ideal, since increasing Newspaper investment does not guarantee better Sales. The correlation coefficient between Newspaper and Sales is approximately 0.23, indicating a weak positive correlation.
* Most of the investment appears to go to TV, and the plots above suggest TV is also the most beneficial channel; however, the amounts invested in TV advertising are large (roughly 0-300) compared with the sales values (roughly 1-27).
* Overall, the data shows that the amounts invested in advertising are larger than the sales figures.
* Data doesn't seem to have outliers.

# Feature Engineering

In [None]:
# Add the combined spend across the three channels as an extra feature.
# NOTE(review): this column is an exact sum of TV, Radio and Newspaper, so it
# is linearly dependent on them wherever all four columns are used together.
data = data.assign(
    Total_advertising=data['TV'] + data['Radio'] + data['Newspaper']
)
data.head()

In [None]:
# Recompute the correlation matrix now that Total_advertising has been added.
data.corr() #strong positive correlation of new features to Sales

# Scaling and Splitting of Data

In [None]:
# Feature matrix: every column except the target, as a 2-D NumPy array.
X = data.drop(columns=['Sales']).to_numpy()
# Target vector: Sales as an independent 1-D NumPy array.
Y = data['Sales'].to_numpy(copy=True)
X[:5], Y[:5]

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)
x_train.shape, x_test.shape

In [None]:
# Standardise the features to zero mean and unit variance.
# The scaler is fitted on the training split ONLY; the test split is then
# transformed with those same training statistics. Calling fit_transform on the
# test split would re-estimate mean/std from test data, so the models would be
# evaluated on features scaled differently from the ones they were trained on.
scaler = StandardScaler()
x_train_scale = scaler.fit_transform(x_train)
x_test_scale = scaler.transform(x_test)
x_train_scale[:5]

# Model Training
**Models:**
1. Linear Regression
2. Random Forest Regressor
3. Gradient Boosting Regressor

**Linear Regression**

In [None]:
# Fit a linear regression on the scaled training features.
lin_model = LinearRegression().fit(x_train_scale, y_train)

# Predict on both splits so train and test error can be compared.
y_train_pred, y_test_pred = (
    lin_model.predict(features) for features in (x_train_scale, x_test_scale)
)

# MSE and R2 for each split.
mse_train, r2_train = mean_squared_error(y_train, y_train_pred), r2_score(y_train, y_train_pred)
mse_test, r2_test = mean_squared_error(y_test, y_test_pred), r2_score(y_test, y_test_pred)

print(
    'Linear Regression Evaluation =>\n\tTrain:\n\t\tMSE: {}\n\t\tR2 Score: {}\n\tTest:\n\t\tMSE: {}\n\t\tR2 Score: {}'
    .format(mse_train, r2_train, mse_test, r2_test)
)

**Random Forest Regressor**

In [None]:
# Fit a random forest (seeded for repeatability) on the scaled training features.
rfr_model = RandomForestRegressor(random_state=42).fit(x_train_scale, y_train)

# Predict on both splits so train and test error can be compared.
y_train_pred, y_test_pred = (
    rfr_model.predict(features) for features in (x_train_scale, x_test_scale)
)

# MSE and R2 for each split.
mse_train, r2_train = mean_squared_error(y_train, y_train_pred), r2_score(y_train, y_train_pred)
mse_test, r2_test = mean_squared_error(y_test, y_test_pred), r2_score(y_test, y_test_pred)

print(
    'Random Forest Regressor Evaluation =>\n\tTrain:\n\t\tMSE: {}\n\t\tR2 Score: {}\n\tTest:\n\t\tMSE: {}\n\t\tR2 Score: {}'
    .format(mse_train, r2_train, mse_test, r2_test)
)

**Gradient Boosting Regressor**

In [None]:
# Fit gradient boosting (squared-error loss, seeded) on the scaled training features.
gbr_model = GradientBoostingRegressor(random_state=42, loss='squared_error').fit(
    x_train_scale, y_train
)

# Predict on both splits so train and test error can be compared.
y_train_pred, y_test_pred = (
    gbr_model.predict(features) for features in (x_train_scale, x_test_scale)
)

# MSE and R2 for each split.
mse_train, r2_train = mean_squared_error(y_train, y_train_pred), r2_score(y_train, y_train_pred)
mse_test, r2_test = mean_squared_error(y_test, y_test_pred), r2_score(y_test, y_test_pred)

print(
    'Gradient Boosting Regressor Evaluation =>\n\tTrain:\n\t\tMSE: {}\n\t\tR2 Score: {}\n\tTest:\n\t\tMSE: {}\n\t\tR2 Score: {}'
    .format(mse_train, r2_train, mse_test, r2_test)
)

# Models Result Analysis
The Gradient Boosting Regressor seems best: it performs better on the test set with the same number of decision trees (default n_estimators=100), since each tree learns from the errors of the previous ones. <br>
The Random Forest Regressor performs best on the training set, but it could be **over-fitting** the training data.<br>
**Gradient Boosting Regressor chosen!**

In [None]:
# Actual vs predicted Sales on the test split for the chosen model.
# Predictions are recomputed from gbr_model here rather than read from the
# shared y_test_pred variable, which holds the output of whichever model cell
# ran last and would silently mislabel the plot if cells are re-run out of order.
gbr_test_pred = gbr_model.predict(x_test_scale)

plt.figure(figsize=(14, 4))
sns.regplot(x=y_test, y=gbr_test_pred, color='purple')
# The target is Sales, so the title names Sales rather than "Selling Price".
plt.title('Actual vs Predicted Sales (Gradient Boosting Regressor)')
plt.xlabel('Actual Test Values')
plt.ylabel('Predicted Test Values');