In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import RFE
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

# Read the data in df
media = pd.read_csv('/content/mediacompany.csv')
media['Date'] = pd.to_datetime(media['Date'])
df = media.drop(['Unnamed: 7', 'Date'],axis = 1)


# Step-1: Perform EDA / Data Cleaning



# Split DF into train/test data frames
train, test = train_test_split(df, train_size = 0.7, test_size = 0.3, random_state = 100)




# Define Feature Scaling instance
scaler = MinMaxScaler()

# Step-2: Define features for X_train/y_train sets
x_cols = train.columns.drop('Views_show')
y_cols = ['Views_show']

# Extract the columns to get X_train & y_train
X_train = train[x_cols]
y_train = train[y_cols]

# Step-3
# Part-1 : Perform Linear Regression using Ordinary Least Squares (OLS) in Statsmodels Library
X_train_const = sm.add_constant(X_train)
X_train_scaled = scaler.fit_transform(X_train_const)

lm_OLS = sm.OLS(y_train,X_train_scaled).fit()
print(lm_OLS.summary())

# Part-2 : Perform Linear Regression using Scikit-Learn (sklearn) Library
lm_SKL = LinearRegression()
lm_SKL.fit(X_train_scaled[:,1:], y_train)


# We can also use RFE in sklearn for feature selection
rfe = RFE(estimator=lm_SKL, n_features_to_select = 4).fit(X_train_scaled,y_train)
print("\nRFE Estimator : ")
print(list(zip(X_train.columns,rfe.support_,rfe.ranking_)))

# Part-3 : Using SGD Regressor
from sklearn.linear_model import SGDRegressor
lm_SGD = SGDRegressor()
lm_SGD.fit(X_train_scaled[:,1:], y_train)


# Check VIF for X_train (unscaled version)
X_vif = X_train
vif = pd.DataFrame()
vif['Features'] = X_vif.columns
vif['VIF'] = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)

print("\nVIF Values:\n",vif)




# Train the model using above by iterating and changing the features at Step-1 until a good model with high r2 is reached


# Now, make predictions using the model and check error
X_test = test[x_cols]
y_test = test[y_cols]

# Apply Tranformation used before in feature scaling, after adding a constant as done before
X_test_const = sm.add_constant(X_test)
X_test_transf = scaler.transform(X_test_const)

# USing model to make predictions and check error
y_test_pred_OLS = lm_OLS.predict(X_test_transf)
y_test_pred_SKL = lm_SKL.predict(X_test_transf[:, 1:])
y_test_pred_SGD = lm_SGD.predict(X_test_transf[:, 1:])


# Plotting Error Distribution
#error = y_test - y_test_pred
#sns.distplot(error)
#plt.show()




                                 OLS Regression Results                                
Dep. Variable:             Views_show   R-squared (uncentered):                   0.915
Model:                            OLS   Adj. R-squared (uncentered):              0.907
Method:                 Least Squares   F-statistic:                              110.1
Date:                Tue, 31 Oct 2023   Prob (F-statistic):                    4.27e-26
Time:                        07:53:32   Log-Likelihood:                         -716.04
No. Observations:                  56   AIC:                                      1442.
Df Residuals:                      51   BIC:                                      1452.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------