<a href="https://colab.research.google.com/github/agarwal-peeush/MachineLearning/blob/master/Python/Practice/MediaCompany_case_study.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Problem statement
A media company wants to know which factors are the important predictors of viewership for its show. We fit linear regression models to identify the variables that best explain the target variable (`Views_show`).

In [0]:
#@title
# Colab-only step: open the browser upload dialog and report each received file.
from google.colab import files

uploaded = files.upload()

# `uploaded` is keyed by file name; the length of each entry is reported in bytes.
for fn, content in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(content)))

In [0]:
#@title
# Use the first uploaded file as the dataset. Fail loudly when nothing was
# uploaded instead of hitting a NameError (or silently reusing a stale
# `filename` left in the kernel by an earlier run).
filename = next(iter(uploaded), None)
if filename is None:
  raise ValueError('No file was uploaded; re-run the upload cell and choose the dataset CSV.')

print(filename)

In [0]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "white" style to the plots drawn by the rest of the notebook.
sns.set(style="white")

In [0]:
# Load the uploaded CSV and discard the 'Unnamed: 7' column in a single chain.
# NOTE(review): 'Unnamed: 7' is presumably an empty trailing column in the file - confirm.
media_df = pd.read_csv(filename).drop(columns='Unnamed: 7')
media_df.head()

In [0]:
# Summary statistics of the freshly loaded data.
media_df.describe()

In [0]:
# Column dtypes and non-null counts.
media_df.info()

In [0]:
# Count the null entries in each column.
media_df.isnull().sum() # Author's observation: no missing values

In [0]:
# Parse the Date column into datetimes, then re-check the dtypes with info().
# NOTE(review): no explicit format is passed, so pandas infers it - confirm the
# day/month order matches the source file.
media_df['Date'] = pd.to_datetime(media_df['Date'])
media_df.info()

In [0]:
# Summary of the Date column on its own.
media_df['Date'].describe()

# EDA

In [0]:
# Total Views_show per Date; the result is a Series indexed by Date.
df = media_df.groupby('Date')['Views_show'].sum()
df.head()

In [0]:
# Plot "Views_Show" on timeseries plot
# `sns.tsplot` was deprecated and later removed from seaborn, so the Series is
# plotted through pandas instead. This also puts the actual dates (the Series
# index) on the x-axis, matching the 'Date' axis label.
ax = df.plot()
ax.set_xlabel('Date')
ax.set_ylabel('Views_Show')
plt.show()

In [0]:
# Daily totals of views and ad impressions, with Date restored as a regular column.
df = (
    media_df
    .groupby('Date')
    .agg({'Views_show': 'sum', 'Ad_impression': 'sum'})
    .reset_index()
)
# Show a preview, the object type and the column labels, one item per line.
print(df.head(), type(df), df.columns, sep='\n')

In [0]:
# Overlay the daily Views_show and Ad_impression series on one figure.
# The two series get separate y-axes: `twinx()` creates a second axis that
# shares the x-axis with the first plot.
ax = df.plot(x='Date',y='Views_show',legend=False)
ax2=ax.twinx()
df.plot(x='Date',y='Ad_impression',ax=ax2,legend=False,color='r')
# Per-axes legends are disabled above; a single figure-level legend is drawn instead.
ax.figure.legend()
plt.show()

In [0]:
# Pairwise relationship plots across the columns of media_df.
sns.pairplot(media_df)

In [0]:
# Derive Weekday,Weekend from Date
# Timestamp.weekday() counts from Monday (0) through Sunday (6).
media_df['Weekday'] = media_df['Date'].apply(lambda stamp: stamp.weekday())
def is_weekend(dayNum):
  """Return 1 when dayNum is Saturday (5) or Sunday (6), otherwise 0.

  dayNum uses the weekday numbering where Monday is 0 and Sunday is 6.
  """
  return 1 if dayNum in (5, 6) else 0
# Flag Saturday/Sunday rows by passing each Weekday number to is_weekend.
media_df['Weekend'] = media_df['Weekday'].map(is_weekend)
media_df.describe()

In [0]:
# Derive Days from Date
from datetime import date


# Reference day for the counter: Days is the offset of each Date from 2017-02-28.
# The date is wrapped in a pandas Timestamp because recent pandas versions raise
# a TypeError when a plain datetime.date is subtracted from a datetime64 column.
d0 = pd.Timestamp(date(2017, 2, 28))
media_df['Days'] = media_df['Date'] - d0
media_df.head()

In [0]:
# Inspect the column dtypes again now that derived columns have been added.
media_df.info()

In [0]:
# Convert the Days timedelta column into an integer number of days.
# `.dt.days` reads the day count directly instead of parsing the "N days" text
# form of a timedelta. pd.to_timedelta leaves a timedelta column unchanged and
# reads an already-converted integer column as whole days, so re-running this
# cell gives the same result.
media_df['Days'] = pd.to_timedelta(media_df['Days'], unit='D').dt.days
media_df.describe()

In [0]:
# Summary statistics again; the previous cell already ends with this same call.
media_df.describe()

In [0]:
# Rescale the large count columns: visitors and platform views are expressed in
# millions, ad impressions in billions.
million = 10**6
billion = 10**9
media_df['Visitors_million'] = media_df['Visitors'].div(million)
media_df['Views_platform_million'] = media_df['Views_platform'].div(million)
media_df['Ad_impression_billion'] = media_df['Ad_impression'].div(billion)
media_df.describe()

# Perform Linear Regression

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm

In [0]:
def build_evaluate_model(X_param, y_param, random_state_param, train_size=0.7):
  """Fit a linear regression on a train/test split and print/plot diagnostics.

  Steps: split the data, fit scikit-learn's LinearRegression on the training
  part, print the intercept, coefficients and test-set R-squared, plot actual
  vs predicted values and the error term for the test rows, draw a correlation
  heatmap of predictors plus target, and print a statsmodels OLS summary of the
  training data (used to read the p-values).

  Args:
    X_param: DataFrame of predictor columns.
    y_param: target values aligned with X_param. The notebook passes a
      single-column DataFrame; other shapes are untested here.
    random_state_param: seed forwarded to train_test_split for a reproducible
      split.
    train_size: fraction of rows used for training (default 0.7, the value
      that was previously hard-coded).

  Returns:
    None. All results are printed or plotted.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X_param, y_param, train_size=train_size, random_state=random_state_param)

  # Fit the model on the training rows only
  lr = LinearRegression()
  lr.fit(X_train, y_train)
  print('Model intercept:', lr.intercept_)
  print('Model coefs: ', lr.coef_)

  # Predict the held-out rows and score them
  y_pred = lr.predict(X_test)
  r_squared = r2_score(y_test, y_pred)
  print('R_Squared: ', r_squared)

  # Positional x-axis (0..n-1) shared by the two diagnostic plots below
  c = list(range(len(y_test)))

  # Plot Actual vs Predict; the legend tells the red and green lines apart
  plt.plot(c, y_test, color='r', label='Actual')
  plt.plot(c, y_pred, color='g', label='Predicted')
  plt.title('Actual vs prediction')
  plt.ylabel('Target var')
  plt.legend()
  plt.show()

  # Plot error term (actual minus predicted) for each test row
  plt.plot(c, y_test-y_pred, color='r')
  plt.title('Plotting error term')
  plt.ylabel("Error term")
  plt.show()

  # Plot heatmap of correlations between all predictors and the target
  # (computed on the full data, not only the training rows)
  sns.heatmap((pd.concat([X_param,y_param],axis=1)).corr(), annot=True)
  plt.show()

  # Check p-value: statsmodels OLS needs the constant column added explicitly
  X_train_sm = sm.add_constant(X_train)
  lr_1 = sm.OLS(y_train, X_train_sm).fit()
  print(lr_1.summary())
  

In [0]:
# Model 1: every candidate predictor.
X = media_df[[
    'Visitors_million',
    'Views_platform_million',
    'Ad_impression_billion',
    'Cricket_match_india',
    'Character_A',
    'Weekday',
    'Weekend',
    'Days',
]]
y = media_df[['Views_show']]

build_evaluate_model(X, y, random_state_param=10)

In [0]:
# Model 2: same predictors as the previous model, minus 'Visitors_million'.
X = media_df[[
    'Views_platform_million',
    'Ad_impression_billion',
    'Cricket_match_india',
    'Character_A',
    'Weekday',
    'Weekend',
    'Days',
]]
y = media_df[['Views_show']]

build_evaluate_model(X, y, random_state_param=10)

In [0]:
# Model 3: same predictors as the previous model, minus 'Weekday'.
X = media_df[[
    'Views_platform_million',
    'Ad_impression_billion',
    'Cricket_match_india',
    'Character_A',
    'Weekend',
    'Days',
]]
y = media_df[['Views_show']]

build_evaluate_model(X, y, random_state_param=10)

In [0]:
# Model 4: same predictors as the previous model, minus 'Ad_impression_billion'.
X = media_df[[
    'Views_platform_million',
    'Cricket_match_india',
    'Character_A',
    'Weekend',
    'Days',
]]
y = media_df[['Views_show']]

build_evaluate_model(X, y, random_state_param=10)

In [0]:
# Model 5: 'Ad_impression_billion' is back in, 'Cricket_match_india' is left out.
X = media_df[[
    'Views_platform_million',
    'Ad_impression_billion',
    'Character_A',
    'Weekend',
    'Days',
]]
y = media_df[['Views_show']]

build_evaluate_model(X, y, random_state_param=10)

In [0]:
# Model 6: same predictors as the previous model, minus 'Weekend'.
X = media_df[[
    'Views_platform_million',
    'Ad_impression_billion',
    'Character_A',
    'Days',
]]
y = media_df[['Views_show']]

build_evaluate_model(X, y, random_state_param=10)

In [0]:
# Interactive lookup: prints the documentation of datetime.date.
# NOTE(review): looks like a leftover from development; it does not feed into the analysis.
help(date)

In [0]:
# ISO week number of each Date. `Series.dt.week` was deprecated and then removed
# from pandas; `dt.isocalendar().week` is the documented replacement. The cast
# keeps Weeknumber a plain integer column.
media_df['Weeknumber'] = media_df['Date'].dt.isocalendar().week.astype(int)
media_df.head(10)

In [0]:
# Weekly totals of ad impressions and platform views.
# The columns are selected with a list (double brackets): selecting several
# GroupBy columns with a bare tuple of names is rejected by recent pandas versions.
media_df.groupby('Weeknumber')[['Ad_impression_billion','Views_platform_million']].sum()
# We can see here that the best 'Ad_impression_billion' was in week 14, and that it had dropped to almost half by week 20.
# So if we increase 'Ad_impression_billion' again, we would expect to see a rise in "Views_show" (the dependent variable).