In [None]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [None]:
# Read the two CSV extracts; paths are relative to the notebook's working directory.
demographics_data = pd.read_csv('Data/demographics_data_to_use.csv')
placement_data = pd.read_csv('Data/placement_data_to_use.csv')

In [None]:
# Discard the columns that are not used as model inputs.
demographics_data = demographics_data.drop(columns=['Ad set name', 'Date'])

In [None]:
# Discard the columns that are not used as model inputs.
placement_data = placement_data.drop(columns=['Ad set name', 'Placement', 'Date'])

In [None]:
# Column lists for the baseline ("without new feature") frames: every column
# except 'Total investment'. list.remove raises ValueError if that column is
# missing, which surfaces a schema mismatch immediately.
less_placement = list(placement_data.columns)
less_placement.remove('Total investment')
less_demographics = list(demographics_data.columns)
less_demographics.remove('Total investment')

# .copy() makes the baselines explicitly independent frames. Later cells add
# and drop columns on them; without the copy pandas treats those writes as
# chained assignment on a selection of the full frame (SettingWithCopyWarning,
# which the blanket warnings filter at the top of the notebook would hide).
placement_without_new_feature = placement_data[less_placement].copy()
demographics_without_new_feature = demographics_data[less_demographics].copy()

In [None]:
'''
Converting age from categorical to numeric variables
'''
def _split_age_bracket(frame):
    """Replace the 'Age' bracket column with integer 'low_age' / 'high_age' columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain an 'Age' column holding labels of the form 'LOW-HIGH'
        (e.g. '25-34') or the open-ended label '65+'.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'Age' and with 'low_age' and 'high_age' appended
        as the last two columns. The input frame is not modified.

    Raises
    ------
    ValueError
        If any label is neither '65+' nor of the form 'LOW-HIGH'.
    """
    # '65+' has no upper bound in the data; 85 is the cap chosen by this
    # notebook so that the bracket can be expressed as two numbers.
    brackets = frame['Age'].replace({'65+': '65-85'})

    # Parse both bounds in one vectorised pass; rows that do not match become NaN.
    bounds = brackets.str.extract(r'^(\d+)-(\d+)$')
    unparsed = brackets[bounds.isna().any(axis=1)]
    if not unparsed.empty:
        raise ValueError('Unrecognised Age labels: '
                         + str(sorted(unparsed.astype(str).unique())))

    return frame.drop(columns=['Age']).assign(
        low_age=bounds[0].astype(int),
        high_age=bounds[1].astype(int),
    )


# Apply the same conversion to the full frame and to the baseline frame.
demographics_data = _split_age_bracket(demographics_data)
demographics_without_new_feature = _split_age_bracket(demographics_without_new_feature)

In [None]:
'''
One hot encoding
'''
def one_hot(data):
    """One-hot encode every object/string column of a DataFrame.

    Each qualifying column is removed and its indicator columns (as produced
    by ``pd.get_dummies``, named after the category values) are appended at
    the end of the frame, in the order the source columns appeared.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is NOT modified; the previous implementation
        dropped columns from the caller's frame in place.

    Returns
    -------
    pandas.DataFrame
        The encoded frame. If no column qualifies, ``data`` itself is returned.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.join`` when an indicator column name
        collides with an existing column (e.g. two categorical columns that
        share a category value).
    """
    encoded = data
    for col_name in data.columns:
        col_dtype = data[col_name].dtype
        # Accept both classic object columns and pandas' dedicated string dtypes.
        if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
            indicators = pd.get_dummies(encoded[col_name])
            # Non-inplace drop: rebinding `encoded` leaves the caller's frame intact.
            encoded = encoded.drop(columns=[col_name]).join(indicators)
    return encoded

# Placement data: full frame and the baseline without the extra feature.
placement_data = one_hot(placement_data)
placement_without_new_feature = one_hot(placement_without_new_feature)

# Demographics data: full frame and the baseline without the extra feature.
demographics_data = one_hot(demographics_data)
demographics_without_new_feature = one_hot(demographics_without_new_feature)

In [None]:
'''
Splitting data into X,Y
'''
def _features_and_target(frame):
    """Return (X, Y): all columns except the ROAS column, and the ROAS column itself."""
    target_column = 'Return on ad spend (ROAS)'
    return frame.drop(columns=[target_column]), frame[target_column]


# Baseline frames (without the extra feature).
placement_without_X, placement_without_Y = _features_and_target(placement_without_new_feature)
demographics_without_X, demographics_without_Y = _features_and_target(demographics_without_new_feature)

# Full frames.
placement_X, placement_Y = _features_and_target(placement_data)
demographics_X, demographics_Y = _features_and_target(demographics_data)

In [None]:
'''
Splitting data into train,val,test
'''
def _sequential_split(data):
    """Cut ``data`` by row position into (train, val, test) = first 70%, next 15%, rest.

    Rows are taken in their existing order; nothing is shuffled.
    """
    n_rows = len(data)
    train_end = int(n_rows * 0.7)
    val_end = int(n_rows * 0.85)
    return data.iloc[:train_end], data.iloc[train_end:val_end], data.iloc[val_end:]


# Placement, baseline (without the extra feature).
placement_without_train_X, placement_without_val_X, placement_without_test_X = _sequential_split(placement_without_X)
placement_without_train_Y, placement_without_val_Y, placement_without_test_Y = _sequential_split(placement_without_Y)

# Demographics, baseline (without the extra feature).
demographics_without_train_X, demographics_without_val_X, demographics_without_test_X = _sequential_split(demographics_without_X)
demographics_without_train_Y, demographics_without_val_Y, demographics_without_test_Y = _sequential_split(demographics_without_Y)

# Placement, full feature set.
placement_train_X, placement_val_X, placement_test_X = _sequential_split(placement_X)
placement_train_Y, placement_val_Y, placement_test_Y = _sequential_split(placement_Y)

# Demographics, full feature set.
demographics_train_X, demographics_val_X, demographics_test_X = _sequential_split(demographics_X)
demographics_train_Y, demographics_val_Y, demographics_test_Y = _sequential_split(demographics_Y)

In [None]:
'''
Finds RMSE
'''
def RMSE(y_predicted,y):
    """Root-mean-square error between predictions and observed values.

    Parameters
    ----------
    y_predicted : iterable of numbers
        Predicted values.
    y : iterable of numbers
        Observed values, in the same order as ``y_predicted``.

    Returns
    -------
    float
        sqrt(mean((y_predicted - y) ** 2)).

    Raises
    ------
    ValueError
        If the two inputs differ in length. Previously the longer one was
        silently truncated by ``zip`` while the mean still divided by
        ``len(y_predicted)``, giving a wrong result.
    ZeroDivisionError
        If both inputs are empty.
    """
    # Materialise once so that plain iterators are accepted and lengths can be compared.
    predicted = list(y_predicted)
    observed = list(y)
    if len(predicted) != len(observed):
        raise ValueError('y_predicted and y must have the same length, got '
                         + str(len(predicted)) + ' and ' + str(len(observed)))

    squared_errors = [(p - o) ** 2 for p, o in zip(predicted, observed)]
    return (sum(squared_errors) / len(predicted)) ** 0.5

In [None]:
'''
No penalty regression to test difference between datasets
'''
'''
Since here no validation is required, we use validation as testing set.
'''
from sklearn.linear_model import LinearRegression


def _fit_and_score(train_X, train_Y, eval_X, eval_Y):
    """Fit a fresh LinearRegression on the training split and score it on the evaluation split.

    Returns a ``(model, r2)`` tuple, where ``r2`` is ``model.score(eval_X, eval_Y)``.
    A new estimator is created on every call: ``fit`` returns the estimator
    itself, so refitting one shared instance would leave every model variable
    pointing at the last fit.
    """
    model = LinearRegression().fit(train_X, train_Y)
    return model, model.score(eval_X, eval_Y)


# R^2 is reported on the validation split, which serves as the held-out set
# for this comparison. Training-set R^2 of ordinary least squares cannot
# decrease when a column is added, so it cannot show whether the extra
# feature helps.
Placement_without_model, placement_without_r2 = _fit_and_score(
    placement_without_train_X, placement_without_train_Y,
    placement_without_val_X, placement_without_val_Y)
print('Placement without model R^2: '+str(placement_without_r2))

Demographics_without_model, demographics_without_r2 = _fit_and_score(
    demographics_without_train_X, demographics_without_train_Y,
    demographics_without_val_X, demographics_without_val_Y)
print('Demographics without model R^2: '+str(demographics_without_r2))

print()

Placement_model, placement_r2 = _fit_and_score(
    placement_train_X, placement_train_Y,
    placement_val_X, placement_val_Y)
print('Placement model R^2: '+str(placement_r2))

Demographics_model, demographics_r2 = _fit_and_score(
    demographics_train_X, demographics_train_Y,
    demographics_val_X, demographics_val_Y)
print('Demographics model R^2: '+str(demographics_r2))

# Kept for backward compatibility: this name used to end up bound to the
# last estimator fitted in this cell.
Model = Demographics_model

In [None]:
'''
We can conclude that the feature added improves predictions. Thus, we shall use it.
'''

In [None]:
'''
Test lasso regression
'''
from sklearn.linear_model import Lasso

# Candidate penalty values: 1, then 10**2 through 10**7 (10 itself is not in the grid).
# Presumably these are the Lasso regularisation strengths to try — confirm when the cell is completed.
lambda_list = [1] + [10 ** exponent for exponent in range(2, 8)]



In [None]:
'''
Test ridge regression
'''

In [None]:
'''
Test elastic nets
'''

In [None]:
'''
Test KNN regression
'''

In [None]:
'''
Test Kernel Regression
'''