In [1]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from math import log
#warnings.filterwarnings('ignore')

In [2]:
placement_train_data = pd.read_csv(filepath_or_buffer = 'Data/placement_data_to_use.csv')
demographics_train_data = pd.read_csv(filepath_or_buffer='Data/demographics_data_to_use.csv')

In [3]:
placement_train_data.drop(['Ad set name','Date'], axis=1, inplace = True)
demographics_train_data.drop(['Ad set name','Date'], axis=1, inplace = True)

In [4]:
'''
Converting age from catagorical to numeric variable
'''
demographics_train_data['Age'] = demographics_train_data['Age'].map({'65+': '65-85','18-24':'18-24',
                 '25-34':'25-34','35-44':'35-44','45-54':'45-54','55-64':'55-64',
                 '13-17':'13-17'})

low = []
high = []
for i in demographics_train_data['Age']:
    low.append(int(i[0:2]))
    high.append(int(i[3:5]))
demographics_train_data['low_age'] = low
demographics_train_data['high_age'] = high
demographics_train_data.drop(['Age'], axis=1, inplace=True)

In [5]:
'''
One hot encoding
'''
def one_hot(data):
    for col_name in data.columns:
        if(data[col_name].dtype == 'object'):
            one_hot = pd.get_dummies(data[col_name])
            data.drop([col_name], axis = 1, inplace = True)
            data = data.join(one_hot)
    return data

placement_train_data = one_hot(placement_train_data)
demographics_train_data = one_hot(demographics_train_data)

In [6]:
demographics_train_with_feature = demographics_train_data
demographics_train_without_feature = demographics_train_data.drop('Total investment',axis=1)

placement_train_with_feature = placement_train_data
placement_train_without_feature = placement_train_data.drop('Total investment',axis=1)

In [7]:
'''
train, validation split
'''
demographics_with_val_data = demographics_train_with_feature[int(len(demographics_train_with_feature)*0.7):]
demographics_with_train_data = demographics_train_with_feature[:int(len(demographics_train_with_feature)*0.7)]

placement_with_val_data = placement_train_with_feature[int(len(placement_train_with_feature)*0.7):]
placement_with_train_data = placement_train_with_feature[:int(len(placement_train_with_feature)*0.7)]

demographics_without_val_data = demographics_train_without_feature[int(len(demographics_train_without_feature)*0.7):]
demographics_without_train_data = demographics_train_without_feature[:int(len(demographics_train_without_feature)*0.7)]

placement_without_val_data = placement_train_without_feature[int(len(placement_train_without_feature)*0.7):]
placement_without_train_data = placement_train_without_feature[:int(len(placement_train_without_feature)*0.7)]

In [8]:
'''
Splitting data into X,Y
'''
placement_train_with_Y = placement_with_train_data['Ad revenue']
placement_train_with_X = placement_with_train_data.drop('Ad revenue',axis=1)

demographics_train_with_Y = demographics_with_train_data['Ad revenue']
demographics_train_with_X = demographics_with_train_data.drop('Ad revenue',axis=1)

placement_val_with_Y = placement_with_val_data['Ad revenue']
placement_val_with_X = placement_with_val_data.drop('Ad revenue',axis=1)

demographics_val_with_Y = demographics_with_val_data['Ad revenue']
demographics_val_with_X = demographics_with_val_data.drop('Ad revenue',axis=1)



placement_train_without_Y = placement_without_train_data['Ad revenue']
placement_train_without_X = placement_without_train_data.drop('Ad revenue',axis=1)

demographics_train_without_Y = demographics_without_train_data['Ad revenue']
demographics_train_without_X = demographics_without_train_data.drop('Ad revenue',axis=1)

placement_val_without_Y = placement_without_val_data['Ad revenue']
placement_val_without_X = placement_without_val_data.drop('Ad revenue',axis=1)

demographics_val_without_Y = demographics_without_val_data['Ad revenue']
demographics_val_without_X = demographics_without_val_data.drop('Ad revenue',axis=1)

In [9]:
placement_train_without_Y.shape

(2508,)

In [10]:
'''
Finds RMSE
'''
def RMSE(y_predicted,y):
    n = len(y_predicted)
    diff = [m - n for m,n in zip(y_predicted,y)]
    diff = [x**2 for x in diff]
    diff = sum(diff)
    diff = diff/n
    diff = diff**0.5
    return diff

In [15]:
'''
Check which feature is better using linear regression
'''

from sklearn.linear_model import LinearRegression

Model = LinearRegression()

Model.fit(placement_train_with_X,placement_train_with_Y)
print('Placement RMSE with feature is: ' + str(RMSE(Model.predict(placement_train_with_X),placement_train_with_Y)))

Model.fit(demographics_train_with_X,demographics_train_with_Y)
print('Demographics RMSE with feature is: ' + str(RMSE(Model.predict(demographics_train_with_X),demographics_train_with_Y)))

Model.fit(placement_train_without_X,placement_train_without_Y)
print('Placement RMSE without feature is: ' + str(RMSE(Model.predict(placement_train_without_X),placement_train_without_Y)))

Model.fit(demographics_train_without_X,demographics_train_without_Y)
print('Demographics RMSE without feature is: ' + str(RMSE(Model.predict(demographics_train_without_X),demographics_train_without_Y)))

Placement RMSE with feature is: 17310.39421285701
Demographics RMSE with feature is: 10605.074034148593
Placement RMSE without feature is: 17831.47784399221
Demographics RMSE without feature is: 10611.811722854558
