### Inventory Scoring Mode

#### Import Packages

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from category_encoders import OrdinalEncoder

#### Load the datasets

In [2]:
campaign_df = pd.read_csv("../data/campaign_data.csv")
creative_df = pd.read_csv("../data/creative_data.csv")
inventory_df = pd.read_csv("../data/Inventory_data.csv")

##### Fix inconsistency namings

In [3]:
campaign_df.rename(columns={'campaignid': 'campaign_id'}, inplace=True)

##### Merge the 3 dataframes based on campaign_ig and game_key

In [4]:
# Merge the dataframes based on common columns
merged_df = pd.merge(campaign_df, creative_df, on='campaign_id', how='left')
merged_df = pd.merge(merged_df, inventory_df, on=['campaign_id', 'game_key'], how='left')

merged_df.head()

Unnamed: 0,campaign_id,budget_amount,budget_currencycode,KPI,pricing_model,geo_targeting,vertical,targeting,game_key,adformat,...,renderingcontext,osfamily,devicetype,devicemake,site_name,matchedfoldposition,browser,impression,engagement,click
0,f4p5a01,277.12,USD,Engagement,CPE,,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,b4f31c796c130fd594d253ba9336a01d/7ae9931edb2ed...,320x480,...,MobileOptimizedWeb,6.0,3.0,Samsung,www.yahoo.com,4.0,Chrome,57.0,0.0,0.0
1,f4p5a01,277.12,USD,Engagement,CPE,,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,b4f31c796c130fd594d253ba9336a01d/7ae9931edb2ed...,320x480,...,MobileOptimizedWeb,5.0,4.0,Apple,www.yahoo.com,4.0,Mobile Safari,983.0,265.0,71.0
2,f4p5a01,277.12,USD,Engagement,CPE,,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,b4f31c796c130fd594d253ba9336a01d/7ae9931edb2ed...,320x480,...,MobileOptimizedWeb,6.0,4.0,Samsung,www.yahoo.com,4.0,Chrome,9.0,0.0,0.0
3,f4p5a01,277.12,USD,Engagement,CPE,,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,b4f31c796c130fd594d253ba9336a01d/7ae9931edb2ed...,320x480,...,MobileOptimizedWeb,5.0,4.0,Apple,www.the-sun.com,4.0,Mobile Safari UI/WKWebView,3.0,0.0,0.0
4,f4p5a01,277.12,USD,Engagement,CPE,,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,b4f31c796c130fd594d253ba9336a01d/7ae9931edb2ed...,320x480,...,MobileOptimizedWeb,6.0,4.0,Samsung,currently.att.yahoo.com,4.0,Chrome Mobile,27.0,3.0,0.0


#### Remove rows that have a campaign but no inventory

In [5]:
merged_df = merged_df[~merged_df['campaign_id'].isin(campaign_df[~campaign_df['campaign_id'].isin(inventory_df['campaign_id'])]['campaign_id'])]

#### Compute click through rate

In [6]:
merged_df['CTR'] = np.where(merged_df['impression'] != 0,
                            (merged_df['click'] / merged_df['impression']) * 100,
                            0)

#### Mapping of number to value for 3 colums

In [7]:
os_family_mapping = { 1: "Other", 2: "Windows",  3: "OS X",  4: "Linux",  5: "iOS", 6: "Android",  7: "Windows Phone"}
device_type_mapping = { 1: "Other",2: "PC", 3: "Tablet", 4: "Mobile", 5: "Roku", 6: "ConnectedTV", 7: "OutOfHome", 8: "HomeAssistant"}
foldposition_mapping = {1: "Any", 2: "Above", 3: "Below", 4: "Unknown"}

merged_df["osfamily"] = merged_df["osfamily"].map(os_family_mapping)
merged_df["devicetype"] = merged_df["devicetype"].map(device_type_mapping)
merged_df["matchedfoldposition"] = merged_df["matchedfoldposition"].map( foldposition_mapping)

#### Remove Uncessary columns

In [8]:
merged_df.columns

Index(['campaign_id', 'budget_amount', 'budget_currencycode', 'KPI',
       'pricing_model', 'geo_targeting', 'vertical', 'targeting', 'game_key',
       'adformat', 'creative', 'renderingcontext', 'osfamily', 'devicetype',
       'devicemake', 'site_name', 'matchedfoldposition', 'browser',
       'impression', 'engagement', 'click', 'CTR'],
      dtype='object')

In [9]:
merged_df = merged_df.drop(columns=['campaign_id', "budget_amount", "budget_currencycode", "KPI", "pricing_model", "game_key", "impression", "engagement", "click"])

In [10]:
merged_df.columns

Index(['geo_targeting', 'vertical', 'targeting', 'adformat', 'creative',
       'renderingcontext', 'osfamily', 'devicetype', 'devicemake', 'site_name',
       'matchedfoldposition', 'browser', 'CTR'],
      dtype='object')

#### Handling Missing Data 

In [11]:
# Calculate the percentage of missing values in each column
missing_percentage = merged_df.isnull().sum() / len(merged_df) * 100

# Print the percentage of missing values for each column
print(missing_percentage)

geo_targeting          91.161239
vertical                0.000000
targeting              81.240472
adformat                0.000000
creative                2.900846
renderingcontext        0.000000
osfamily                0.003899
devicetype              0.000101
devicemake              1.916644
site_name               0.000000
matchedfoldposition     0.000067
browser                 0.091890
CTR                     0.000000
dtype: float64


In [12]:
merged_df['geo_targeting'].fillna(value="Unknown", inplace=True)
merged_df['targeting'].fillna(value="Unknown", inplace=True)
merged_df['creative'].fillna(value="Unknown", inplace=True)

merged_df['browser'].fillna(value=merged_df['browser'].mode()[0], inplace=True)
merged_df['devicemake'].fillna(value=merged_df['devicemake'].mode()[0], inplace=True)
merged_df['devicetype'].fillna(value=merged_df['devicetype'].mode()[0], inplace=True)
merged_df['matchedfoldposition'].fillna(value=merged_df['matchedfoldposition'].mode()[0], inplace=True)
merged_df['osfamily'].fillna(value=merged_df['osfamily'].mode()[0], inplace=True)

In [13]:
sample_df = merged_df.iloc[:, :]

In [58]:
sample_df.shape

(2975304, 13)

#### Model Training

In [59]:
# Optional: Encode categorical variables if they're not numerically encoded
# It's important for algorithms like XGBoost that expect numerical input
encoder = OrdinalEncoder(cols=['geo_targeting', 'vertical', 'targeting', 'adformat', 'creative',
                               'renderingcontext', 'osfamily', 'devicetype', 'devicemake', 
                               'site_name', 'matchedfoldposition', 'browser'])
merged_df_encoded = encoder.fit_transform(sample_df)



In [60]:
# Split data into features and target
X = merged_df_encoded.drop('CTR', axis=1)
y = merged_df_encoded['CTR']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [61]:
# Initialize XGBoost regressor
# Note: You might need to tweak hyperparameters based on your specific dataset characteristics
model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 100)

In [62]:
# Train the model
model.fit(X_train, y_train)

In [63]:
# Predict CTR for the test set
predictions = model.predict(X_test)

In [64]:
# Evaluate the model
rmse = mean_squared_error(y_test, predictions, squared=False)  # Set squared=False for RMSE
print("RMSE: %f" % (rmse))

RMSE: 15.366780


#### Hello

In [73]:
merged_df.columns

Index(['geo_targeting', 'vertical', 'targeting', 'adformat', 'creative',
       'renderingcontext', 'osfamily', 'devicetype', 'devicemake', 'site_name',
       'matchedfoldposition', 'browser', 'CTR'],
      dtype='object')

In [74]:
merged_df.dtypes

geo_targeting           object
vertical                object
targeting               object
adformat                object
creative                object
renderingcontext        object
osfamily                object
devicetype              object
devicemake              object
site_name               object
matchedfoldposition     object
browser                 object
CTR                    float64
dtype: object

In [65]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming merged_df is your DataFrame

# Define categorical columns
categorical_columns = ['matchedfoldposition', 'devicetype', 'osfamily']

# Map categorical columns to their categories
categories = {
    'matchedfoldposition': ['Any', 'Above', 'Below', 'Unknown'],
    'devicetype': ['Other', 'PC', 'Tablet', 'Mobile', 'Roku', 'ConnectedTV', 'OutOfHome', 'HomeAssistant'],
    'osfamily': ['Other', 'Windows', 'OS X', 'Linux', 'iOS', 'Android', 'Windows Phone']
}

# Preprocessing for categorical data
categorical_preprocessor = OneHotEncoder(categories=[categories[col] for col in categorical_columns])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_preprocessor, categorical_columns)
    ], remainder='passthrough')

In [70]:
# Define the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[
                      ('model', model)])

In [71]:
# Separate target from predictors
X = merged_df.drop('CTR', axis=1)
y = merged_df['CTR']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Preprocessing of training data, fit model 
clf.fit(X_train, y_train)

In [32]:
sample_df = merged_df.iloc[:, :]

In [38]:
sample_df.iloc[:1,:1].dtypes

geo_targeting    object
dtype: object

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Assuming 'merged_df' is your DataFrame and 'CTR' is your target

# Splitting features and target
X = sample_df.drop(['CTR'], axis=1)
y = sample_df['CTR']

# Identifying categorical columns (excluding target variable 'CTR')
categorical_cols = ["matchedfoldposition", "devicetype", "osfamily"]

# No explicit numerical features to preprocess in this dataset except the target
# If there were numerical features, they would be listed here

# Preprocessor for categorical data with OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])


In [18]:
# Defining the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the model
model.fit(X_train, y_train)

# Predicting CTR
predictions = model.predict(X_test)

# Calculating the Root Mean Square Error (RMSE)
rmse = mean_squared_error(y_test, predictions, squared=False)
print(f"RMSE: {rmse}")


RMSE: 15.923724654407854


In [19]:
from sklearn.ensemble import GradientBoostingRegressor

# Updating the model in the pipeline to GradientBoostingRegressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=42))
])

# Training the model
model.fit(X_train, y_train)

# Predicting CTR
predictions = model.predict(X_test)

# Calculating the RMSE
rmse = mean_squared_error(y_test, predictions, squared=False)
print(f"RMSE: {rmse}")

RMSE: 15.923745907743601


In [20]:
from xgboost import XGBRegressor

# Updating the model in the pipeline to XGBRegressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, random_state=42))
])

# Training the model
model.fit(X_train, y_train)

# Predicting CTR
predictions = model.predict(X_test)

# Calculating the RMSE
rmse = mean_squared_error(y_test, predictions, squared=False)
print(f"RMSE: {rmse}")

RMSE: 15.923730551651596


In [22]:
from lightgbm import LGBMRegressor

# Define a new model pipeline with LightGBM
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=100, random_state=42))
])

# Splitting the dataset (ensure X and y are already defined as per your dataset)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the model
model.fit(X_train, y_train)

# Predicting and evaluating
predictions = model.predict(X_test)
rmse = mean_squared_error(y_test, predictions, squared=False)
print(f"RMSE: {rmse}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.082810 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28
[LightGBM] [Info] Number of data points in the train set: 2380243, number of used features: 14
[LightGBM] [Info] Start training from score 3.819118
RMSE: 15.92369691961791


In [48]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import numpy as np

# Assuming 'categorical_cols' contains the names of your categorical columns
# And 'X' and 'y' have been defined

# Setup the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ], remainder='passthrough')

# Apply preprocessing to create a transformed feature matrix
X_transformed = preprocessor.fit_transform(X)

# Since we've now transformed X, we need to split again after preprocessing
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Now, build, compile, and fit your model as before, using the transformed X_train


In [49]:
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler

# Data preprocessing
# Standardize the numerical features (if any exist beyond 'CTR') and encode categorical features
# For simplicity, let's continue with the preprocessed X_train, X_test, y_train, y_test you already have

# Define the neural network model
def build_model(input_shape):
    model = tf.keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=[input_shape]),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)  # Output layer for regression; no activation function
    ])
    
    optimizer = tf.keras.optimizers.RMSprop(0.001)

    model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mae', 'mse'])
    return model

# Get the number of features in the input
input_shape = X_train.shape[1]

# Build the model
model = build_model(input_shape)

# Print the model summary
model.summary()

# Train the model
EPOCHS = 100

history = model.fit(
    X_train, y_train,
    epochs=EPOCHS, validation_split = 0.2, verbose=0)

# Predicting and evaluating
predictions = model.predict(X_test).flatten()

# Calculating the RMSE
rmse = mean_squared_error(y_test, predictions, squared=False)
print(f"RMSE: {rmse}")


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 64)                1536      
                                                                 
 dense_10 (Dense)            (None, 64)                4160      
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 5761 (22.50 KB)


Trainable params: 5761 (22.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [50]:
[print(i.shape, i.dtype) for i in model.inputs]
[print(o.shape, o.dtype) for o in model.outputs]
[print(l.name, l.input_shape, l.dtype) for l in model.layers]

(None, 23) <dtype: 'float32'>
(None, 1) <dtype: 'float32'>
dense_9 (None, 23) float32
dense_10 (None, 64) float32
dense_11 (None, 64) float32


[None, None, None]

In [58]:
X = np.asarray(X).astype(np.float32)

ValueError: could not convert string to float: 'Unknown'

In [59]:
# X = X.drop(columns=["adformat"])
X.head()

Unnamed: 0,geo_targeting,vertical,targeting,creative,renderingcontext,osfamily,devicetype,devicemake,site_name,matchedfoldposition,browser
0,Unknown,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,Sensory Video,MobileOptimizedWeb,Android,Tablet,Samsung,www.yahoo.com,Unknown,Chrome
1,Unknown,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,Sensory Video,MobileOptimizedWeb,iOS,Mobile,Apple,www.yahoo.com,Unknown,Mobile Safari
2,Unknown,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,Sensory Video,MobileOptimizedWeb,Android,Mobile,Samsung,www.yahoo.com,Unknown,Chrome
3,Unknown,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,Sensory Video,MobileOptimizedWeb,iOS,Mobile,Apple,www.the-sun.com,Unknown,Mobile Safari UI/WKWebView
4,Unknown,Sports,Opinion Leaders| Investors| Policy Makers| C&I...,Sensory Video,MobileOptimizedWeb,Android,Mobile,Samsung,currently.att.yahoo.com,Unknown,Chrome Mobile
