In [3]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split

In [4]:


# List of days to merge (06 to 10 for training)
days = ["06", "07", "08", "09", "10"]

# Initialize empty DataFrame for impressions
df_imp = pd.DataFrame()

# Load & merge impression logs from multiple days
for day in days:
    temp_imp = pd.read_csv(f"/Users/adityarajtandon/Documents/RTB_MODEL/dataset/imp.{day}.txt", sep="\t", header=None)
    temp_imp.columns = ["BidID", "Timestamp", "Logtype", "VisitorID", "UserAgent", "IP",
                        "Region", "City", "AdExchange", "Domain", "URL", "AnonURLID",
                        "AdslotID", "AdslotWidth", "AdslotHeight", "AdslotVisibility",
                        "AdslotFormat", "AdslotFloorPrice", "CreativeID", "BiddingPrice",
                        "PayingPrice", "KeypageURL", "AdvertiserID", "ExtraColumn"]
    
    df_imp = pd.concat([df_imp, temp_imp], ignore_index=True)

print(f"Total Impressions (Merged 06-10): {len(df_imp)}")


Total Impressions (Merged 06-10): 8834027


In [6]:
# Initialize empty DataFrame for clicks
df_clk = pd.DataFrame()

# Load & merge click logs
for day in days:
    temp_clk = pd.read_csv(f"/Users/adityarajtandon/Documents/RTB_MODEL/dataset/clk.{day}.txt", sep="\t", header=None)[[0]]
    temp_clk.columns = ["BidID"]
    temp_clk["Clicked"] = 1  # Label clicks as 1
    df_clk = pd.concat([df_clk, temp_clk], ignore_index=True)

print(f"Total Clicks (Merged 06-10): {len(df_clk)}")


Total Clicks (Merged 06-10): 6778


In [7]:
# Initialize empty DataFrame for conversions
df_conv = pd.DataFrame()

# Load & merge conversion logs
for day in days:
    temp_conv = pd.read_csv(f"/Users/adityarajtandon/Documents/RTB_MODEL/dataset/conv.{day}.txt", sep="\t", header=None)[[0]]
    temp_conv.columns = ["BidID"]
    temp_conv["Converted"] = 1  # Label conversions as 1
    df_conv = pd.concat([df_conv, temp_conv], ignore_index=True)

print(f"Total Conversions (Merged 06-10): {len(df_conv)}")


Total Conversions (Merged 06-10): 327


In [8]:
# Merge Clicks & Conversions with Impressions
df_imp = df_imp.merge(df_clk, on="BidID", how="left").fillna(0)
df_imp = df_imp.merge(df_conv, on="BidID", how="left").fillna(0)

# Final dataset summary
print(f"Total Impressions: {len(df_imp)}")
print(f"Total Clicks: {df_imp['Clicked'].sum()}")
print(f"Total Conversions: {df_imp['Converted'].sum()}")


Total Impressions: 8834903
Total Clicks: 6945.0
Total Conversions: 493.0


In [9]:
df_imp.head()

Unnamed: 0,BidID,Timestamp,Logtype,VisitorID,UserAgent,IP,Region,City,AdExchange,Domain,...,AdslotFormat,AdslotFloorPrice,CreativeID,BiddingPrice,PayingPrice,KeypageURL,AdvertiserID,ExtraColumn,Clicked,Converted
0,8a15b98c8f9e60d4f92aaab01acf52a4,20130606000104192,1,VhTVORqG36N6qMj,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1...,114.100.37.*,106,117,1,lsxSl559Xql7FmMs,...,1,0,23d6dade7ed21cea308205b37594003e,227,207,b2e35064f3549d447edbbdfb1f707c8c,3427,"10063,10684,10083,13403,10059,10024,10048,1005...",0.0,0.0
1,5bd0cbeb2f82fb94e56b7dc2e6b77ec,20130606000104252,1,Vhkr1uaROqKsXmb,Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ...,222.220.35.*,308,320,2,eSMvBpa0jqmUagk4JKTI,...,0,5,13606a7c541dcd9ca1948875a760bb31,238,72,d29e59bf0f7f8243858b8183f14d4412,3358,1380010024,0.0,0.0
2,faf17eac9cabf1be598f4e75f40d501d,20130606000104253,1,VhL01pk8OTkW3Mc,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1...,58.100.240.*,94,95,1,tK1NTu1YP5scFsf,...,1,0,d5cecca9a6cbd7a0a48110f1306b26d1,227,108,d29e59bf0f7f8243858b8183f14d4412,3358,10059138661006310111,0.0,0.0
3,234870d3864ad1852fe04b172f340be3,20130606000104308,1,VhT3La5uDlaywOj,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1...,122.233.40.*,94,95,1,trqRTummPvas1m58uG,...,1,0,44966cc8da1ed40c95d59e863c8c75f0,300,81,361e128affece850342293213691a043,3386,100571006310024138001386610110,0.0,0.0
4,20fb85784972aebed3dda8d20ca87afb,20130606000104329,1,VhkSPnNDP8L8eYl,Opera/9.80 (Android; Opera Mini/7.7.33548/29.3...,58.67.157.*,216,217,1,trqRTuS8jZL7FmMs,...,5,0,d881a6c788e76c2c27ed1ef04f119544,227,89,d29e59bf0f7f8243858b8183f14d4412,3358,1386610111,0.0,0.0


In [10]:


# List of days to merge (06 to 10 for training)
days = ["11","12"]

# Initialize empty DataFrame for impressions
df_val = pd.DataFrame()

# Load & merge impression logs from multiple days
for day in days:
    temp_val = pd.read_csv(f"/Users/adityarajtandon/Documents/RTB_MODEL/dataset/imp.{day}.txt", sep="\t", header=None)
    temp_val.columns = ["BidID", "Timestamp", "Logtype", "VisitorID", "UserAgent", "IP",
                        "Region", "City", "AdExchange", "Domain", "URL", "AnonURLID",
                        "AdslotID", "AdslotWidth", "AdslotHeight", "AdslotVisibility",
                        "AdslotFormat", "AdslotFloorPrice", "CreativeID", "BiddingPrice",
                        "PayingPrice", "KeypageURL", "AdvertiserID", "ExtraColumn"]
    
    df_val = pd.concat([df_val, temp_val], ignore_index=True)

print(f"Total Impressions (Merged 11-12): {len(df_val)}")


Total Impressions (Merged 11-12): 3403060


In [11]:
# Initialize empty DataFrame for clicks
df_clk = pd.DataFrame()

# Load & merge click logs
for day in days:
    temp_clk = pd.read_csv(f"/Users/adityarajtandon/Documents/RTB_MODEL/dataset/clk.{day}.txt", sep="\t", header=None)[[0]]
    temp_clk.columns = ["BidID"]
    temp_clk["Clicked"] = 1  # Label clicks as 1
    df_clk = pd.concat([df_clk, temp_clk], ignore_index=True)

print(f"Total Clicks (Merged 11-12): {len(df_clk)}")

Total Clicks (Merged 11-12): 3200


In [12]:
# Initialize empty DataFrame for conversions
df_conv = pd.DataFrame()

# Load & merge conversion logs
for day in days:
    temp_conv = pd.read_csv(f"/Users/adityarajtandon/Documents/RTB_MODEL/dataset/conv.{day}.txt", sep="\t", header=None)[[0]]
    temp_conv.columns = ["BidID"]
    temp_conv["Converted"] = 1  # Label conversions as 1
    df_conv = pd.concat([df_conv, temp_conv], ignore_index=True)

print(f"Total Conversions (Merged 11-12): {len(df_conv)}")


Total Conversions (Merged 11-12): 167


In [13]:
# Merge Clicks & Conversions with Impressions
df_val = df_val.merge(df_clk, on="BidID", how="left").fillna(0)
df_val = df_val.merge(df_conv, on="BidID", how="left").fillna(0)

# Final dataset summary
print(f"Total Impressions: {len(df_val)}")
print(f"Total Clicks: {df_val['Clicked'].sum()}")
print(f"Total Conversions: {df_val['Converted'].sum()}")


Total Impressions: 3403564
Total Clicks: 3265.0
Total Conversions: 248.0


In [14]:
# Finalize Features
features = ["AdslotWidth", "AdslotHeight", "Region", "City", "AdExchange", "AdslotVisibility", "AdslotFormat", "AdslotFloorPrice"]

X_train = df_imp[features]
y_ctr_train = df_imp["Clicked"]
y_cvr_train = df_imp[df_imp["Clicked"] == 1]["Converted"]

# Validation Data
X_val = df_val[features]
y_ctr_val = df_val["Clicked"]
y_cvr_val = df_val[df_val["Clicked"] == 1]["Converted"]

# Handle missing values for market price prediction and 
y_market_train = df_imp["PayingPrice"].fillna(df_imp["PayingPrice"].mean())
y_market_val = df_val["PayingPrice"].fillna(df_val["PayingPrice"].mean())


In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

def build_ctr_cvr_model():
    """Define CTR and CVR prediction models."""
    model = Sequential([
        Dense(128, activation="relu", input_shape=(X_train.shape[1],)),
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid")  # Probabilistic output
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss="binary_crossentropy", metrics=["accuracy"])
    return model


In [16]:
def build_market_model():
    """Define market price prediction model."""
    model = Sequential([
        Dense(128, activation="relu", input_shape=(X_train.shape[1],)),
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1, activation="linear")  # Regression output
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss="mean_squared_error")
    return model


In [17]:
#make a temp dataframe to store df_imp
df_temp = df_imp.copy()

In [43]:
# Load region and city mappings
region_mapping = pd.read_csv("/Users/adityarajtandon/Documents/RTB_MODEL/dataset/region.txt", sep="\t", header=None, names=["RegionCode", "RegionName"])
city_mapping = pd.read_csv("/Users/adityarajtandon/Documents/RTB_MODEL/dataset/city.txt", sep="\t", header=None, names=["CityCode", "CityName"])

# Create dictionaries for mapping
region_dict = dict(zip(region_mapping["RegionCode"], region_mapping["RegionName"]))
city_dict = dict(zip(city_mapping["CityCode"], city_mapping["CityName"]))
# Print a preview to verify mappings
print(region_dict)
print(city_dict)
# Replace codes with names in the original columns
df_imp["Region"] = df_imp["Region"].map(region_dict)
df_imp["City"] = df_imp["City"].map(city_dict)

# Print a preview to verify changes



{0: 'unknown', 1: 'beijing', 2: 'tianjin', 3: 'hebei', 15: 'shanxi', 27: 'neimenggu', 40: 'liaoning', 55: 'jilin', 65: 'heilongjiang', 79: 'shanghai', 80: 'jiangsu', 94: 'zhejiang', 106: 'anhui', 124: 'fujian', 134: 'jiangxi', 146: 'shandong', 164: 'henan', 183: 'hubei', 201: 'hunan', 216: 'guangdong', 238: 'guangxi', 253: 'hainan', 275: 'chongqing', 276: 'sichuan', 298: 'guizhou', 308: 'yunnan', 325: 'xizang', 333: 'shannxi', 344: 'gansu', 359: 'qinghai', 368: 'ningxia', 374: 'xinjiang', 393: 'taiwan', 394: 'xianggang', 395: 'aomen'}
{0: 'unknown', 4: 'shijiazhuang', 5: 'tangshan', 6: 'qinhuangdao', 7: 'handan', 8: 'xingtai', 9: 'baoding', 10: 'zhangjiakou', 11: 'chengde', 12: 'cangzhou', 13: 'langfang', 14: 'hengshui', 16: 'taiyuan', 17: 'datong', 18: 'yangquan', 19: 'changzhi', 20: 'jincheng', 21: 'shuozhou', 22: 'jinzhongshi', 23: 'yuncheng', 24: 'xinzhou', 25: 'linfen', 26: 'lvliang', 28: 'huhehaote', 29: 'baotou', 30: 'wuhai', 31: 'chifeng', 32: 'tongliao', 33: 'eerduosi', 34: 'h

In [44]:
print(region_dict)
print(city_dict)

{0: 'unknown', 1: 'beijing', 2: 'tianjin', 3: 'hebei', 15: 'shanxi', 27: 'neimenggu', 40: 'liaoning', 55: 'jilin', 65: 'heilongjiang', 79: 'shanghai', 80: 'jiangsu', 94: 'zhejiang', 106: 'anhui', 124: 'fujian', 134: 'jiangxi', 146: 'shandong', 164: 'henan', 183: 'hubei', 201: 'hunan', 216: 'guangdong', 238: 'guangxi', 253: 'hainan', 275: 'chongqing', 276: 'sichuan', 298: 'guizhou', 308: 'yunnan', 325: 'xizang', 333: 'shannxi', 344: 'gansu', 359: 'qinghai', 368: 'ningxia', 374: 'xinjiang', 393: 'taiwan', 394: 'xianggang', 395: 'aomen'}
{0: 'unknown', 4: 'shijiazhuang', 5: 'tangshan', 6: 'qinhuangdao', 7: 'handan', 8: 'xingtai', 9: 'baoding', 10: 'zhangjiakou', 11: 'chengde', 12: 'cangzhou', 13: 'langfang', 14: 'hengshui', 16: 'taiyuan', 17: 'datong', 18: 'yangquan', 19: 'changzhi', 20: 'jincheng', 21: 'shuozhou', 22: 'jinzhongshi', 23: 'yuncheng', 24: 'xinzhou', 25: 'linfen', 26: 'lvliang', 28: 'huhehaote', 29: 'baotou', 30: 'wuhai', 31: 'chifeng', 32: 'tongliao', 33: 'eerduosi', 34: 'h

In [47]:
# Load region and city mappings
region_mapping = pd.read_csv("/Users/adityarajtandon/Documents/RTB_MODEL/dataset/region.txt", sep="\t", header=None, names=["RegionCode", "RegionName"])
city_mapping = pd.read_csv("/Users/adityarajtandon/Documents/RTB_MODEL/dataset/city.txt", sep="\t", header=None, names=["CityCode", "CityName"])

# Create dictionaries for mapping
region_dict = dict(zip(region_mapping["RegionCode"], region_mapping["RegionName"]))
city_dict = dict(zip(city_mapping["CityCode"], city_mapping["CityName"]))
# Print a preview to verify mappings
print(region_dict)
print(city_dict)
# Replace codes with names in the original columns
df_val["Region"] = df_val["Region"].map(region_dict)
df_val["City"] = df_val["City"].map(city_dict)

# Print a preview to verify changes



{0: 'unknown', 1: 'beijing', 2: 'tianjin', 3: 'hebei', 15: 'shanxi', 27: 'neimenggu', 40: 'liaoning', 55: 'jilin', 65: 'heilongjiang', 79: 'shanghai', 80: 'jiangsu', 94: 'zhejiang', 106: 'anhui', 124: 'fujian', 134: 'jiangxi', 146: 'shandong', 164: 'henan', 183: 'hubei', 201: 'hunan', 216: 'guangdong', 238: 'guangxi', 253: 'hainan', 275: 'chongqing', 276: 'sichuan', 298: 'guizhou', 308: 'yunnan', 325: 'xizang', 333: 'shannxi', 344: 'gansu', 359: 'qinghai', 368: 'ningxia', 374: 'xinjiang', 393: 'taiwan', 394: 'xianggang', 395: 'aomen'}
{0: 'unknown', 4: 'shijiazhuang', 5: 'tangshan', 6: 'qinhuangdao', 7: 'handan', 8: 'xingtai', 9: 'baoding', 10: 'zhangjiakou', 11: 'chengde', 12: 'cangzhou', 13: 'langfang', 14: 'hengshui', 16: 'taiyuan', 17: 'datong', 18: 'yangquan', 19: 'changzhi', 20: 'jincheng', 21: 'shuozhou', 22: 'jinzhongshi', 23: 'yuncheng', 24: 'xinzhou', 25: 'linfen', 26: 'lvliang', 28: 'huhehaote', 29: 'baotou', 30: 'wuhai', 31: 'chifeng', 32: 'tongliao', 33: 'eerduosi', 34: 'h

In [48]:
#print city and region of df_val
print(df_val["City"])
print(df_val["Region"])

0             daqing
1            baoding
2            unknown
3          guangzhou
4            jingmen
             ...    
3403559      unknown
3403560        wuhan
3403561    guangzhou
3403562      kunming
3403563     quanzhou
Name: City, Length: 3403564, dtype: object
0          heilongjiang
1                 hebei
2               unknown
3             guangdong
4                 hubei
               ...     
3403559         unknown
3403560           hubei
3403561       guangdong
3403562          yunnan
3403563          fujian
Name: Region, Length: 3403564, dtype: object


In [50]:
#print column names of df_imp
print(df_imp.columns)

Index(['BidID', 'Timestamp', 'Logtype', 'VisitorID', 'UserAgent', 'IP',
       'Region', 'City', 'AdExchange', 'Domain', 'URL', 'AnonURLID',
       'AdslotID', 'AdslotWidth', 'AdslotHeight', 'AdslotVisibility',
       'AdslotFormat', 'AdslotFloorPrice', 'CreativeID', 'BiddingPrice',
       'PayingPrice', 'KeypageURL', 'AdvertiserID', 'ExtraColumn', 'Clicked',
       'Converted'],
      dtype='object')


In [19]:
dnn_ctr = build_ctr_cvr_model()
dnn_ctr.fit(X_train, y_ctr_train, epochs=7, batch_size=512, validation_data=(X_val, y_ctr_val))
dnn_ctr.save("/Users/adityarajtandon/Documents/RTB_MODEL/models/dnn_ctr_model.h5")




Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


  saving_api.save_model(


In [20]:
dnn_cvr = build_ctr_cvr_model()
dnn_cvr.fit(X_train[y_ctr_train == 1], y_cvr_train, epochs=7, batch_size=512, validation_data=(X_val[y_ctr_val == 1], y_cvr_val))
dnn_cvr.save("models/dnn_cvr_model.h5")




Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


  saving_api.save_model(


In [22]:
# Handle missing values for market price prediction and fill missing with 0
from sklearn.preprocessing import StandardScaler


dnn_market = build_market_model()
dnn_market.fit(X_train, y_market_train, epochs=15, batch_size=512, validation_data=(X_val, y_market_val))
dnn_market.save("/Users/adityarajtandon/Documents/RTB_MODEL/models/dnn_market_model.h5")




Epoch 1/15

KeyboardInterrupt: 