# COMP0124: Multi-Agent Artificial Intelligence

# Group project: Real-time bidding auctions

**Group #7: Oliviero Balbinetti, Mauricio Caballero, Paul Melkert**

Importing libraries.

In [1]:
import os
import operator
import numpy as np
import pandas as pd
import collections as coll

from glob import glob

Defining functions.

In [2]:
#Feature map file.
def Feature_map_gen(Data, Features, Column, Maps, Special_maps, Index):
    
    #Indexing columns for outliers.
    for col_idx in range(0,len(Data.columns)):
        Column[Data.columns[col_idx]] = col_idx
        
        if col_idx > 0:
            Features[str(col_idx) + ':other'] = Index
            Index += 1
            
    #Regular maps.
    for col_idx in Maps:
        content = list(set(Data[col_idx]))
        
        for item in content:
            feature = str(Column[col_idx]) + ':' + str(item)
            Features[feature] = Index
            Index += 1
            
    #Special maps.
    for col_idx in Special_maps:
        
        if col_idx == 'useragent':
            content = list(set(Data[col_idx]))
            
            for item in content:
                feature = str(Column[col_idx]) + ':' + str(item)
                Features[feature] = Index
                Index += 1
                
        if col_idx == 'slotprice':
            content = list(set(Data[col_idx]))
            
            for item in content:
                if item > 100: value = '101+'
                elif item > 50: value = '51-100'
                elif item > 10: value = '11-50'
                elif item > 0: value = '1-10'
                else: value = '0'
                
                feature = str(Column[col_idx]) + ':' + value
                Features[feature] = Index
                Index += 1
                
    #User tags.
    Temp = [item for sublist in Data['usertag list'] for item in sublist]
    Tags = list(set(Temp))
    
    for tag in Tags:
        feature = str(Column['usertag']) + ':' + tag
        Features[feature] = Index
        Index += 1
        
    print('Feature vectors size: %d' %Index)
    Outcome = sorted(Features.items(), key=operator.itemgetter(1))
    return Outcome
    
#Logistic regression files.
def Logistic_regression_gen(Data, Features, Column, Maps, Special_maps):
    Outcome = Data['bidid'] + ',' + Data['click'].map(str) + ','\
              + Data['payprice'].map(str) + ',' + '0'
    
    #Regular maps.
    for col_idx in Maps:
        List = []
        idx = Column[col_idx]
        
        y = str(idx) + ':' + 'other'
        content = str(idx) + ':' + Data[col_idx].map(str)
        for x in content:
            try: 
                List.append(Features[x]) 
            except: 
                List.append(Features[y])
        
        values = pd.Series(List)
        Outcome += ' ' + values.map(str)
        
    #Special maps.
    for col_idx in Maps:
        List = []
        idx = Column[col_idx]
        
        if col_idx == 'useragent':
            y = str(idx) + ':' + 'other'
            content = str(idx) + ':' + Data[col_idx].map(str)
            for x in content:
                try: 
                    List.append(Features[x]) 
                except: 
                    List.append(Features[y])
        
            values = pd.Series(List)
            Outcome += ' ' + values.map(str)
            
        if col_idx == 'slotprice':
            List = []
            Temp = []
            Array = Data[col_idx].values
            
            for item in Array:
                if item > 100: Temp.append('101+')
                elif item > 50: Temp.append('51-100')
                elif item > 10: Temp.append('11-50')
                elif item > 0: Temp.append('1-10')
                else: Temp.append('0')
                
            y = str(idx) + ':' + 'other'
            content = str(idx) + ':' + pd.Series(Temp)
            for x in content:
                try: 
                    List.append(Features[x]) 
                except: 
                    List.append(Features[y])
            
            values = pd.Series(List)
            Outcome += ' ' + values.map(str)
            
    #User tags.
    mapped = []
    idx = Column['usertag']
    Tags = list(Data['usertag list'].values)
    
    for sublist in Tags:
        List = []
        y = str(idx) + ':' + 'other'
        content = str(idx) + ':' + pd.Series(sublist)
        for x in content:
            try: 
                List.append(Features[x]) 
            except: 
                List.append(Features[y])
        
        values = pd.Series(List)
        joined = list(' ' + values.map(str))
        mapped.append(''.join(joined))
        
    Outcome += pd.Series(mapped).map(str)
    return Outcome

Importing data in pandas DataFrames.

In [3]:
#Defining directory.
Path = '/Users/olivierobalbinetti/Desktop/University College London/Term 2'\
       '/Multi Agents Artificial Intelligence/Courseworks/Group coursework/'\
       'Data/Original'

#Importing data.
Datasets = {}
os.chdir(Path)

for Filename in glob('*.csv'):
    Datasets[Filename[:-4].title()] = pd.read_csv(Filename, sep = ',')
    
#Defining variables.
Test = Datasets['Test']
Train = Datasets['Train']
Validation = Datasets['Validation']

#Feature engineering: ad exchange.
Test['adexchange'] = Test['adexchange'].fillna('nan')
Train['adexchange'] = Train['adexchange'].fillna('nan')
Validation['adexchange'] = Validation['adexchange'].fillna('nan')

#Feature engineering: slot size.
Test['slotsize'] = Test['slotwidth'].astype('str') + '*' +\
                   Test['slotheight'].astype('str')
Train['slotsize'] = Train['slotwidth'].astype('str') + '*' +\
                    Train['slotheight'].astype('str')
Validation['slotsize'] = Validation['slotwidth'].astype('str') + '*' +\
                         Validation['slotheight'].astype('str')

#Feature engineering: browser user agent.
Temp = Train['useragent'].str.split('_', n=1, expand=True)
Train['OS'] = Temp[0]
Train['Browser'] = Temp[1]

Temp = Test['useragent'].str.split('_', n=1, expand=True)
Test['OS'] = Temp[0]
Test['Browser'] = Temp[1]

Temp = Validation['useragent'].str.split('_', n=1, expand=True)
Validation['OS'] = Temp[0]
Validation['Browser'] = Temp[1]

#Feature engineering: user tags.
Train['usertag'] = Train['usertag'].fillna('nan')                      #Train.
Train['usertag list'] = Train['usertag'].str.split(',').values
Temp = list(Train['usertag list'].values)
Tags = [item for sublist in Temp for item in sublist]

Dict = dict(coll.Counter(Tags).most_common())
Dict['nan'] = max(Dict.values())+1
Dict = dict(sorted(Dict.items(), key=lambda kv: kv[1], reverse=True))
Dict = dict(enumerate(Dict.keys()))

Dict = {value:key for key,value in Dict.items()}
Dict['nan'] = 'nan'
Train = Train.assign(mapped=[[Dict[k] for k in row if Dict.get(k)]
                             for row in Train['usertag list'].values])


Test['usertag'] = Test['usertag'].fillna('nan')                         #Test.
Test['usertag list'] = Test['usertag'].str.split(',').values
Temp = list(Test['usertag list'].values)
Tags = [item for sublist in Temp for item in sublist]

Dict = dict(coll.Counter(Tags).most_common())
Dict['nan'] = max(Dict.values())+1
Dict = dict(sorted(Dict.items(), key=lambda kv: kv[1], reverse=True))
Dict = dict(enumerate(Dict.keys()))

Dict = {value:key for key,value in Dict.items()}
Dict['nan'] = 'nan'
Test = Test.assign(mapped=[[Dict[k] for k in row if Dict.get(k)]
                           for row in Test['usertag list'].values])


Validation['usertag'] = Validation['usertag'].fillna('nan')       #Validation.
Validation['usertag list'] = Validation['usertag'].str.split(',').values
Temp = list(Validation['usertag list'].values)
Tags = [item for sublist in Temp for item in sublist]

Dict = dict(coll.Counter(Tags).most_common())
Dict['nan'] = max(Dict.values())+1
Dict = dict(sorted(Dict.items(), key=lambda kv: kv[1], reverse=True))
Dict = dict(enumerate(Dict.keys()))

Dict = {value:key for key,value in Dict.items()}
Dict['nan'] = 'nan'
Validation = Validation.assign(mapped=[[Dict[k] for k in row if Dict.get(k)]
                                       for row in Validation['usertag list'].values])

#Feature engineering: renaming.
Test = Test.rename(columns={'mapped':'tagcodes'})
Train = Train.rename(columns={'mapped':'tagcodes'})
Validation = Validation.rename(columns={'mapped':'tagcodes'})

Feature engineering files.

In [4]:
#Feature engineering files.
print('Feature engineering for logistic regression CTR estimation:\n')
Path = '/Users/olivierobalbinetti/Desktop/University College London/Term 2/'\
       'Multi Agents Artificial Intelligence/Courseworks/Group coursework/'\
       'Data/Feature engineering'

#Defining general variables.
Index = 0
Column = {}
Features = {}

Maps = ['weekday', 'hour', 'IP', 'region', 'city', 'adexchange', 'domain',
        'slotid', 'slotwidth', 'slotheight', 'slotvisibility', 'slotformat',
        'creative', 'advertiser']
Special_maps = ['useragent', 'slotprice']

#Saving map file.
os.chdir(Path)
Features['truncate'] = Index
Header = 'Column:Value,Mapindex'

with open('Feature_map.csv', 'w') as file:
    file.write(Header + '\n')
    Feature_values = Feature_map_gen(Train, Features, Column, Maps,
                                     Special_maps, Index+1)
    
    for item in Feature_values:
        file.write(item[0] + ',' + str(item[1]) + '\n')
        
    print('• [File %s]: Process completed!\n' %('Feature_map.csv'))
    
#Saving logistic regression files.
Header = 'bidid,click,payprice,feature'

for key in ['Train', 'Validation']:
    Filename = key + '.csv'
    
    with open(Filename, 'w') as file:
        file.write(Header + '\n')
        String_rows = Logistic_regression_gen(Datasets[key], Features, Column,
                                              Maps, Special_maps)
        
        for item in String_rows:
            file.write(item + '\n')
        
    print('• [File %s]: Process completed!' %Filename)

Feature engineering for logistic regression CTR estimation:

Feature vectors size: 580325
• [File Feature_map.csv]: Process completed!

• [File Train.csv]: Process completed!
• [File Validation.csv]: Process completed!
