#### by Fatemeh Zahed - March 2020


# About the DATA:
### The data was generated by serving ad impressions on an ad exchange and observing which of these led to actions, such as making a purchase, by the user. 
### Positive data (i.e. impressions that led to actions) is in the "*.pos" file. 
### Negative data (i.e. impressions that did not lead to actions) is in the "*.neg" file.

# Problem Definition: 
### The task is to build a model that can generate a probability, score or ranking of impressions such that those impressions that are more likely to lead to actions have higher scores/probabilities/rankings. This would be used to determine which future ad impressions to buy and which to skip.

### Other notes:
### The data is tab separated and conforms to the header.csv file. It is randomly sampled so you should not assume anything about the order of impressions or proportions of the positive and negative examples.The "campaign_uid" field denotes which advertising campaign served the data. All your data is for a single campaign.


In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
import numpy as np

In [2]:
header = pd.read_csv('./data_classification_small/header.csv')
data_pos_original = pd.read_csv('./data_classification_small/0CH9UhFrWY.pos', delimiter='\t')
data_neg_original = pd.read_csv('./data_classification_small/0CH9UhFrWY.neg', delimiter='\t')

data_pos_original.columns = list(header.columns)
data_neg_original.columns = list(header.columns)

#drop the campaign_uid which is the same for all
data_pos = data_pos_original.drop('campaign_uid', axis = 1)
data_neg = data_neg_original.drop('campaign_uid', axis = 1)

#drop user_agent_string because it's irrelevant
data_pos = data_pos.drop('user_agent_string', axis = 1)
data_neg = data_neg.drop('user_agent_string', axis = 1)

In [3]:
data_pos.head()

Unnamed: 0,creative_uid,exchange,exchange_publisher,user_local_hour,sitename,user_gender,top_category,categories_list,abovebelow_fold
0,0RuDzY7CkR,ADMELD,AM948,12,dollardays.com,1.0,Shopping,"[""Shopping\u0004Business""]",0
1,0RuDzY7CkR,ADMELD,AM948,10,dollardays.com,0.0,Shopping,"[""Shopping\u0004Business""]",0
2,0RVWazSlec,APPNEXUS,AP127996,11,jeetyetmedia.com,,Business,"[""Business\u0004Non-SpecificUserGeneratedConte...",0
3,0RVWazSlec,OPENX,OX32427,17,ffog.net,,Business,"[""Business""]",0
4,0RVWazSlec,OPENX,OX9538,14,profilecraze.com,1.0,Technology-Computing,"[""Technology-Computing""]",0


In [4]:
#check the skewness of the data
print('pos vs neg data ratio = ', data_pos.size/data_neg.size)

pos vs neg data ratio =  1.0


In [38]:
# merge the pos and neg data
Data = pd.concat([data_pos, data_neg], axis=0)

l_pos = len(data_pos)
l_neg = len(data_neg)
y = pd.concat([pd.DataFrame(np.ones(l_pos)), pd.DataFrame(np.zeros(l_neg))], axis=0)

Data

Unnamed: 0,creative_uid,exchange,exchange_publisher,user_local_hour,sitename,user_gender,top_category,categories_list,abovebelow_fold
0,0RuDzY7CkR,ADMELD,AM948,12,dollardays.com,1.0,Shopping,"[""Shopping\u0004Business""]",0
1,0RuDzY7CkR,ADMELD,AM948,10,dollardays.com,0.0,Shopping,"[""Shopping\u0004Business""]",0
2,0RVWazSlec,APPNEXUS,AP127996,11,jeetyetmedia.com,,Business,"[""Business\u0004Non-SpecificUserGeneratedConte...",0
3,0RVWazSlec,OPENX,OX32427,17,ffog.net,,Business,"[""Business""]",0
4,0RVWazSlec,OPENX,OX9538,14,profilecraze.com,1.0,Technology-Computing,"[""Technology-Computing""]",0
...,...,...,...,...,...,...,...,...,...
99994,0RuDzY7CkR,CONTEXTWEB,CW545477,9,nj.com,,Uncategorized,"[""Uncategorized""]",2
99995,0R6P807N0u,ADMELD,AM710,20,animecrazy.net,,Non-SpecificUserGeneratedContent-UGC,"[""Non-SpecificUserGeneratedContent-UGC\u0004Ar...",1
99996,0RuDzY7CkR,CONTEXTWEB,CW540797,16,dollardays.com,,Pets,"[""Pets""]",1
99997,0R6P807N0u,RUBICON,RB13695,23,cloudfront.net,,Uncategorized,"[""Uncategorized""]",0


In [40]:
# convert categorical features to numerical features
Data['creative_uid'] = pd.get_dummies(Data['creative_uid'])
Data['exchange'] = pd.get_dummies(Data['exchange'])
Data['exchange_publisher'] = pd.get_dummies(Data['exchange_publisher'])
Data['sitename'] = pd.get_dummies(Data['sitename'])
Data['top_category'] = pd.get_dummies(Data['top_category'])
Data['categories_list'] = pd.get_dummies(Data['categories_list'])

Data['creative_uid']

0        1
1        1
2        1
3        1
4        1
        ..
99994    1
99995    0
99996    1
99997    0
99998    1
Name: creative_uid, Length: 199998, dtype: uint8

In [31]:
# outliers?
Data['user_local_hour'].value_countsounts()

20    12425
21    12081
12    11909
22    11860
10    11622
11    11605
13    11255
15    10983
17    10823
19    10770
9     10586
18    10573
16    10463
14    10322
8      7834
23     6535
7      6168
0      4961
6      3643
1      3583
5      2982
2      2777
3      2267
4      1971
Name: user_local_hour, dtype: int64

In [41]:
# replace the NAN data with the most popular data
# maybe not the best solution though...
Data.isna().any()
Data['user_gender'] = Data['user_gender'].fillna(Data['user_gender'].value_counts().index[0])

0        1.0
1        0.0
2        1.0
3        1.0
4        1.0
        ... 
99994    1.0
99995    1.0
99996    1.0
99997    1.0
99998    1.0
Name: user_gender, Length: 199998, dtype: float64

In [42]:
# create training and testing vars
X_train, X_test, y_train, y_test = train_test_split(Data, y, test_size=0.3)

print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(139998, 9) (139998, 1)
(60000, 9) (60000, 1)


In [27]:
# MODEL1
model = SVC(gamma = 'auto', kernel = 'linear')
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [28]:
metrics.accuracy_score(y_test, model.predict(X_test))

0.7688833333333334

In [43]:
# MODEL2
model2 = SVC(gamma = 'auto', kernel = 'rbf')
model2.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [44]:
print( 'accuracy with gaussian kernel = ', metrics.accuracy_score(y_test, model2.predict(X_test)) )
# strange! 

accuracy with gaussian kernel =  0.58185
