In [1]:
#load all necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.cm as cm
import seaborn as sns
import cartopy.crs as ccrs
import cartopy.io.shapereader as shpreader
import pycountry
import itertools

%matplotlib inline

from mpl_toolkits.axes_grid1 import make_axes_locatable
from IPython.display import display

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('Data/bids.csv')
data.drop(['ip', 'url'], axis = 1, inplace = True)
data.head()

Unnamed: 0,bid_id,bidder_id,auction,merchandise,device,time,country
0,0,8dac2b259fd1c6d1120e519fb1ac14fbqvax8,ewmzr,jewelry,phone0,9759243157894736,us
1,1,668d393e858e8126275433046bbd35c6tywop,aeqok,furniture,phone1,9759243157894736,in
2,2,aa5f360084278b35d746fa6af3a7a1a5ra3xe,wa00e,home goods,phone2,9759243157894736,py
3,3,3939ac3ef7d472a59a9c5f893dd3e39fh9ofi,jefix,jewelry,phone4,9759243157894736,in
4,4,8393c48eaf4b8fa96886edc7cf27b372dsibi,jefix,jewelry,phone5,9759243157894736,in


In [3]:
bidder_train = pd.read_csv('Data/train.csv')
bidder_train.drop(['payment_account', 'address'], axis=1, inplace = True)
bidder_train.head()

Unnamed: 0,bidder_id,outcome
0,91a3c57b13234af24875c56fb7e2b2f4rb56a,0.0
1,624f258b49e77713fc34034560f93fb3hu3jo,0.0
2,1c5f4fc669099bfbfac515cd26997bd12ruaj,0.0
3,4bee9aba2abda51bf43d639013d6efe12iycd,0.0
4,4ab12bc61c82ddd9c2d65e60555808acqgos1,0.0


In [4]:
bidder_test = pd.read_csv('Data/test.csv')
bidder_test.drop(['payment_account', 'address'], axis=1, inplace = True)
bidder_test.head()

Unnamed: 0,bidder_id
0,49bb5a3c944b8fc337981cc7a9ccae41u31d7
1,a921612b85a1494456e74c09393ccb65ylp4y
2,6b601e72a4d264dab9ace9d7b229b47479v6i
3,eaf0ed0afc9689779417274b4791726cn5udi
4,cdecd8d02ed8c6037e38042c7745f688mx5sf


In [5]:
num_auction = pd.DataFrame(data[['bidder_id', 'auction', 'merchandise']]\
                           .groupby(['merchandise','bidder_id'])['auction'].nunique())
num_bids = data[['bidder_id','bid_id','merchandise']].groupby(['merchandise','bidder_id']).count()

In [6]:
num_auction.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,auction
merchandise,bidder_id,Unnamed: 2_level_1
auto parts,631f923b7bcf91f61a2e9f96520bd8c8vtl14,12
auto parts,8ebc5b544d62622e45e1ee22a56265e0xwwsb,66
books and music,0061edfc5b07ff3d70d693883a38d370oy4fs,38
books and music,00a0517965f18610417ee784a05f494d4dw6e,74
books and music,01c5692d487454cdbe731df330bef608f48zz,19


In [7]:
num_bids.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,bid_id
merchandise,bidder_id,Unnamed: 2_level_1
auto parts,631f923b7bcf91f61a2e9f96520bd8c8vtl14,2413
auto parts,8ebc5b544d62622e45e1ee22a56265e0xwwsb,7344
books and music,0061edfc5b07ff3d70d693883a38d370oy4fs,134
books and music,00a0517965f18610417ee784a05f494d4dw6e,141
books and music,01c5692d487454cdbe731df330bef608f48zz,82


In [8]:
num_bids_auction = pd.DataFrame(num_bids.bid_id/num_auction.auction).rename(columns = {0:'avg_num_bids'})

In [9]:
num_bids_auction = num_bids_auction.reset_index()

In [10]:
merged_data = num_bids_auction.merge(data, left_on = ['bidder_id','merchandise'],
                                     right_on=['bidder_id','merchandise'], how = 'right')

In [11]:
merged_data.country = merged_data.country.astype('category')
merged_data.device = merged_data.device.astype('category')

In [12]:
merged_data[['country', 'device']] = merged_data[['country', 'device']].apply(lambda x: x.cat.codes)

In [13]:
merged_data.head()

Unnamed: 0,merchandise,bidder_id,avg_num_bids,bid_id,auction,device,time,country
0,auto parts,631f923b7bcf91f61a2e9f96520bd8c8vtl14,201.083333,1117,0bxy9,6022,9759249421052631,132
1,auto parts,631f923b7bcf91f61a2e9f96520bd8c8vtl14,201.083333,1494,ca3p4,6022,9759251736842105,132
2,auto parts,631f923b7bcf91f61a2e9f96520bd8c8vtl14,201.083333,3327,vev1t,469,9759261736842105,15
3,auto parts,631f923b7bcf91f61a2e9f96520bd8c8vtl14,201.083333,4193,04hdj,1379,9759265631578947,83
4,auto parts,631f923b7bcf91f61a2e9f96520bd8c8vtl14,201.083333,7938,ggsdh,1379,9759274315789473,83


In [14]:
countries = pd.DataFrame(data.groupby(['bidder_id', 'country']).count().reset_index()\
                 .groupby('bidder_id')['country'].count()).rename(columns={'country':'num_countries'})

In [15]:
merged_data_1 = countries.merge(merged_data, left_index = True, right_on = 'bidder_id', how = 'right')

In [16]:
devices = pd.DataFrame(data.groupby(['bidder_id', 'device']).count().reset_index()\
                 .groupby('bidder_id')['device'].count()).rename(columns={'device':'num_device'})

In [17]:
merged_data_2 = devices.merge(merged_data_1, left_index = True, right_on = 'bidder_id', how = 'right')

In [18]:
train_data = merged_data_2.merge(bidder_train, left_on = 'bidder_id', right_on = 'bidder_id', how = 'right')
labels = train_data.pop('outcome')

In [96]:
train_data.head()

Unnamed: 0,num_device,num_countries,merchandise,bidder_id,avg_num_bids,bid_id,auction,device,time,country
0,154.0,34.0,auto parts,631f923b7bcf91f61a2e9f96520bd8c8vtl14,201.083333,1117.0,0bxy9,6022.0,9759249000000000.0,132.0
1,154.0,34.0,auto parts,631f923b7bcf91f61a2e9f96520bd8c8vtl14,201.083333,1494.0,ca3p4,6022.0,9759252000000000.0,132.0
2,154.0,34.0,auto parts,631f923b7bcf91f61a2e9f96520bd8c8vtl14,201.083333,3327.0,vev1t,469.0,9759262000000000.0,15.0
3,154.0,34.0,auto parts,631f923b7bcf91f61a2e9f96520bd8c8vtl14,201.083333,4193.0,04hdj,1379.0,9759266000000000.0,83.0
4,154.0,34.0,auto parts,631f923b7bcf91f61a2e9f96520bd8c8vtl14,201.083333,7938.0,ggsdh,1379.0,9759274000000000.0,83.0


In [19]:
test_data = merged_data_2.merge(bidder_test, left_on = 'bidder_id', right_on = 'bidder_id', how = 'right')

In [20]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_data, labels, test_size = 0.2)

In [1]:
from sklearn.linear_model import SGDClassifier
 
def iter_minibatches(chunksize):
    # Provide chunks one by one
    chunkstartmarker = 0
    while chunkstartmarker < len(x_train):
        chunkrows = range(chunkstartmarker,chunkstartmarker+chunksize)
        X_chunk = x_train.drop(['merchandise', 'bidder_id', 'auction'], axis = 1)\
                         .fillna(method='ffill').iloc[list(chunkrows)]
        y_chunk = y_train.iloc[list(chunkrows)]
        yield X_chunk, y_chunk
        chunkstartmarker += chunksize

In [None]:
#from sklearn.linear_model import SGDClassifier
batcherator = iter_minibatches(chunksize=10000)
model = SGDClassifier()
# Train model
for X_chunk, y_chunk in batcherator:
    model.partial_fit(X_chunk, y_chunk, classes=np.unique(y_chunk))
    # Now make predictions with trained model
    y_predicted = model.predict(x_test.fillna('0').drop(['merchandise', 'bidder_id', 'auction'], axis = 1))

In [35]:
from lightning.classification import LinearSVC
svc = LinearSVC()

In [36]:
svc.fit(x_train.drop(['merchandise', 'bidder_id', 'auction'], axis = 1).fillna(method='ffill'), y_train)

LinearSVC(C=1.0, callback=None, criterion='accuracy', loss='hinge',
     max_iter=1000, n_calls=100, permute=True, random_state=None,
     shrinking=True, tol=0.001, verbose=0, warm_start=False)

In [37]:
svc.score(x_test.fillna(method='ffill').drop(['merchandise', 'bidder_id', 'auction'], axis = 1), y_test)

0.86646012786303972

In [45]:
from lightning.classification import SGDClassifier
sgd = SGDClassifier(learning_rate="constant",
                      alpha=1e-3,
                      max_iter=20,
random_state=0)

In [47]:
sgd.fit(X=x_train.drop(['merchandise', 'bidder_id', 'auction'], axis = 1).fillna(method='ffill'), y=y_train)

SGDClassifier(alpha=0.001, callback=None, epsilon=0.01, eta0=0.03,
       fit_intercept=True, intercept_decay=1.0, learning_rate='constant',
       loss='hinge', max_iter=20, multiclass=False, n_calls=100,
       penalty='l2', power_t=0.5, random_state=0, shuffle=True, verbose=0)

In [61]:
sgd.score(x_test.fillna(method='ffill').drop(['merchandise', 'bidder_id', 'auction'], axis = 1), y_test)

NameError: name 'sgd' is not defined

In [91]:
from lightning.classification import AdaGradClassifier
ada = AdaGradClassifier()

In [92]:
ada.fit(X=x_train.drop(['merchandise', 'bidder_id', 'auction'], axis = 1).fillna(method='ffill'), y=y_train)

AdaGradClassifier(alpha=1.0, callback=None, eta=1.0, gamma=1.0, l1_ratio=0,
         loss='hinge', n_calls=None, n_iter=10, random_state=None,
         shuffle=True)

In [93]:
ada.score(x_test.fillna(method='ffill').drop(['merchandise', 'bidder_id', 'auction'], axis = 1), y_test)

0.86617197204400154

In [79]:
pred = ada.predict(test_data.fillna(method='ffill').drop(['merchandise', 'bidder_id', 'auction'], axis = 1))

In [80]:
pred_df = pd.DataFrame(pred)

In [67]:
bidder_ids = test_data[['bidder_id']]

In [68]:
sub = pd.merge(bidder_ids, pred_df, left_index=True, right_index=True, how = 'left')

In [69]:
sub = sub.rename(columns = {0:'prediction'})

In [70]:
sub_new = sub.groupby('bidder_id')['prediction']

In [33]:
len(sub_new)

4700

In [46]:
sub_final = pd.DataFrame(sub_new).rename(columns={0:'prediction'}).reset_index()

In [48]:
sub_final.to_csv('submission.csv')

In [49]:
tb_ids = bidder_test[['bidder_id']]

In [50]:
sub_ulti = tb_ids.merge(sub_final, left_on = 'bidder_id', right_on='bidder_id', how = 'left')

In [51]:
sub_ulti.set_index('bidder_id').to_csv('submission.csv')

In [58]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7656334 entries, 0 to 7656333
Data columns (total 7 columns):
bid_id         int64
bidder_id      object
auction        object
merchandise    object
device         object
time           int64
country        object
dtypes: int64(2), object(5)
memory usage: 408.9+ MB
