In [1]:
import datetime
import pandas as pd
import numpy as np
import geopy as gp
import xgboost as xgb
import nltk
import random
import time
import zipfile


from geopy.distance import vincenty
from sklearn.cross_validation import train_test_split
from operator import itemgetter
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.corpus import stopwords



In [2]:

def get_features(train, test):
    trainval = list(train.columns.values)
    testval = list(test.columns.values)
    output = intersect(trainval, testval)
    output.remove('itemID_1')
    output.remove('itemID_2')
    return output


def intersect(a, b):
    return list(set(a) & set(b))

In [3]:


def prep_dataset(is_train, dataset_pairs_path, dataset_info_path):
    start_time = time.time()

    types1 = {
        'itemID_1': np.dtype(int),
        'itemID_2': np.dtype(int),
        'isDuplicate': np.dtype(int),
        'generationMethod': np.dtype(int),
    }

    types2 = {
        'itemID': np.dtype(int),
        'categoryID': np.dtype(int),
        'title': np.dtype(str),
        'description': np.dtype(str),
        'images_array': np.dtype(str),
        'attrsJSON': np.dtype(str),
        'price': np.dtype(float),
        'locationID': np.dtype(int),
        'metroID': np.dtype(float),
        'lat': np.dtype(float),
        'lon': np.dtype(float),
    }

    pairs = pd.read_csv(dataset_pairs_path, dtype=types1)

    # Add 'id' column for easy merge
    items = pd.read_csv(dataset_info_path, dtype=types2)
    items.fillna(-1, inplace=True)
    location = pd.read_csv("input/Location.csv")
    category = pd.read_csv("input/Category.csv")

    train = pairs
    
    if is_train:
        train = train.drop(['generationMethod'], axis=1)

    print('Add text features...')
    train['len_title'] = items['title'].str.len()
    train['len_description'] = items['description'].str.len()
    train['len_attrsJSON'] = items['attrsJSON'].str.len()

    print('Merge item 1...')
    item1 = items[['itemID', 'categoryID', 'description', 'price', 'locationID', 'metroID', 'lat', 'lon']]
    item1 = pd.merge(item1, category, how='left', on='categoryID', left_index=True)
    item1 = pd.merge(item1, location, how='left', on='locationID', left_index=True)

    item1 = item1.rename(
        columns={
            'itemID': 'itemID_1',
            'categoryID': 'categoryID_1',
            'description': 'description_1',
            'parentCategoryID': 'parentCategoryID_1',
            'price': 'price_1',
            'locationID': 'locationID_1',
            'regionID': 'regionID_1',
            'metroID': 'metroID_1',
            'lat': 'lat_1',
            'lon': 'lon_1'
        }
    )

    # Add item 1 data
    train = pd.merge(train, item1, how='left', on='itemID_1', left_index=True)

    print('Merge item 2...')
    item2 = items[['itemID', 'categoryID', 'description', 'price', 'locationID', 'metroID', 'lat', 'lon']]
    item2 = pd.merge(item2, category, how='left', on='categoryID', left_index=True)
    item2 = pd.merge(item2, location, how='left', on='locationID', left_index=True)

    item2 = item2.rename(
        columns={
            'itemID': 'itemID_2',
            'categoryID': 'categoryID_2',
            'description': 'description_2',
            'parentCategoryID': 'parentCategoryID_2',
            'price': 'price_2',
            'locationID': 'locationID_2',
            'regionID': 'regionID_2',
            'metroID': 'metroID_2',
            'lat': 'lat_2',
            'lon': 'lon_2'
        }
    )

    # Add item 2 data
    train = pd.merge(train, item2, how='left', on='itemID_2', left_index=True)

    # Create same arrays
    print('Create same arrays')
    train['price_same'] = np.equal(train['price_1'], train['price_2']).astype(np.int32)
    train['locationID_same'] = np.equal(train['locationID_1'], train['locationID_2']).astype(np.int32)
    train['categoryID_same'] = np.equal(train['categoryID_1'], train['categoryID_2']).astype(np.int32)
    train['regionID_same'] = np.equal(train['regionID_1'], train['regionID_2']).astype(np.int32)
    train['metroID_same'] = np.equal(train['metroID_1'], train['metroID_2']).astype(np.int32)
    train['lat_same'] = np.equal(train['lat_1'], train['lat_2']).astype(np.int32)
    train['lon_same'] = np.equal(train['lon_1'], train['lon_2']).astype(np.int32)
    
    # Create distance
    def geopydistance(a, b, c, d):
        p1 = gp.Point([a, b])
        p2 = gp.Point([c, d])
        return vincenty(p1,p2).kilometers
    vecfunc = np.vectorize(geopydistance)
    
    train['distance'] = vecfunc(train['lat_1'], train['lon_1'], train['lat_2'], train['lon_2'])
    
    # Create description distance
    def description_distance(a, b):
        try:
            asp = a.split()
            bsp = b.split()
            same_words = 0
            for w1 in asp:
                for w2 in bsp:
                    if w1 == w2:
                        same_words = same_words + 1
            return same_words
        except:
            return 0
        
    vecfunc = np.vectorize(description_distance)
    
    train['description_distance'] = vecfunc(train['description_1'], train['description_2'])
    
    #tfidf = TfidfVectorizer(stop_words = stopwords.words('russian')).fit_transform([train['description_1'].str, train['description_2'].str])
    #cosine_similarities = linear_kernel(tfidf[0], tfidf[1]).flatten()    
    #train['description_diff'] = cosine_similarities[0]
    
    print('Create train data time: {} seconds'.format(round(time.time() - start_time, 2)))
    train.drop(['description_1', 
                'description_2'], axis=1, inplace=True)
    return train

def read_test_train(from_disk=False):
    
    train = prep_dataset(True, "input/ItemPairs_train.csv", "input/ItemInfo_train.csv")
    train.fillna(-1, inplace=True)
    train.to_csv('input/train_merged.csv')


    test = prep_dataset(False, "input/ItemPairs_test.csv", "input/ItemInfo_test.csv")
    test.fillna(-1, inplace=True)
    test.to_csv('input/test_merged.csv')

    features = get_features(train, test)
    return train, test, features


In [4]:
train, test, features = read_test_train()
print('Length of train: ', len(train))
print('Length of test: ', len(test))
print('Features [{}]: {}'.format(len(features), sorted(features)))


Add text features...
Merge item 1...
Merge item 2...
Create same arrays
Create train data time: 401.69 seconds
Add text features...
Merge item 1...
Merge item 2...
Create same arrays
Create train data time: 116.81 seconds
('Length of train: ', 2991396)
('Length of test: ', 1044196)
Features [28]: ['categoryID_1', 'categoryID_2', 'categoryID_same', 'description_distance', 'distance', 'lat_1', 'lat_2', 'lat_same', 'len_attrsJSON', 'len_description', 'len_title', 'locationID_1', 'locationID_2', 'locationID_same', 'lon_1', 'lon_2', 'lon_same', 'metroID_1', 'metroID_2', 'metroID_same', 'parentCategoryID_1', 'parentCategoryID_2', 'price_1', 'price_2', 'price_same', 'regionID_1', 'regionID_2', 'regionID_same']
