In [103]:
import json, re
import pandas as pd
import numpy as np

from sklearn import preprocessing
from math import sqrt
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer the
# model_selection module and fall back only on older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import roc_auc_score as AUC

In [2]:
# Data fields

# TripType - a categorical id representing the type of shopping trip the customer made. This is the ground truth that you are predicting. TripType_999 is an "other" category.
# VisitNumber - an id corresponding to a single trip by a single customer
# Weekday - the weekday of the trip
# Upc - the UPC number of the product purchased
# ScanCount - the number of the given item that was purchased. A negative value indicates a product return.
# DepartmentDescription - a high-level description of the item's department
# FinelineNumber - a more refined category for each of the products, created by Walmart

# Load both competition files with every column read as text (dtype=str),
# training file first, then the test file.
train_data, test_data = (
    pd.read_csv(csv_name, dtype=str) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Distinct TripType labels present in the training rows.
train_data.TripType.unique()

array(['999', '30', '26', '8', '35', '41', '21', '6', '42', '7', '9', '39',
       '25', '38', '15', '36', '20', '37', '32', '40', '5', '3', '4', '24',
       '33', '43', '31', '27', '34', '18', '29', '44', '19', '23', '22',
       '28', '14', '12'], dtype=object)

In [4]:
# train_data.columns
# test_data.columns
# Show the dtype pandas assigned to each column of the test set.
test_data.dtypes
# train_data.dtypes

VisitNumber              object
Weekday                  object
Upc                      object
ScanCount                object
DepartmentDescription    object
FinelineNumber           object
dtype: object

In [71]:
# List the column names of the training set.
train_data.columns

Index([u'TripType', u'VisitNumber', u'Weekday', u'Upc', u'ScanCount',
       u'DepartmentDescription', u'FinelineNumber'],
      dtype='object')

In [43]:
# Distinct department descriptions that occur in the training rows.
train_data['DepartmentDescription'].unique()

array(['FINANCIAL SERVICES', 'SHOES', 'PERSONAL CARE',
       'PAINT AND ACCESSORIES', 'DSD GROCERY', 'MEAT - FRESH & FROZEN',
       'DAIRY', 'PETS AND SUPPLIES', 'HOUSEHOLD CHEMICALS/SUPP', 0,
       'IMPULSE MERCHANDISE', 'PRODUCE', 'CANDY, TOBACCO, COOKIES',
       'GROCERY DRY GOODS', 'BOYS WEAR', 'FABRICS AND CRAFTS',
       'JEWELRY AND SUNGLASSES', 'MENS WEAR', 'ACCESSORIES',
       'HOME MANAGEMENT', 'FROZEN FOODS', 'SERVICE DELI',
       'INFANT CONSUMABLE HARDLINES', 'PRE PACKED DELI', 'COOK AND DINE',
       'PHARMACY OTC', 'LADIESWEAR', 'COMM BREAD', 'BAKERY',
       'HOUSEHOLD PAPER GOODS', 'CELEBRATION', 'HARDWARE', 'BEAUTY',
       'AUTOMOTIVE', 'BOOKS AND MAGAZINES', 'SEAFOOD', 'OFFICE SUPPLIES',
       'LAWN AND GARDEN', 'SHEER HOSIERY', 'WIRELESS', 'BEDDING',
       'BATH AND SHOWER', 'HORTICULTURE AND ACCESS', 'HOME DECOR', 'TOYS',
       'INFANT APPAREL', 'LADIES SOCKS', 'PLUS AND MATERNITY',
       'ELECTRONICS', 'GIRLS WEAR, 4-6X  AND 7-14', 'BRAS & SHAPEWEAR',
 

In [47]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
# One encoder of each kind, both created with their default settings.
enc, le = OneHotEncoder(), LabelEncoder()
# enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])  
# OneHotEncoder(categorical_features='all', dtype=<'float'>,
#        handle_unknown='error', n_values='auto', sparse=True)
# enc.n_values_

In [54]:
# Fit the label encoder on a hand-typed list of department names.
# NOTE(review): the list is hard-coded rather than taken from train_data, and
# it has no entry for the 0 value that the DepartmentDescription column was
# shown to contain -- confirm that omission is intended.
le.fit(['FINANCIAL SERVICES', 'SHOES', 'PERSONAL CARE',
       'PAINT AND ACCESSORIES', 'DSD GROCERY', 'MEAT - FRESH & FROZEN',
       'DAIRY', 'PETS AND SUPPLIES', 'HOUSEHOLD CHEMICALS/SUPP',
       'IMPULSE MERCHANDISE', 'PRODUCE', 'CANDY, TOBACCO, COOKIES',
       'GROCERY DRY GOODS', 'BOYS WEAR', 'FABRICS AND CRAFTS',
       'JEWELRY AND SUNGLASSES', 'MENS WEAR', 'ACCESSORIES',
       'HOME MANAGEMENT', 'FROZEN FOODS', 'SERVICE DELI',
       'INFANT CONSUMABLE HARDLINES', 'PRE PACKED DELI', 'COOK AND DINE',
       'PHARMACY OTC', 'LADIESWEAR', 'COMM BREAD', 'BAKERY',
       'HOUSEHOLD PAPER GOODS', 'CELEBRATION', 'HARDWARE', 'BEAUTY',
       'AUTOMOTIVE', 'BOOKS AND MAGAZINES', 'SEAFOOD', 'OFFICE SUPPLIES',
       'LAWN AND GARDEN', 'SHEER HOSIERY', 'WIRELESS', 'BEDDING',
       'BATH AND SHOWER', 'HORTICULTURE AND ACCESS', 'HOME DECOR', 'TOYS',
       'INFANT APPAREL', 'LADIES SOCKS', 'PLUS AND MATERNITY',
       'ELECTRONICS', 'GIRLS WEAR, 4-6X  AND 7-14', 'BRAS & SHAPEWEAR',
       'LIQUOR,WINE,BEER', 'SLEEPWEAR/FOUNDATIONS', 'CAMERAS AND SUPPLIES',
       'SPORTING GOODS', 'PLAYERS AND ELECTRONICS', 'PHARMACY RX',
       'MENSWEAR', 'OPTICAL - FRAMES', 'SWIMWEAR/OUTERWEAR',
       'OTHER DEPARTMENTS', 'MEDIA AND GAMING', 'FURNITURE',
       'OPTICAL - LENSES', 'SEASONAL', 'LARGE HOUSEHOLD GOODS',
       '1-HR PHOTO', 'CONCEPT STORES', 'HEALTH AND BEAUTY AIDS'])

LabelEncoder()

In [60]:
# Label that the fitted encoder maps to integer code 1.
le.classes_[1]

'ACCESSORIES'

In [57]:
# Encode the same department names with the fitted encoder; the resulting
# array holds one integer code per name, in the order the names are listed.
le.transform(['FINANCIAL SERVICES', 'SHOES', 'PERSONAL CARE',
       'PAINT AND ACCESSORIES', 'DSD GROCERY', 'MEAT - FRESH & FROZEN',
       'DAIRY', 'PETS AND SUPPLIES', 'HOUSEHOLD CHEMICALS/SUPP',
       'IMPULSE MERCHANDISE', 'PRODUCE', 'CANDY, TOBACCO, COOKIES',
       'GROCERY DRY GOODS', 'BOYS WEAR', 'FABRICS AND CRAFTS',
       'JEWELRY AND SUNGLASSES', 'MENS WEAR', 'ACCESSORIES',
       'HOME MANAGEMENT', 'FROZEN FOODS', 'SERVICE DELI',
       'INFANT CONSUMABLE HARDLINES', 'PRE PACKED DELI', 'COOK AND DINE',
       'PHARMACY OTC', 'LADIESWEAR', 'COMM BREAD', 'BAKERY',
       'HOUSEHOLD PAPER GOODS', 'CELEBRATION', 'HARDWARE', 'BEAUTY',
       'AUTOMOTIVE', 'BOOKS AND MAGAZINES', 'SEAFOOD', 'OFFICE SUPPLIES',
       'LAWN AND GARDEN', 'SHEER HOSIERY', 'WIRELESS', 'BEDDING',
       'BATH AND SHOWER', 'HORTICULTURE AND ACCESS', 'HOME DECOR', 'TOYS',
       'INFANT APPAREL', 'LADIES SOCKS', 'PLUS AND MATERNITY',
       'ELECTRONICS', 'GIRLS WEAR, 4-6X  AND 7-14', 'BRAS & SHAPEWEAR',
       'LIQUOR,WINE,BEER', 'SLEEPWEAR/FOUNDATIONS', 'CAMERAS AND SUPPLIES',
       'SPORTING GOODS', 'PLAYERS AND ELECTRONICS', 'PHARMACY RX',
       'MENSWEAR', 'OPTICAL - FRAMES', 'SWIMWEAR/OUTERWEAR',
       'OTHER DEPARTMENTS', 'MEDIA AND GAMING', 'FURNITURE',
       'OPTICAL - LENSES', 'SEASONAL', 'LARGE HOUSEHOLD GOODS',
       '1-HR PHOTO', 'CONCEPT STORES', 'HEALTH AND BEAUTY AIDS'])

array([20, 62, 50, 49, 17, 41, 16, 51, 30, 32, 57, 11, 24,  8, 19, 35, 43,
        1, 28, 21, 60, 34, 56, 15, 52, 37, 13,  3, 31, 12, 25,  5,  2,  7,
       58, 45, 39, 61, 67,  6,  4, 29, 27, 66, 33, 36, 55, 18, 23,  9, 40,
       63, 10, 64, 54, 53, 44, 46, 65, 48, 42, 22, 47, 59, 38,  0, 14, 26])

In [58]:
#http://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Create some toy data in a Pandas dataframe
# Four-row demo frame: two text columns and one numeric column.
fruit_data = pd.DataFrame(dict(
    fruit=['apple', 'orange', 'pear', 'orange'],
    color=['red', 'orange', 'green', 'green'],
    weight=[5, 6, 3, 4],
))

class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns, one LabelEncoder per column.

    Parameters
    ----------
    columns : iterable of column names, optional
        Columns to encode. When None (the default), every column of the
        frame passed in is encoded.

    Notes
    -----
    ``fit`` learns and stores one ``LabelEncoder`` per target column and
    ``transform`` reuses those encoders, so ``fit(train).transform(test)``
    gives the same label the same integer in both frames. A column for which
    nothing has been fitted is still encoded from its own values, which keeps
    ``transform`` usable on an instance that was never fitted.
    """

    def __init__(self, columns=None):
        self.columns = columns  # column names to encode; None means all

    def _target_columns(self, X):
        """Return the names of the columns of X that should be encoded."""
        if self.columns is None:
            return list(X.columns)
        return list(self.columns)

    def fit(self, X, y=None):
        """Learn one LabelEncoder per target column of X.

        ``y`` is accepted for API compatibility and ignored. Returns self.
        """
        self.encoders_ = {
            col: LabelEncoder().fit(X[col]) for col in self._target_columns(X)
        }
        return self

    def transform(self, X):
        """Return a copy of X with the target columns replaced by integer codes.

        Columns seen by ``fit`` are encoded with the stored encoders; per
        scikit-learn's LabelEncoder, a label that was not present at fit
        time raises ValueError. Any other target column is encoded with a
        fresh LabelEncoder fitted on that column alone. X is not modified.
        """
        output = X.copy()
        fitted = getattr(self, 'encoders_', {})
        # Iterate over column names instead of DataFrame.iteritems(), which
        # no longer exists in pandas 2.x.
        for col in self._target_columns(output):
            if col in fitted:
                output[col] = fitted[col].transform(output[col])
            else:
                output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self, X, y=None):
        """Fit on X and return the encoded copy of X."""
        return self.fit(X, y).transform(X)

In [59]:
# Demo: encode the two text columns of the toy frame; 'weight' is not listed
# and is therefore left as it is.
MultiColumnLabelEncoder(columns = ['fruit','color']).fit_transform(fruit_data)

Unnamed: 0,color,fruit,weight
0,2,0,5
1,1,1,6
2,0,2,3
3,0,1,4


In [64]:
# Replace the Weekday and DepartmentDescription text with integer codes.
# NOTE(review): the codes are learned from this frame alone and test_data is
# encoded by a separate call, so the same label can receive a different code
# in each frame unless both hold exactly the same set of values -- verify.
train_data = MultiColumnLabelEncoder(columns=['Weekday', 'DepartmentDescription']).fit_transform(train_data)

In [67]:
# Replace the Weekday and DepartmentDescription text with integer codes.
# NOTE(review): this fits new encoders on test_data itself instead of reusing
# the mapping learned from train_data, so the integer codes of the two frames
# are only comparable if both contain the same set of values -- verify.
test_data = MultiColumnLabelEncoder(columns=['Weekday', 'DepartmentDescription']).fit_transform(test_data)

In [73]:
# Select the six columns listed below from train_data (TripType is not among them).
features_of_interest = train_data[['VisitNumber', 'Weekday', 'Upc', 'ScanCount','DepartmentDescription', 'FinelineNumber']]

In [78]:
# Preview the first two rows of the selected feature columns.
features_of_interest.head(2)

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,5,0,68113152929,-1,21,1000
1,7,0,60538815980,1,63,8931


In [90]:
# Scaling notes (reference formulas only, not executed)
# mean = sum(x) / float(len(x))
# std_dev = (sum((x_i - mean) ** 2 for x_i in x) / float(len(x))) ** 0.5

# z_scores = [(x_i - mean) / std_dev for x_i in x]

# Min-max scaling

# minmax = [(x_i - min(x)) / float(max(x) - min(x)) for x_i in x]

In [88]:
# DataFrame.convert_objects was deprecated and later removed from pandas;
# pd.to_numeric is the supported way to parse text columns as numbers.
# errors='coerce' turns any value that cannot be parsed into NaN.
train_data = train_data.apply(pd.to_numeric, errors='coerce')

In [89]:
# DataFrame.convert_objects was deprecated and later removed from pandas;
# pd.to_numeric is the supported way to parse text columns as numbers.
# errors='coerce' turns any value that cannot be parsed into NaN.
test_data = test_data.apply(pd.to_numeric, errors='coerce')

In [92]:
# Cast the whole frame to 64-bit integers in one vectorised step instead of
# calling np.int64 on every cell (applymap is also deprecated in recent pandas).
# fillna(0) comes first because a NaN cannot be cast to an integer; 0 is the
# placeholder already visible among the DepartmentDescription values.
train_data = train_data.fillna(0).astype(np.int64)

In [93]:
# Cast the whole frame to 64-bit integers in one vectorised step instead of
# calling np.int64 on every cell (applymap is also deprecated in recent pandas).
# fillna(0) comes first because a NaN cannot be cast to an integer, so any
# value that is still missing becomes 0.
test_data = test_data.fillna(0).astype(np.int64)

In [95]:
# Target vector: the TripType column as a NumPy array.
Y = train_data['TripType'].values

In [99]:
# Feature matrix: the six listed columns of train_data as a NumPy array,
# one row per row of the frame.
X = train_data[['VisitNumber', 'Weekday', 'Upc', 'ScanCount',
       'DepartmentDescription', 'FinelineNumber']].values