In [1]:
import json, re
import pandas as pd
import numpy as np

from sklearn import preprocessing
from math import sqrt
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import roc_auc_score as AUC

In [2]:
# Data fields

# TripType - a categorical id representing the type of shopping trip the customer made. This is the ground truth that you are predicting. TripType_999 is an "other" category.
# VisitNumber - an id corresponding to a single trip by a single customer
# Weekday - the weekday of the trip
# Upc - the UPC number of the product purchased
# ScanCount - the number of the given item that was purchased. A negative value indicates a product return.
# DepartmentDescription - a high-level description of the item's department
# FinelineNumber - a more refined category for each of the products, created by Walmart

# Read the training and test CSV files (in that order) from the working directory.
train_data, test_data = map(pd.read_csv, ('train.csv', 'test.csv'))

In [3]:
test_data.head(3)

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,1,Friday,72503389714,1,SHOES,3002
1,1,Friday,1707710732,1,DAIRY,1526
2,1,Friday,89470001026,1,DAIRY,1431


In [4]:
train_data.head(5)

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017


In [5]:
train_data.DepartmentDescription.unique()

array(['FINANCIAL SERVICES', 'SHOES', 'PERSONAL CARE',
       'PAINT AND ACCESSORIES', 'DSD GROCERY', 'MEAT - FRESH & FROZEN',
       'DAIRY', 'PETS AND SUPPLIES', 'HOUSEHOLD CHEMICALS/SUPP', nan,
       'IMPULSE MERCHANDISE', 'PRODUCE', 'CANDY, TOBACCO, COOKIES',
       'GROCERY DRY GOODS', 'BOYS WEAR', 'FABRICS AND CRAFTS',
       'JEWELRY AND SUNGLASSES', 'MENS WEAR', 'ACCESSORIES',
       'HOME MANAGEMENT', 'FROZEN FOODS', 'SERVICE DELI',
       'INFANT CONSUMABLE HARDLINES', 'PRE PACKED DELI', 'COOK AND DINE',
       'PHARMACY OTC', 'LADIESWEAR', 'COMM BREAD', 'BAKERY',
       'HOUSEHOLD PAPER GOODS', 'CELEBRATION', 'HARDWARE', 'BEAUTY',
       'AUTOMOTIVE', 'BOOKS AND MAGAZINES', 'SEAFOOD', 'OFFICE SUPPLIES',
       'LAWN AND GARDEN', 'SHEER HOSIERY', 'WIRELESS', 'BEDDING',
       'BATH AND SHOWER', 'HORTICULTURE AND ACCESS', 'HOME DECOR', 'TOYS',
       'INFANT APPAREL', 'LADIES SOCKS', 'PLUS AND MATERNITY',
       'ELECTRONICS', 'GIRLS WEAR, 4-6X  AND 7-14', 'BRAS & SHAPEWEAR',

In [81]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
# One unfitted encoder of each kind.
enc, le = OneHotEncoder(), LabelEncoder()
# enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])  
# OneHotEncoder(categorical_features='all', dtype=<'float'>,
#        handle_unknown='error', n_values='auto', sparse=True)
# enc.n_values_

In [7]:
#http://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Create some toy data in a Pandas dataframe
fruit_data = pd.DataFrame({
    'fruit':  ['apple','orange','pear','orange'],
    'color':  ['red','orange','green','green'],
    'weight': [5,6,3,4]
})

class MultiColumnLabelEncoder:
    '''
    Label-encode several DataFrame columns, one LabelEncoder per column.

    fit() learns the label -> integer mapping of every target column, so a
    later transform() on another frame (e.g. the test set) reuses exactly the
    same codes. Calling transform() on an instance that was never fitted, or
    on a column that was not present at fit time, falls back to fitting a
    throw-away encoder on the data being transformed.

    columns : list of column names to encode; None (the default) encodes
              every column of the frame that is passed in.
    '''

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode
        self.encoders_ = None  # {column name: fitted LabelEncoder}, filled in by fit()

    def _target_columns(self,X):
        '''Return the names of the columns of X that should be encoded.'''
        if self.columns is not None:
            return list(self.columns)
        return list(X.columns)

    def fit(self,X,y=None):
        '''
        Fit one LabelEncoder per target column of X and remember it.
        y is accepted for pipeline compatibility and ignored.
        Returns self.
        '''
        self.encoders_ = {
            col: LabelEncoder().fit(X[col]) for col in self._target_columns(X)
        }
        return self

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X. X itself is left untouched; an encoded copy
        is returned.
        '''
        output = X.copy()
        # getattr keeps instances created without __init__ working
        encoders = getattr(self, 'encoders_', None) or {}
        # Iterating over column names (rather than DataFrame.iteritems(), which
        # was removed in pandas 2.0) works on every pandas version.
        for col in self._target_columns(output):
            if col in encoders:
                # reuse the mapping learned in fit() so codes match across frames
                output[col] = encoders[col].transform(output[col])
            else:
                output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self,X,y=None):
        '''Fit on X, then return the encoded copy of X.'''
        return self.fit(X,y).transform(X)

In [7]:
# train_data = MultiColumnLabelEncoder(columns=['Weekday', 'DepartmentDescription']).fit_transform(train_data)

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


In [9]:
# test_data = MultiColumnLabelEncoder(columns=['Weekday', 'DepartmentDescription']).fit_transform(test_data)

In [90]:
# scaling
# mean = sum(x)/len(x)
# std_dev = (1/len(x) * sum([ (x_i - mean)**2 for x_i in x]))**0.5

# z_scores = [(x_i - mean)/std_dev for x_i in x]

# # Min-Max scaling

# minmax = [(x_i - min(x)) / (max(x) - min(x)) for x_i in x]

In [75]:
# Ask pandas to coerce the columns of train_data to numeric types (convert_numeric=True).
# NOTE(review): DataFrame.convert_objects was deprecated in pandas 0.21 and removed in 1.0,
# so this cell only runs on older pandas -- TODO port to pd.to_numeric when upgrading.
train_data = train_data.convert_objects(convert_numeric=True)

In [76]:
# Ask pandas to coerce the columns of test_data to numeric types (convert_numeric=True).
# NOTE(review): DataFrame.convert_objects was deprecated in pandas 0.21 and removed in 1.0,
# so this cell only runs on older pandas -- TODO port to pd.to_numeric when upgrading.
test_data = test_data.convert_objects(convert_numeric=True)

In [77]:
# Cast every column to 64-bit integers with one vectorised conversion per column.
# The element-wise applymap(np.int64) it replaces made one Python call per cell, and
# DataFrame.applymap is deprecated in recent pandas (2.1+).
# As before, the cast raises if a value cannot be converted to an integer (e.g. NaN
# or a non-numeric string), so it must run after such values have been handled.
train_data = train_data.astype(np.int64)

In [79]:
# test_data = test_data.applymap(np.int64)

In [None]:
# numeric x

numeric_cols = ['VisitNumber', 'Upc', 'ScanCount', 'FinelineNumber']
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and is available in every pandas version.
x_num_train = train_data[ numeric_cols ].values
x_num_test = test_data[ numeric_cols ].values

# y
# TripType is the label to predict (see the data-field notes in the loading cell).
# DepartmentDescription, which was used as the target before, is also one of the
# categorical features vectorised below, so using it as y leaked the answer into x.
y_train = train_data.TripType
# The test frame has no TripType column, so there are no test labels to collect.
y_test = None

# categorical: everything that is neither numeric nor the label.
# Missing values become 0; fillna() on the freshly dropped copy avoids inplace mutation.
cat_train = train_data.drop(numeric_cols + ['TripType'], axis = 1 ).fillna(0)
cat_test = test_data.drop(numeric_cols, axis = 1 ).fillna(0)

# One {column: value} dict per row, in row order. to_dict('records') avoids
# transposing the whole frame and does not collapse rows that share an index label.
x_cat_train = cat_train.to_dict('records')
x_cat_test = cat_test.to_dict('records')

# vectorize: fit the one-hot layout on train only, then apply the same layout to test

vectorizer = DV(sparse = False)
vec_x_cat_train = vectorizer.fit_transform(x_cat_train)
vec_x_cat_test = vectorizer.transform(x_cat_test )

# complete x: numeric columns first, one-hot columns after

x_train = np.hstack((x_num_train, vec_x_cat_train ))
x_test = np.hstack((x_num_test, vec_x_cat_test ))

In [77]:
vec_x_cat_train

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [78]:
x_train

array([['5', '68113152929', '-1', ..., 0.0, 0.0, 0.0],
       ['7', '60538815980', '1', ..., 0.0, 0.0, 0.0],
       ['7', '7410811099', '1', ..., 0.0, 0.0, 0.0],
       ..., 
       ['191346', '4072', '1', ..., 0.0, 0.0, 0.0],
       ['191347', '4190007664', '1', ..., 0.0, 0.0, 0.0],
       ['191347', '3800059655', '1', ..., 0.0, 0.0, 0.0]], dtype=object)

In [79]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18; the old
# sklearn.cross_validation module was removed in 0.20. Try the current location first
# and fall back so the cell still runs on an old installation.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the training rows for validation; random_state pins the shuffle
# so the split is reproducible.
features_train, features_test, target_train, target_test = train_test_split(
    x_train, y_train, test_size=0.20, random_state=0)

In [1]:
# from sklearn.linear_model import LogisticRegression

# logreg = LogisticRegression(C=1)
# logreg.fit(features_train, target_train)

In [6]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.nonparametric import smoothers_lowess
from pandas import Series, DataFrame
from patsy import dmatrices
from sklearn import datasets, svm

In [3]:
# -*- coding: utf-8 -*-
""" Small script that shows how to do one hot encoding
    of categorical columns in a pandas DataFrame.
    See:
    http://scikit-learn.org/dev/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder
    http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.DictVectorizer.html
"""
import pandas
import random
import numpy
from sklearn.feature_extraction import DictVectorizer

def one_hot_dataframe(data, cols, replace=False):
    """ One-hot encode the columns `cols` of the dataframe `data`.

        data    -- source DataFrame; it is not modified.
        cols    -- list of column names that need to be encoded.
        replace -- when True, the returned frame has `cols` dropped and the
                   encoded columns joined on in their place.

        Returns a 3-tuple comprising the data (replaced or as passed in), the
        vectorized data as a DataFrame sharing the index of `data`, and the
        fitted vectorizer.

        Note: DictVectorizer only expands string values into `name=value`
        indicator columns. Non-string values (a NaN in an otherwise textual
        column included) are passed through as one numeric column named
        after the source column."""
    vec = DictVectorizer(sparse=False)
    # One {column: value} dict per row, in row order. Unlike .T.to_dict().values()
    # this neither transposes the frame nor collapses rows sharing an index label.
    records = data[cols].to_dict('records')
    encoded = vec.fit_transform(records)
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); use whichever the installed version provides.
    feature_names = getattr(vec, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = vec.get_feature_names
    vecData = pd.DataFrame(encoded, columns=list(feature_names()), index=data.index)
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return (data, vecData, vec)


# def main():

#     # Get a random DataFrame
#     df = pandas.DataFrame(numpy.random.randn(25, 3), columns=['a', 'b', 'c'])

#     # Make some random categorical columns
#     df['e'] = [random.choice(('Chicago', 'Boston', 'New York')) for i in range(df.shape[0])]
#     df['f'] = [random.choice(('Chrome', 'Firefox', 'Opera', "Safari")) for i in range(df.shape[0])]
#     print df

#     # Vectorize the categorical columns: e & f
#     df, _, _ = one_hot_dataframe(df, ['e', 'f'], replace=True)
#     print df

# if __name__ == '__main__':
#     main()

In [9]:
# Vectorize the categorical columns: Weekday & DepartmentDescription.
# replace=True swaps the two source columns for their one-hot columns; the separate
# encoded frame and the fitted vectorizer returned alongside are discarded.
# NOTE(review): train_data is rebound to the encoded frame, so re-running this cell
# fails once the source columns are gone -- TODO consider a new name for the result.
train_data, _, _ = one_hot_dataframe(train_data, ['Weekday', 'DepartmentDescription'], replace=True)

In [10]:
# One-hot encode the same two columns of the test set.
# NOTE(review): this call fits its own DictVectorizer on test_data, so the resulting
# columns follow the categories present in the test set and may not line up with the
# training columns -- TODO confirm, or reuse the vectorizer fitted on the training data.
test_data, _, _ = one_hot_dataframe(test_data, ['Weekday', 'DepartmentDescription'], replace = True)

In [11]:
# One {column: value} dict per row (keys are column names, values that row's entries).
# TripType is the prediction target, so it is left out of the feature dicts; keeping
# it would hand the label to the model as an input.
# to_dict('records') replaces the iterrows()/iteritems() loop: Series.iteritems was
# removed in pandas 2.0, and building the dicts row by row in Python is much slower.
train_as_dicts = train_data.drop('TripType', axis=1).to_dict('records')
train_features = DictVectorizer(sparse=False).fit_transform(train_as_dicts)

In [12]:
import pandas as pd
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing

# Toy end-to-end example: encode labels, vectorize features, fit a decision tree and
# map its predictions back to the original label strings.
train = pd.DataFrame({'a' : ['a', 'b', 'a'], 'd' : ['e', 'x', 'f'],
                  'b' : [0, 1, 1], 'c' : ['b', 'c', 'b']})

# encode labels
# pop() removes the label column from the features and returns it as a 1-D Series.
# LabelEncoder expects 1-D input and emits a column_or_1d warning when handed the
# one-column frame train[['c']].
labels = train.pop('c')
le = preprocessing.LabelEncoder()
labels_fea = le.fit_transform(labels)
# vectorize training data: one {column: value} dict per row
train_as_dicts = train.to_dict('records')
train_fea = DictVectorizer(sparse=False).fit_transform(train_as_dicts)
# use decision tree
dt = tree.DecisionTreeClassifier()
dt.fit(train_fea, labels_fea)
# transform result: the predicted integer codes go straight back through the encoder
# (the former cast to an unsigned dtype was not needed)
predictions = le.inverse_transform(dt.predict(train_fea))
predictions_as_dataframe = train.join(pd.DataFrame({"Prediction": predictions}))
# parenthesised print works under both Python 2 and Python 3
print(predictions_as_dataframe)

   a  b  d Prediction
0  a  0  e          b
1  b  1  x          c
2  a  1  f          b


  y = column_or_1d(y, warn=True)


In [9]:
test_data.head(3)

Unnamed: 0,VisitNumber,Upc,ScanCount,FinelineNumber,DepartmentDescription,DepartmentDescription=1-HR PHOTO,DepartmentDescription=ACCESSORIES,DepartmentDescription=AUTOMOTIVE,DepartmentDescription=BAKERY,DepartmentDescription=BATH AND SHOWER,...,DepartmentDescription=SWIMWEAR/OUTERWEAR,DepartmentDescription=TOYS,DepartmentDescription=WIRELESS,Weekday=Friday,Weekday=Monday,Weekday=Saturday,Weekday=Sunday,Weekday=Thursday,Weekday=Tuesday,Weekday=Wednesday
0,1,72503389714,1,3002,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,1707710732,1,1526,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,89470001026,1,1431,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [13]:
# Show the TripType entries that are NaN; an empty Series means no label is missing.
train_data.loc[np.isnan(train_data['TripType']), 'TripType']

Series([], Name: TripType, dtype: int64)

In [4]:
import pandas as pd
import numpy as np

from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    """Impute missing values in a DataFrame, column by column.

    Columns of dtype object are imputed with the most frequent value
    in column.

    Columns of other types are imputed with mean of column.

    The fill values are learned in fit() and stored in ``self.fill``, a
    Series indexed by column name.
    """

    def __init__(self):
        # Nothing to configure; the fill values are learned in fit().
        pass

    @staticmethod
    def _fill_value(column):
        """Return the value used to fill the gaps of a single column."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # value_counts() leaves NaN out, so a column with no observed value has
            # no most-frequent entry; NaN keeps such a column unchanged instead of
            # failing with an IndexError on counts.index[0].
            return counts.index[0] if len(counts) else np.nan
        return column.mean()

    def fit(self, X, y=None):
        """Learn one fill value per column of X. y is ignored. Returns self."""
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        """Return a copy of X whose NaNs are replaced by the learned per-column values."""
        return X.fillna(self.fill)

# Worked example: three complete rows followed by one row that is missing everywhere.
data = [['a', 1, 2],
        ['b', 1, 1],
        ['b', 2, 2],
        [np.nan] * 3]

X = pd.DataFrame(data)
xt = DataFrameImputer().fit_transform(X)

# Print the frame before and after imputation for comparison.
print('before...')
print(X)
print('after...')
print(xt)

before...
     0   1   2
0    a   1   2
1    b   1   1
2    b   2   2
3  NaN NaN NaN
after...
   0         1         2
0  a  1.000000  2.000000
1  b  1.000000  1.000000
2  b  2.000000  2.000000
3  b  1.333333  1.666667


In [5]:
# Fill the gaps in train_data with values learned from train_data itself: the most
# frequent value for object columns, the column mean for every other column.
# NOTE(review): this covers every column, including identifier-like ones (Upc,
# FinelineNumber) and the TripType label -- TODO confirm a mean is a sensible fill there.
train_data = DataFrameImputer().fit_transform(train_data)

In [20]:
from sklearn import tree

# The label is TripType; it is read here so this cell does not depend on a later
# cell having been run first.
target = train_data.TripType.values

dt = tree.DecisionTreeClassifier()
dt.fit(train_features, target)
# The tree is fitted on the raw TripType values, so its predictions already are
# TripType labels. No LabelEncoder round-trip is needed; the `le` in scope was
# fitted on the toy example's labels, not on TripType.
predictions = dt.predict(train_features)
# Attach the predictions to the real training frame (not the three-row toy `train`
# frame), aligned on train_data's own index.
predictions_as_dataframe = train_data.join(
    pd.DataFrame({"Prediction": predictions}, index=train_data.index))
# parenthesised print works under both Python 2 and Python 3
print(predictions_as_dataframe)

In [13]:
# Label vector: the TripType column as a plain NumPy array.
target = train_data['TripType'].values

In [16]:
train_data.groupby('TripType').count()

Unnamed: 0_level_0,VisitNumber,Upc,ScanCount,FinelineNumber,DepartmentDescription,DepartmentDescription=1-HR PHOTO,DepartmentDescription=ACCESSORIES,DepartmentDescription=AUTOMOTIVE,DepartmentDescription=BAKERY,DepartmentDescription=BATH AND SHOWER,...,DepartmentDescription=SWIMWEAR/OUTERWEAR,DepartmentDescription=TOYS,DepartmentDescription=WIRELESS,Weekday=Friday,Weekday=Monday,Weekday=Saturday,Weekday=Sunday,Weekday=Thursday,Weekday=Tuesday,Weekday=Wednesday
TripType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,6827,6822,6827,6822,6822,6827,6827,6827,6827,6827,...,6827,6827,6827,6827,6827,6827,6827,6827,6827,6827
4,901,896,901,896,900,901,901,901,901,901,...,901,901,901,901,901,901,901,901,901,901
5,13836,11253,13836,11253,13823,13836,13836,13836,13836,13836,...,13836,13836,13836,13836,13836,13836,13836,13836,13836,13836
6,3405,3402,3405,3402,3402,3405,3405,3405,3405,3405,...,3405,3405,3405,3405,3405,3405,3405,3405,3405,3405
7,23199,23168,23199,23168,23170,23199,23199,23199,23199,23199,...,23199,23199,23199,23199,23199,23199,23199,23199,23199,23199
8,22844,22766,22844,22766,22776,22844,22844,22844,22844,22844,...,22844,22844,22844,22844,22844,22844,22844,22844,22844,22844
9,16820,16738,16820,16738,16741,16820,16820,16820,16820,16820,...,16820,16820,16820,16820,16820,16820,16820,16820,16820,16820
12,2108,2104,2108,2104,2104,2108,2108,2108,2108,2108,...,2108,2108,2108,2108,2108,2108,2108,2108,2108,2108
14,35,35,35,35,35,35,35,35,35,35,...,35,35,35,35,35,35,35,35,35,35
15,7147,7126,7147,7126,7126,7147,7147,7147,7147,7147,...,7147,7147,7147,7147,7147,7147,7147,7147,7147,7147
