# Market basket analysis of Walmart data to classify shopping trips.

Data fields
TripType - a categorical id representing the type of shopping trip the customer made. This is the ground truth that you are predicting. TripType_999 is an "other" category.

VisitNumber - an id corresponding to a single trip by a single customer

Weekday - the weekday of the trip

Upc - the UPC number of the product purchased

ScanCount - the number of the given item that was purchased. A negative value indicates a product return.

DepartmentDescription - a high-level description of the item's department

FinelineNumber - a more refined category for each of the products, created by Walmart

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

#gridsearch for plausible hyperparameters values

In [3]:
# Read data into dataframe
df = pd.read_csv("C:\\kaggle\\WalmartRecruiting\\train.csv")
df.head(10)

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113150000.0,-1,FINANCIAL SERVICES,1000.0
1,30,7,Friday,60538820000.0,1,SHOES,8931.0
2,30,7,Friday,7410811000.0,1,PERSONAL CARE,4504.0
3,26,8,Friday,2238404000.0,2,PAINT AND ACCESSORIES,3565.0
4,26,8,Friday,2006614000.0,2,PAINT AND ACCESSORIES,1017.0
5,26,8,Friday,2006619000.0,2,PAINT AND ACCESSORIES,1017.0
6,26,8,Friday,2006614000.0,1,PAINT AND ACCESSORIES,1017.0
7,26,8,Friday,7004803000.0,1,PAINT AND ACCESSORIES,2802.0
8,26,8,Friday,2238495000.0,1,PAINT AND ACCESSORIES,4501.0
9,26,8,Friday,2238400000.0,-1,PAINT AND ACCESSORIES,3565.0


In [39]:
# Total data rows
len(df)

647054

In [40]:
# Count the frequency of values per category
#df['FinelineNumber'].value_counts()

In [41]:
# Get number of unique visits
print("Unique visits:")
print(df["VisitNumber"].nunique())

# Get total visits
print("Total Visits:")
print(df["VisitNumber"].count())

# Get count of null values in VisitNumber column
print("Null values in visitnumber column:")
print(df["VisitNumber"].isnull().values.ravel().sum())

# Count of unique product ids
print("Unique product ids:")
print(df["Upc"].nunique())

print("Trip Types:")
print(df["TripType"].unique())

print("Unique fine line numbers:")
print(df["FinelineNumber"].nunique())

# Unique department categories
print("Unique departent categories:")
print(df["DepartmentDescription"].nunique())

Unique visits:
95674
Total Visits:
647054
Null values in visitnumber column:
0
Unique product ids:
97714
Trip Types:
[999  30  26   8  35  41  21   6  42   7   9  39  25  38  15  36  20  37
  32  40   5   3   4  24  33  43  31  27  34  18  29  44  19  23  22  28
  14  12]
Unique fine line numbers:
5195
Unique departent categories:
68


In [42]:
# Get all rows where Weekday column is missing value
df.loc[df["Weekday"].isnull()]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


In [43]:
#Check if there are any NaNs
df.isnull().values.any()

True

In [44]:
# Count number of NaNs
df.isnull().sum().sum()

9619

In [45]:
# Another way to get the count of null/NaN values --->  df["Upc"].isnull().sum()
print(df["Upc"].isnull().values.ravel().sum())
print(df["FinelineNumber"].isnull().values.ravel().sum())
print(df["DepartmentDescription"].isnull().values.ravel().sum())

4129
4129
1361


In [46]:
# Find out which colums have NaN values
df.columns[df.isnull().any()].tolist()

['Upc', 'DepartmentDescription', 'FinelineNumber']

In [47]:
# Drop Upc column and fill NaN in FinelineNumber and DepartmentDescription
df.drop(["Upc"],axis=1, inplace=True)


df["FinelineNumber"].fillna(-9999 ,inplace = True)
df["DepartmentDescription"].fillna('unspecified' ,inplace = True)

In [48]:
# Set up a factorplot
#g = sns.factorplot("ScanCount", "VisitNumber", "FinelineNumber", data=df, kind="bar", palette="muted", legend=False)
                   
# Show plot
#plt.show()

In [49]:
# Confirm if missing values are handled
df.columns[df.isnull().any()].tolist()

[]

In [50]:
# Check if returned items are more than one
#df.loc[df["ScanCount"] < -1]
df[(df["ScanCount"] < -1)].count()

TripType                 1064
VisitNumber              1064
Weekday                  1064
ScanCount                1064
DepartmentDescription    1064
FinelineNumber           1064
dtype: int64

In [51]:
df.dtypes

TripType                   int64
VisitNumber                int64
Weekday                   object
ScanCount                  int64
DepartmentDescription     object
FinelineNumber           float64
dtype: object

In [52]:
df.head()

Unnamed: 0,TripType,VisitNumber,Weekday,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,-1,FINANCIAL SERVICES,1000.0
1,30,7,Friday,1,SHOES,8931.0
2,30,7,Friday,1,PERSONAL CARE,4504.0
3,26,8,Friday,2,PAINT AND ACCESSORIES,3565.0
4,26,8,Friday,2,PAINT AND ACCESSORIES,1017.0


In [53]:
# Use pd.concat to join the new columns with your original dataframe
# drop_first = True to avoid the dummy variable trap
df = pd.concat([df,pd.get_dummies(df["Weekday"], prefix="Weekday", drop_first=True)],axis=1)
df = pd.concat([df,pd.get_dummies(df["DepartmentDescription"], prefix="DepartmentDescription", drop_first=True)],axis=1)

# Drop the original 'Weekday' and 'DepartmentDescription' columns
df.drop(["Weekday"],axis=1, inplace=True)
df.drop(["DepartmentDescription"],axis=1, inplace=True)

In [54]:
# Create a label (category) encoder object
le = LabelEncoder()

# Fit the encoder to the pandas column
le.fit(df['FinelineNumber'])
df['FinelineNumber'] = le.fit_transform(df['FinelineNumber'].astype(float))

#list(le.classes_)

In [55]:
df.head()

Unnamed: 0,TripType,VisitNumber,ScanCount,FinelineNumber,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday,...,DepartmentDescription_SEASONAL,DepartmentDescription_SERVICE DELI,DepartmentDescription_SHEER HOSIERY,DepartmentDescription_SHOES,DepartmentDescription_SLEEPWEAR/FOUNDATIONS,DepartmentDescription_SPORTING GOODS,DepartmentDescription_SWIMWEAR/OUTERWEAR,DepartmentDescription_TOYS,DepartmentDescription_WIRELESS,DepartmentDescription_unspecified
0,999,5,-1,747,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,30,7,1,4886,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,30,7,1,2853,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,26,8,2,2323,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,26,8,2,764,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
# Get a boolean mask back with True for positions containing NaNs
#np.isnan(df)

# Get back a tuple with i, j coordinates of NaNs
#np.where(np.isnan(df))

# Replace nan with zero and inf with finite numbers
#np.nan_to_num(df)

In [57]:
# Split X and y
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

In [58]:
# 2 ways to do feature selection

# 1 ##########
'''from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

X_new = SelectKBest(chi2, k=2).fit_transform(X, y)'''

#################################################################################
# Feature significance trial with from ExtraTreesClassifier and SelectFromModel
#################################################################################

# 2 ##########

'''
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

clf = ExtraTreesClassifier()
clf = clf.fit(X, y)

clf.feature_importances_'''

# Select top N significant features
'''model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
X_new.shape'''

'model = SelectFromModel(clf, prefit=True)\nX_new = model.transform(X)\nX_new.shape'

In [59]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [60]:
# Feature Scaling
'''sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)'''

'sc = StandardScaler()\nX_train = sc.fit_transform(X_train)\nX_test = sc.transform(X_test)'

In [61]:
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 25, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [62]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [63]:
# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[1060,    0,    3, ...,    3,    7,   73],
       [   1,   47,   20, ...,    2,    0,    2],
       [   4,    6, 1310, ...,    6,   36,   42],
       ..., 
       [   5,    2,   33, ...,  218,   39,   12],
       [   6,    1,   57, ...,   33, 1061,   44],
       [  97,    3,   30, ...,   11,   30, 2251]], dtype=int64)

In [64]:
#cm = confusion_matrix(y_test, classifier.predict(X_test))
#sns.heatmap(cm,annot=True,fmt="d")
#plt.show()

In [65]:
print("y_pred.size:")
print(y_pred.size)

print("y_pred:")
print(y_pred)

print("y_test:")
print(y_test)

y_diff = (y_pred - y_test)

np.count_nonzero(y_diff==0)

y_pred.size:
129411
y_pred:
[32 39 31 ..., 24 40 40]
y_test:
[32 39 38 ..., 24 40 40]


50840

In [66]:
#count elements with 0 values in numpy array np.count_nonzero(y_diff==0)
accurate_predictions =  np.count_nonzero(y_diff==0) / len(y_pred)
accurate_predictions

0.39285686688148613

#Looking at the confusion matrix the model is classifying trips correctly 40 % of the time.