Amazon Customer Data

In [3]:
from zipfile import ZipFile
import pandas as pd

In [4]:
with ZipFile('amazon-employee-access-challenge.zip') as zip:
    zip.extractall()

In [5]:
df = pd.read_csv('train.csv')

In [6]:
df

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325
...,...,...,...,...,...,...,...,...,...,...
32764,1,23497,16971,117961,118300,119993,118321,240983,290919,118322
32765,1,25139,311198,91261,118026,122392,121143,173805,249618,121145
32766,1,34924,28805,117961,118327,120299,124922,152038,118612,124924
32767,1,80574,55643,118256,118257,117945,280788,280788,292795,119082


In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
cols = df.drop('ACTION',axis=1).columns

In [9]:
encoders = [LabelEncoder() for i in cols]

In [10]:
def label_encoder(n):
    encoded_data = encoders[n].fit_transform(df[cols[n]])
    return encoded_data

In [11]:
for i in range(len(cols)):
    df[cols[i]] = label_encoder(i)

In [12]:
df['ACTION'].value_counts()

ACTION
1    30872
0     1897
Name: count, dtype: int64

In [13]:
df.dtypes

ACTION              int64
RESOURCE            int64
MGR_ID              int64
ROLE_ROLLUP_1       int64
ROLE_ROLLUP_2       int64
ROLE_DEPTNAME       int64
ROLE_TITLE          int64
ROLE_FAMILY_DESC    int64
ROLE_FAMILY         int64
ROLE_CODE           int64
dtype: object

In [14]:
def frequency_encoder(col):
    frequency = df[col].value_counts().reset_index()
    return pd.merge(df[[col]],frequency,how='left',on=col).iloc[:,-1].values

In [15]:
df['ROLE_ROLLUP_2'].value_counts()

ROLE_ROLLUP_2
64     4424
68     3945
66     2641
58     2547
71     1796
       ... 
164       1
175       1
151       1
161       1
166       1
Name: count, Length: 177, dtype: int64

In [16]:
for col in cols:
    df[col+'_counts'] = frequency_encoder(col)

In [17]:
df

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE,RESOURCE_counts,MGR_ID_counts,ROLE_ROLLUP_1_counts,ROLE_ROLLUP_2_counts,ROLE_DEPTNAME_counts,ROLE_TITLE_counts,ROLE_FAMILY_DESC_counts,ROLE_FAMILY_counts,ROLE_CODE_counts
0,1,3050,3862,21,64,307,4,7,64,4,3,55,21407,4424,72,3583,6896,10980,3583
1,1,644,156,21,68,299,34,59,66,38,30,10,21407,3945,159,81,12,1287,81
2,1,2706,1533,50,57,14,0,2083,3,0,2,3,184,184,546,1256,33,2636,1256
3,1,2615,867,21,68,181,22,1901,64,23,1,62,21407,3945,190,4649,1244,10980,4649
4,1,3616,941,15,13,157,70,325,4,77,8,9,276,138,45,75,19,362,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32764,1,1060,1823,21,64,181,22,1901,64,23,51,29,21407,4424,190,4649,1244,10980,4649
32765,1,1162,4231,3,34,284,133,1449,61,142,2,9,721,721,54,24,5,224,24
32766,1,2390,2539,21,66,200,197,1144,20,207,161,5,21407,2641,42,3,2,78,3
32767,1,6082,3252,51,60,20,337,2146,65,71,7,16,275,257,659,394,244,1318,394


In [18]:
df.columns

Index(['ACTION', 'RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2',
       'ROLE_DEPTNAME', 'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY',
       'ROLE_CODE', 'RESOURCE_counts', 'MGR_ID_counts', 'ROLE_ROLLUP_1_counts',
       'ROLE_ROLLUP_2_counts', 'ROLE_DEPTNAME_counts', 'ROLE_TITLE_counts',
       'ROLE_FAMILY_DESC_counts', 'ROLE_FAMILY_counts', 'ROLE_CODE_counts'],
      dtype='object')

In [19]:
df.dtypes

ACTION                     int64
RESOURCE                   int64
MGR_ID                     int64
ROLE_ROLLUP_1              int64
ROLE_ROLLUP_2              int64
ROLE_DEPTNAME              int64
ROLE_TITLE                 int64
ROLE_FAMILY_DESC           int64
ROLE_FAMILY                int64
ROLE_CODE                  int64
RESOURCE_counts            int64
MGR_ID_counts              int64
ROLE_ROLLUP_1_counts       int64
ROLE_ROLLUP_2_counts       int64
ROLE_DEPTNAME_counts       int64
ROLE_TITLE_counts          int64
ROLE_FAMILY_DESC_counts    int64
ROLE_FAMILY_counts         int64
ROLE_CODE_counts           int64
dtype: object

In [20]:
from sklearn.model_selection import train_test_split

In [24]:
X = df.drop('ACTION',axis=1)
y = df['ACTION']

In [25]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [21]:
import tensorflow as tf

In [26]:
input_len = len(X.columns)

In [69]:
model = tf.keras.Sequential([
    tf.keras.Input(shape=(input_len,)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(10,activation='relu'),
    tf.keras.layers.Dense(1,activation='softmax')
    ])
model.compile(loss='categorical_crossentropy',optimizer='sgd',metrics=['accuracy'])

In [70]:
model.summary()

In [71]:
model.fit(X_train,y_train)

  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m820/820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 888us/step - accuracy: 0.1721 - loss: nan


<keras.src.callbacks.history.History at 0x2601ba03750>