# Exploratory Data Analysis

In [2]:
import pandas as pd
import numpy as np

# Reading both CSV files takes several seconds.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data_transactions/train.csv', 'data_transactions/test.csv')
)

In [3]:
# Uncomment the lines below to look at the first few rows of the training and test sets.
#print(train.head())
#print('----------------------------------------------------------------------')
#print(test.head())

In [4]:
# Report the dimensions of both frames, then the dtype of every column in each.
separator = '---------------------------------'
print('train Shape:', train.shape, 'test Shape:', test.shape)
for heading, frame in (('train FEATURE TYPES\n\n', train), ('test FEATURE TYPES\n\n', test)):
    print(separator)
    print(heading, frame.dtypes)

train Shape: (348978, 51) test Shape: (523466, 50)
---------------------------------
train FEATURE TYPES

 transaction_id     object
num_var_1         float64
num_var_2         float64
num_var_3         float64
num_var_4         float64
num_var_5         float64
num_var_6         float64
num_var_7         float64
cat_var_1          object
cat_var_2          object
cat_var_3          object
cat_var_4          object
cat_var_5          object
cat_var_6          object
cat_var_7          object
cat_var_8          object
cat_var_9          object
cat_var_10         object
cat_var_11         object
cat_var_12         object
cat_var_13         object
cat_var_14         object
cat_var_15         object
cat_var_16         object
cat_var_17         object
cat_var_18         object
cat_var_19          int64
cat_var_20          int64
cat_var_21          int64
cat_var_22          int64
cat_var_23          int64
cat_var_24          int64
cat_var_25          int64
cat_var_26          int64
cat_var_2

In [5]:
# Count the missing entries per column in each data set.
train_missing = train.isnull().sum()
test_missing = test.isnull().sum()

print("train MISSING VALUES\n\n", train_missing)
print('---------------------------------')
print("test MISSING VALUES \n\n", test_missing)

train MISSING VALUES

 transaction_id         0
num_var_1              0
num_var_2              0
num_var_3              0
num_var_4              0
num_var_5              0
num_var_6              0
num_var_7              0
cat_var_1          15782
cat_var_2              0
cat_var_3          43853
cat_var_4              0
cat_var_5              0
cat_var_6              0
cat_var_7              0
cat_var_8         109738
cat_var_9              0
cat_var_10             0
cat_var_11             0
cat_var_12             0
cat_var_13             0
cat_var_14             0
cat_var_15             0
cat_var_16             0
cat_var_17             0
cat_var_18             0
cat_var_19             0
cat_var_20             0
cat_var_21             0
cat_var_22             0
cat_var_23             0
cat_var_24             0
cat_var_25             0
cat_var_26             0
cat_var_27             0
cat_var_28             0
cat_var_29             0
cat_var_30             0
cat_var_31             0
ca

# Discussion

### Sizes

1. training set - (348978, 51),
2. test set - (523466, 50).  
    

### Features

1. **transaction_id** feature in both training and test data is a unique identifier and we will ignore it at the model fitting stage. It is of type **object**.
2. **Numerical variables**. **7** features, **num_var_1** through **num_var_7**, all are of type **float**. 
3. **Categorical variables**. **42** features ( **cat_var_1** through **cat_var_42**). 
   1. (**cat_var_1** through **cat_var_18**) - **object**.  
   2. (**cat_var_19** through **cat_var_42**) - **int**.  
4. **target** is the label to be predicted, which is why it is absent from the test set. In the training set it is of type **int**. 

### Missing Values

Numerical variables don't have missing values.

1. *Training set*. cat_var_1 - **15782**, cat_var_3 - **43853**, cat_var_8 - **109738**.  
2. *Test set*. cat_var_1 - **18692**, cat_var_3 - **53362**, cat_var_6 - **21943**, cat_var_8 - **8138**. 

# Data Processing

First, copy training and test data sets into new data frames. Then keep **transaction_id** and **target** features in separate arrays and drop those columns.

In [6]:
# Work on copies so the frames loaded from disk stay untouched.
train_new = train.copy()
test_new = test.copy()

# Set the label and the row identifiers aside before removing them
# from the feature frames.
y_train = train_new['target']
id_train = train_new['transaction_id']
sub_ids = test_new['transaction_id']

train_new = train_new.drop(['transaction_id', 'target'], axis = 1)
test_new = test_new.drop('transaction_id', axis = 1)

## Numerical Features

Normalize numerical features with feature preprocessing using minmax scaling.

In [7]:
from sklearn.preprocessing import MinMaxScaler

# The first seven columns are the numerical features. The scaler learns its
# min/max from the training rows only and the same mapping is applied to test.
numeric_train = train_new.iloc[:, :7]
scaler = MinMaxScaler().fit(numeric_train)
train_new.iloc[:, :7] = scaler.transform(numeric_train)
test_new.iloc[:, :7] = scaler.transform(test_new.iloc[:, :7])

In [8]:
# Remove values that occur very rarely.
# Left disabled for now: checked that no value occurs fewer than 10 times.

#threshold = 10 # Anything that occurs less than this will be removed.
#for col in train_new.columns[:7]:
#    value_counts = train_new[col].value_counts()
#    to_remove = value_counts[value_counts <= threshold].index
#    train_new[col].replace(to_remove, np.NaN, inplace=True)
    
# Replace nan values with the mean
#for col in train_new.columns[:7]:
#    train_new[col].fillna(train_new[col].mean(), inplace = True)

## Categorical Features (object)

In [9]:
# Summarise the object-typed categorical columns (positions 7-24) of the training frame.
# Optional cast of the object columns to the 'category' dtype, left disabled:
#train_new[train_new.select_dtypes(['object']).columns] = train_new.select_dtypes(['object']).apply(lambda x: x.astype('category'))
train_new[train_new.columns[7:25]].describe()

Unnamed: 0,cat_var_1,cat_var_2,cat_var_3,cat_var_4,cat_var_5,cat_var_6,cat_var_7,cat_var_8,cat_var_9,cat_var_10,cat_var_11,cat_var_12,cat_var_13,cat_var_14,cat_var_15,cat_var_16,cat_var_17,cat_var_18
count,333196,348978,305125,348978,348978,348978,348978,239240,348978,348978,348978,348978,348978,348978,348978,348978,348978,348978
unique,534,60,616,2,2,518,20,462,5,23,5,5,52,12,2,2,2,2
top,gf,ce,qt,tn,tn,zs,ep,dn,ep,ye,ce,tn,hr,db,ep,tn,tn,ep
freq,132170,233079,34912,232909,174898,132059,348539,132170,81529,34614,82301,142790,132770,248013,236413,320148,254043,199168


In [10]:
# Summarise the object-typed categorical columns (positions 7-24) of the test frame.
# Optional cast of the object columns to the 'category' dtype, left disabled:
#test_new[test_new.select_dtypes(['object']).columns] = test_new.select_dtypes(['object']).apply(lambda x: x.astype('category'))
test_new[test_new.columns[7:25]].describe()

Unnamed: 0,cat_var_1,cat_var_2,cat_var_3,cat_var_4,cat_var_5,cat_var_6,cat_var_7,cat_var_8,cat_var_9,cat_var_10,cat_var_11,cat_var_12,cat_var_13,cat_var_14,cat_var_15,cat_var_16,cat_var_17,cat_var_18
count,504774,523466,470104,523466,523466,501523,523466,515328,523466,523466,523466,523466,523466,523466,523466,523466,523466,523466
unique,534,62,617,2,2,515,22,463,5,23,5,5,52,12,2,2,2,2
top,gf,ce,qt,tn,ep,zs,ep,dn,ep,ye,ce,tn,hr,db,ep,tn,tn,ep
freq,198593,348977,52325,350336,262085,198423,522824,198593,122473,52252,123097,214360,199583,372861,354042,480857,380330,298498


## Categorical Features (int)

In [11]:
# Summarise the remaining (integer-typed) categorical columns, position 25 onwards, of the training frame.
# Optional cast of the int columns to the 'category' dtype, left disabled:
#train_new[train_new.select_dtypes(['int']).columns] = train_new.select_dtypes(['int']).apply(lambda x: x.astype('category'))
train_new[train_new.columns[25:]].describe()

Unnamed: 0,cat_var_19,cat_var_20,cat_var_21,cat_var_22,cat_var_23,cat_var_24,cat_var_25,cat_var_26,cat_var_27,cat_var_28,...,cat_var_33,cat_var_34,cat_var_35,cat_var_36,cat_var_37,cat_var_38,cat_var_39,cat_var_40,cat_var_41,cat_var_42
count,348978.0,348978.0,348978.0,348978.0,348978.0,348978.0,348978.0,348978.0,348978.0,348978.0,...,348978.0,348978.0,348978.0,348978.0,348978.0,348978.0,348978.0,348978.0,348978.0,348978.0
mean,0.520279,0.479721,0.672257,0.321797,0.005946,0.999097,0.000123,0.000553,3e-06,2.6e-05,...,6e-06,3e-06,0.0,0.0,0.0,0.0,9e-06,0.0,3e-06,0.0
std,0.499589,0.499589,0.469391,0.467166,0.07688,0.03003,0.0111,0.02351,0.001693,0.005078,...,0.002394,0.001693,0.0,0.0,0.0,0.0,0.002932,0.0,0.001693,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [12]:
# Summarise the remaining (integer-typed) categorical columns, position 25 onwards, of the test frame.
# Optional cast of the int columns to the 'category' dtype, left disabled:
#test_new[test_new.select_dtypes(['int']).columns] = test_new.select_dtypes(['int']).apply(lambda x: x.astype('category'))
test_new[test_new.columns[25:]].describe()

Unnamed: 0,cat_var_19,cat_var_20,cat_var_21,cat_var_22,cat_var_23,cat_var_24,cat_var_25,cat_var_26,cat_var_27,cat_var_28,...,cat_var_33,cat_var_34,cat_var_35,cat_var_36,cat_var_37,cat_var_38,cat_var_39,cat_var_40,cat_var_41,cat_var_42
count,523466.0,523466.0,523466.0,523466.0,523466.0,523466.0,523466.0,523466.0,523466.0,523466.0,...,523466.0,523466.0,523466.0,523466.0,523466.0,523466.0,523466.0,523466.0,523466.0,523466.0
mean,0.522489,0.477511,0.671914,0.322013,0.006073,0.9991,0.000101,0.000585,6e-06,2.3e-05,...,8e-06,6e-06,6e-06,2e-06,2e-06,0.0,6e-06,4e-06,0.0,0.0
std,0.499494,0.499494,0.469517,0.467248,0.077692,0.029983,0.010062,0.024171,0.002394,0.004788,...,0.002764,0.002394,0.002394,0.001382,0.001382,0.0,0.002394,0.001955,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0


In [13]:
# The first line below (commented out as it takes some time to run) checks
# if there are any duplicate columns in the data set and if so, drops them.
# In this case, there are no such columns (checked by the second line).

#train_duplicate = train_new.T.drop_duplicates().T
#len(train_duplicate.columns) == len(train_new.columns)

## Remove Features With Only One Distinct Value

In [14]:
cat_vars = [x for x in train_new.columns if 'cat_' in x]

# A feature carries no signal if it takes a single value in the training set
# or in the test set. Missing entries are counted as a level of their own
# (dropna=False): the encoding step turns NaN into a separate 'NaN' category,
# so a column holding one value plus NaN is not actually constant.
cat_to_drop_train = [x for x in cat_vars if train_new[x].nunique(dropna=False) == 1]
cat_to_drop_test = [x for x in cat_vars if test_new[x].nunique(dropna=False) == 1]

# Drop the union from both frames so they keep identical columns.
# sorted() only makes the drop order deterministic.
cat_to_drop = sorted(set(cat_to_drop_train + cat_to_drop_test))
train_new = train_new.drop(cat_to_drop, axis = 1)
test_new = test_new.drop(cat_to_drop, axis = 1)

print(train_new.shape)
print(test_new.shape)

(348978, 41)
(523466, 41)


## Encode Categorical Features

In [15]:
from sklearn.preprocessing import LabelEncoder

cat_vars = [col for col in train_new.columns if 'cat_' in col]

for col in cat_vars:
    # Missing entries become one more category level.
    train_new[col] = train_new[col].fillna('NaN')
    test_new[col] = test_new[col].fillna('NaN')
    # Fit on the levels seen in either set so that transform never meets
    # a label it has not been fitted on.
    all_levels = pd.concat([train_new[col], test_new[col]]).unique()
    encoder = LabelEncoder().fit(all_levels)
    train_new[col] = encoder.transform(train_new[col])
    test_new[col] = encoder.transform(test_new[col])

print(train_new.shape)
print(test_new.shape)

(348978, 41)
(523466, 41)


# Random Forest

In [16]:
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# n_estimators is pinned to 10, the value in effect when the recorded score was
# produced. Newer scikit-learn releases default to 100 trees, which would change
# the result of a fresh run.
forest_clf = RandomForestClassifier(n_estimators = 10, random_state = 7)

# Out-of-fold class probabilities from 3-fold cross-validation; every training
# row is scored by a model that was not fitted on it.
y_probas_forest = cross_val_predict(forest_clf, train_new, y_train, cv = 3, method = 'predict_proba')
# Keep the second predict_proba column as the score.
y_scores_forest = y_probas_forest[:, 1]

roc_auc_score(y_train, y_scores_forest)

0.71579479560958092

In [17]:
# Display the out-of-fold scores (second predict_proba column) used for the AUC.
y_scores_forest

array([ 0.01784977,  0.1       ,  0.04836848, ...,  0.07268586,
        0.05640239,  0.02659553])

# Predict on Test Set

In [18]:
# Fit on the whole training set.
# n_estimators is pinned to 10 so the fitted model matches the recorded run
# regardless of the installed scikit-learn version (newer releases default to 100).
forest_clf = RandomForestClassifier(n_estimators=10, random_state=7)
forest_clf.fit(train_new, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=7,
            verbose=0, warm_start=False)

In [19]:
# Score the test rows and keep the second predict_proba column as the prediction.
test_probas = forest_clf.predict_proba(test_new)
preds = test_probas[:, 1]
preds

array([ 0.01973466,  0.05091312,  0.23585393, ...,  0.00357143,
        0.32531595,  0.        ])

# Submission

In [20]:
from IPython.display import FileLink

# Two-column submission file: the test-set ids first, then the predicted score.
filename='sub_transactions.csv'
sub = pd.DataFrame(
    {'transaction_id': sub_ids, 'target': preds},
    columns=['transaction_id', 'target'],
)

sub.to_csv(filename, index=False)
FileLink(filename) # lb 0.72157