In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import roc_auc_score

In [2]:
train = pd.read_csv('train.csv')
train.head(5)

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,21OCT17:00:00:00,5023.0,POS,0,0.0
1,01/10/2017,0,6011,,810,12OCT17:12:24:07,20000.0,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,05DEC17:00:00:00,767.0,POS,0,0.0
3,01/10/2017,0,5411,,810,21OCT17:00:00:00,2031.0,POS,0,0.0
4,01/10/2017,0,6012,,810,24OCT17:13:14:24,36562.0,C2C_OUT,0,0.0


In [3]:
train.drop('target_sum', axis=1, inplace=True)

In [4]:
train['trx_category'].value_counts()

POS               416425
DEPOSIT            21216
WD_ATM_ROS         19104
WD_ATM_PARTNER      9948
C2C_IN              7306
WD_ATM_OTHER        7140
C2C_OUT             5456
BACK_TRX            2687
CAT                 1197
CASH_ADV              34
Name: trx_category, dtype: int64

In [5]:
train['channel_type'].value_counts()

type1    299247
type2    175013
type5      5587
type4      4476
type3      3280
Name: channel_type, dtype: int64

In [41]:
train['MCC'].value_counts()

5411    121640
6011     54382
5814     41351
5812     30027
5499     27237
5541     19816
5912     18728
5999     13073
6012     10056
5921      8578
5331      7641
4121      6266
5211      6262
4829      6205
5691      5161
5261      4803
4111      4362
5977      4300
5200      3460
5732      3271
5311      3258
5533      2946
5651      2915
4812      2857
5641      2710
8099      2674
5722      2667
5661      2601
8999      2599
5941      2597
         ...  
3064         1
3678         1
8675         1
3548         1
3607         1
3730         1
3029         1
3535         1
3722         1
7333         1
7339         1
3387         1
3715         1
3508         1
3640         1
3513         1
3515         1
3191         1
5937         1
3634         1
3520         1
7217         1
3659         1
3628         1
3778         1
5960         1
3625         1
7211         1
7631         1
3703         1
Name: MCC, Length: 344, dtype: int64

In [6]:
def get_amount_sum(r):
    return r.amount.sum()

In [33]:
data_full = pd.DataFrame({
        'amount_sum' : train.groupby( [ 'cl_id', 'target_flag', 'channel_type' ] ).apply(get_amount_sum),
    }).reset_index()
data_dm = pd.get_dummies(data_full, columns=['channel_type'])

In [34]:
data_dm.head(10)

Unnamed: 0,cl_id,target_flag,amount_sum,channel_type_type1,channel_type_type2,channel_type_type3,channel_type_type4,channel_type_type5
0,50,1,267108.91,0,0,0,0,1
1,52,0,514679.46,0,0,0,0,1
2,54,0,175289.16,0,0,0,0,1
3,55,0,352452.82,0,0,0,0,1
4,56,1,1106460.77,0,0,0,0,1
5,58,1,2620202.39,0,0,0,0,1
6,59,0,248823.59,0,0,0,0,1
7,61,1,387793.86,0,0,0,0,1
8,63,0,12419.0,0,0,0,0,1
9,68,0,495.0,0,0,0,0,1


In [35]:
from sklearn.model_selection import ShuffleSplit
splitter = ShuffleSplit(n_splits=1, test_size=0.2, random_state=777)

for train_index, test_index in splitter.split(data_dm, data_dm.target_flag):
    X_train = data_dm.iloc[train_index]
    X_test = data_dm.iloc[test_index]
    
    y_train = data_dm['target_flag'].iloc[train_index]
    y_test = data_dm['target_flag'].iloc[test_index]

In [36]:
X_train.count()

cl_id                 3980
target_flag           3980
amount_sum            3980
channel_type_type1    3980
channel_type_type2    3980
channel_type_type3    3980
channel_type_type4    3980
channel_type_type5    3980
dtype: int64

In [97]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

params = [ {'max_depth': list( range(1, 20) )} ]
gs = GridSearchCV( RandomForestClassifier(), param_grid = params, scoring = 'accuracy', return_train_score = False )
gs.fit( X_train, y_train )

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='accuracy', verbose=0)

In [98]:
gs.best_params_

{'max_depth': 3}

In [99]:
clf_rf = RandomForestClassifier(n_estimators=10, max_depth=3, min_samples_leaf=10, max_features=0.8, n_jobs=-1)

clf_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features=0.8, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [100]:
predictions_clf = clf_rf.predict_proba( X_test )

In [101]:
roc_auc_score( y_test, predictions_clf[:, 1] )

1.0

In [82]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [85]:
predictions = lr.predict_proba( X_test )

In [87]:
roc_auc_score( y_test, predictions[:, 1] )

0.7132205544418911