###  Imports

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

### Constants and Functions

In [4]:
# Root directory of the CSI dataset, given relative to the notebook's working directory.
# expanduser() only rewrites paths that begin with '~', so it leaves this relative path unchanged
# (assuming Path is pathlib.Path — it arrives via the star import above).
path = Path('data/csi_analyze/').expanduser()

In [9]:
# List the top-level contents of the dataset directory.
! ls {path}

bs_avg_kpi.csv  bs_chnn_kpi.csv  test  train


In [11]:
# List the files inside the train/ sub-directory.
! ls {path/'train'}

subs_bs_consumption_train.csv   subs_csi_train.csv
subs_bs_data_session_train.csv  subs_features_train.csv
subs_bs_voice_session_train.csv


In [10]:
def display_all(df):
    """Render `df` with pandas' row and column display limits temporarily raised to 1000.

    The limits apply only for the duration of the call; the previous pandas
    display options are restored when the context manager exits.
    """
    wide_view = pd.option_context("display.max_rows", 1000, "display.max_columns", 1000)
    with wide_view:
        display(df)

In [14]:
# Training table (SK_ID, CSI, CONTACT_DATE); the file is ';'-separated.
# NOTE(review): CONTACT_DATE is parsed as float64 (e.g. 13.05) but looks like a day.month date —
# confirm, and consider reading it as a string so it is not treated as a magnitude.
subs_csi_tr_df = pd.read_csv(path/'train/subs_csi_train.csv', sep=';')

In [22]:
# Test table (SK_ID, CONTACT_DATE; no CSI target); the file is ';'-separated.
# NOTE(review): CONTACT_DATE is parsed as float64 (e.g. 27.05) but looks like a day.month date — confirm.
subs_csi_te_df = pd.read_csv(path/'test/subs_csi_test.csv', sep=';')

### Quick first-pass EDA

In [15]:
# Peek at the first five rows of the training table.
subs_csi_tr_df.head()

Unnamed: 0,SK_ID,CSI,CONTACT_DATE
0,1973,0,13.05
1,987,1,2.05
2,351,0,2.05
3,81,0,4.05
4,4427,0,13.05


In [23]:
# Peek at the first five rows of the test table.
subs_csi_te_df.head()

Unnamed: 0,SK_ID,CONTACT_DATE
0,308,27.05
1,1789,26.05
2,3386,22.05
3,2429,9.05
4,693,29.05


In [16]:
# Column dtypes, non-null counts and memory footprint of the training table.
subs_csi_tr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3754 entries, 0 to 3753
Data columns (total 3 columns):
SK_ID           3754 non-null int64
CSI             3754 non-null int64
CONTACT_DATE    3754 non-null float64
dtypes: float64(1), int64(2)
memory usage: 88.1 KB


In [24]:
# Column dtypes, non-null counts and memory footprint of the test table.
subs_csi_te_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 948 entries, 0 to 947
Data columns (total 2 columns):
SK_ID           948 non-null int64
CONTACT_DATE    948 non-null float64
dtypes: float64(1), int64(1)
memory usage: 14.9 KB


In [17]:
# Distinct SK_ID values in train: 3754, equal to the row count, so no user appears twice.
subs_csi_tr_df['SK_ID'].nunique()

3754

In [25]:
# Distinct SK_ID values in test: 948, equal to the row count, so no user appears twice.
subs_csi_te_df['SK_ID'].nunique()

948

In [18]:
# ~16% unsatisfied customers (CSI is 0/1, so its mean is the share of rows with CSI == 1)
subs_csi_tr_df.CSI.mean()

0.1568993074054342

In [19]:
# Summary statistics (count, mean, std, quantiles) for the columns of the training table.
subs_csi_tr_df.describe()

Unnamed: 0,SK_ID,CSI,CONTACT_DATE
count,3754.0,3754.0,3754.0
mean,2354.045818,0.156899,14.958364
std,1363.57905,0.363754,8.857586
min,2.0,0.0,1.05
25%,1174.25,0.0,8.05
50%,2360.5,0.0,13.05
75%,3549.5,0.0,23.05
max,4701.0,1.0,31.05


In [20]:
# Number of distinct contact dates in train.
subs_csi_tr_df['CONTACT_DATE'].nunique()

31

In [21]:
# The distinct contact dates in train, in order of first appearance.
subs_csi_tr_df['CONTACT_DATE'].unique()

array([13.05,  2.05,  4.05,  9.05, 18.05, 31.05,  3.05,  5.05, 27.05, 23.05, 30.05, 22.05, 24.05, 28.05,
        6.05, 20.05, 11.05,  8.05, 25.05,  1.05, 19.05,  7.05, 29.05, 26.05, 10.05, 17.05, 12.05, 21.05,
       16.05, 14.05, 15.05])

In [26]:
# Number of distinct contact dates in test.
subs_csi_te_df['CONTACT_DATE'].nunique()

31

In [27]:
# The distinct contact dates in test, in order of first appearance.
subs_csi_te_df['CONTACT_DATE'].unique()

array([27.05, 26.05, 22.05,  9.05, 29.05, 24.05, 30.05, 28.05, 12.05,  8.05,  7.05, 23.05,  1.05, 18.05,
       10.05,  4.05, 14.05, 20.05,  2.05, 19.05, 17.05, 13.05, 25.05, 16.05, 21.05,  6.05,  5.05,  3.05,
       11.05, 15.05, 31.05])

In [28]:
# Dates that occur in only one of the two splits; an empty set means train and test
# cover exactly the same contact dates.
# symmetric_difference checks both directions — a plain difference() would only report
# dates missing from test and silently ignore dates that appear in test alone.
set(subs_csi_tr_df.CONTACT_DATE.unique()).symmetric_difference(subs_csi_te_df.CONTACT_DATE.unique())

set()

### Random validation

In [29]:
# Seed NumPy's global RNG so the np.random.permutation-based train/validation split is repeatable
# when the notebook is run top to bottom.
np.random.seed(111)

#### Validate 

In [32]:
# Total number of labelled rows available for training and validation.
full_tr_num = subs_csi_tr_df.shape[0]

In [35]:
# Hold out one seventh of the rows (floor division, ~14%) as the validation set.
val_num = full_tr_num // 7

In [43]:
# Random ordering of all row positions 0..full_tr_num-1, drawn from NumPy's global RNG.
shuffle_ids = np.random.permutation(full_tr_num)

In [44]:
# The first val_num shuffled positions form the validation set; the remainder is used for training.
val_ids, tr_ids = shuffle_ids[:val_num], shuffle_ids[val_num:]

In [46]:
# Materialise the two splits by row position; .copy() gives each its own data,
# independent of subs_csi_tr_df.
train = subs_csi_tr_df.iloc[tr_ids].copy()
valid = subs_csi_tr_df.iloc[val_ids].copy()

In [48]:
# Ground-truth CSI labels of the validation rows.
val_y = valid['CSI']

In [51]:
# Constant baseline: score every validation row with the mean CSI of the training split.
pred_y = len(valid) * [train['CSI'].mean()]

In [52]:
# Sanity check: one prediction per validation label.
len(val_y) == len(pred_y)

True

In [54]:
# ROC AUC of the constant baseline on the validation split.
metrics.roc_auc_score(y_true=val_y, y_score=pred_y)

0.5

#### Test

In [55]:
# Constant baseline for the submission: score every test row with the mean CSI of the full training table.
pred_y = len(subs_csi_te_df) * [subs_csi_tr_df['CSI'].mean()]

In [57]:
# Single-column frame holding one predicted score per test row, in test-file row order.
subm = pd.DataFrame(pred_y)

In [60]:
# Make sure the output directory exists first; to_csv does not create missing
# directories, so the write would fail on a fresh checkout.
Path('submits').mkdir(exist_ok=True)
# Write the bare scores only: no index column and no header row.
subm.to_csv('submits/tr_mean.csv', index=False, header=False)

In [61]:
# test ROC AUC = 0.5

### Extrapolation

In [62]:
# Build the adversarial-validation frame: drop the target and tag every row with its
# origin (is_test = 0 for train rows, 1 for test rows).
# drop() already returns a new frame, so adding a column here does not touch subs_csi_tr_df.
train_ext = subs_csi_tr_df.drop('CSI', axis=1)
train_ext['is_test'] = 0
# Copy before adding the flag so subs_csi_te_df itself is left unmodified.
test_ext = subs_csi_te_df.copy()
test_ext['is_test'] = 1

# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it the row
# labels 0..947 would occur twice (once from each input frame).
merged_ext = pd.concat([train_ext, test_ext], ignore_index=True)
# proc_df splits off 'is_test' as the target y; x holds the remaining feature columns.
x, y, nas = proc_df(merged_ext, 'is_test')

In [66]:
# Adversarial validation: fit a forest that tries to tell train rows from test rows.
# random_state pins the forest's bootstrap and feature sampling, so re-running this cell
# gives the same score regardless of how much of NumPy's global RNG was consumed earlier.
m = RandomForestClassifier(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True,
                           random_state=111)
m.fit(x, y);
# oob_score_ is out-of-bag accuracy. The classes are imbalanced (3754 train vs 948 test rows),
# so always answering "train" already scores 3754 / 4702 ≈ 0.80 — compare against that, not 0.5.
m.oob_score_

0.764568268821778

In [67]:
# Feature importances of the train-vs-test classifier; show at most the first ten rows.
fi = rf_feat_importance(m, x)
fi[:10]

Unnamed: 0,cols,imp
0,SK_ID,0.874838
1,CONTACT_DATE,0.125162


In [68]:
# Distribution of SK_ID in train, for comparison with the test distribution.
subs_csi_tr_df['SK_ID'].describe()

count    3754.000000
mean     2354.045818
std      1363.579050
min         2.000000
25%      1174.250000
50%      2360.500000
75%      3549.500000
max      4701.000000
Name: SK_ID, dtype: float64

In [69]:
# Distribution of SK_ID in test, for comparison with the train distribution.
subs_csi_te_df['SK_ID'].describe()

count     948.000000
mean     2341.418776
std      1333.791553
min         1.000000
25%      1182.750000
50%      2329.000000
75%      3462.250000
max      4702.000000
Name: SK_ID, dtype: float64

In [75]:
# First ten SK_ID values in train, in file order.
subs_csi_tr_df.SK_ID.iloc[:10]

0    1973
1     987
2     351
3      81
4    4427
5    1314
6    1589
7    1750
8    4238
9    2507
Name: SK_ID, dtype: int64

In [74]:
# First ten SK_ID values in test, in file order.
subs_csi_te_df.SK_ID.iloc[:10]

0     308
1    1789
2    3386
3    2429
4     693
5    2003
6     830
7    1003
8    2940
9    2352
Name: SK_ID, dtype: int64