In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Location of the BDCI2018 CSV files. The directory can be overridden with the
# BDCI2018_DATA_DIR environment variable so the notebook is not tied to one
# machine; when the variable is unset the default path below is used.
import os

data_dir = os.environ.get('BDCI2018_DATA_DIR', '/Users/Bin/Downloads/datasets/BDCI2018')
train_path = os.path.join(data_dir, 'train.csv')
test_path = os.path.join(data_dir, 'test.csv')

# Read both files as comma-separated tables.
train_raw = pd.read_csv(train_path, sep=',')
test_raw = pd.read_csv(test_path, sep=',')

In [15]:
# construct the training data and testing data
# The raw frames are only read here, never modified, so this cell can be
# re-run safely (removing columns from train_raw / test_raw in place would
# make a second run fail with KeyError).
y_train = train_raw['current_service']
train_id = train_raw['user_id']
X_train = train_raw.drop(['current_service', 'user_id'], axis=1)
train_cols = X_train.columns

# Select the same feature columns, in the same order, for the test set.
# .copy() gives X_test its own data so later column assignments do not
# operate on a slice of test_raw.
test_id = test_raw['user_id']
X_test = test_raw[train_cols].copy()

In [16]:
# clean the raw data
# Every feature column of both frames gets the literal string '\N'
# (presumably the missing-value marker of the CSV export -- TODO confirm)
# replaced by the sentinel value -1.
# TODO
for frame in (X_train, X_test):
    for col in train_cols:
        frame[col] = frame[col].replace('\\N', -1)

In [17]:
# standard-scale the continuous features
from sklearn.preprocessing import StandardScaler

# Columns that are one-hot encoded in a later cell; they are excluded from
# scaling.
feat_one_hot = ['service_type', 'is_mix_service', 'many_over_bill', 'contract_type', 
                'is_promise_low_consume', 'net_service', 'gender']

std_encoder = StandardScaler()
feat_std_enc = [item for item in train_cols if item not in feat_one_hot]

# `train` / `test` are plain aliases of X_train / X_test (no copy is made),
# so the scaled values are written into those frames as well.
train = X_train
# Estimate mean and standard deviation on the training features only.
train[feat_std_enc] = std_encoder.fit_transform(X_train[feat_std_enc])

test = X_test
# Re-use the statistics learned on the training set. Fitting the scaler again
# on the test set would put the two sets on different scales.
test[feat_std_enc] = std_encoder.transform(X_test[feat_std_enc])

In [18]:
# one-hot encoding
# Each categorical column listed in feat_one_hot is cast to int, expanded into
# indicator columns named `<feat>_<value>`, and the original column is dropped.
# The indicator columns are appended at the end of the frame.
for feat in feat_one_hot:
    # Type cast
    X_train[feat] = X_train[feat].map(int)
    train_feat = pd.get_dummies(X_train[feat], prefix=feat)
    X_train = pd.concat([X_train.drop([feat], axis=1), train_feat], axis=1)

    X_test[feat] = X_test[feat].map(int)
    test_feat = pd.get_dummies(X_test[feat], prefix=feat)
    X_test = pd.concat([X_test.drop([feat], axis=1), test_feat], axis=1)

# get_dummies only creates columns for the category values present in the
# frame it is given, so the two frames can end up with different columns.
# Align the test frame to the training layout: indicator columns missing from
# the test set are added as all-zero, test-only indicators are dropped, and
# the column order matches X_train. Without this the positional feature
# matrices taken later via `.values` could be misaligned.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [19]:
# Two lookup tables between contiguous class labels (0 .. n_classes-1) and the
# raw `current_service` codes. Labels are assigned in ascending code order.
service_codes = sorted(set(y_train))
label2current_service = dict(enumerate(service_codes))
current_service2label = dict((code, label) for label, code in enumerate(service_codes))
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(label2current_service)
print(current_service2label)

{0: 89950166, 1: 89950167, 2: 89950168, 3: 90063345, 4: 90109916, 5: 90155946, 6: 99999825, 7: 99999826, 8: 99999827, 9: 99999828, 10: 99999830}
{99999830: 10, 99999825: 6, 90155946: 5, 90063345: 3, 99999826: 7, 99999827: 8, 99999828: 9, 89950166: 0, 89950167: 1, 89950168: 2, 90109916: 4}


In [20]:
# map the raw service codes of the training labels to class labels
y_mapped = y_train.map(current_service2label)
# Series.map yields NaN for values that are not keys of the dict. That happens
# for every row if this cell is run a second time (the labels are already
# mapped), so fail loudly and leave y_train untouched instead of silently
# replacing all labels with NaN.
if y_mapped.isnull().any():
    raise ValueError('y_train contains values that are not raw current_service '
                     'codes (was this cell already executed?)')
y_train = y_mapped

In [21]:
# use k-fold cross-validation
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import numpy as np

def f1_score_val(preds, data_val):
    """Compute the weighted F1 score of multi-class predictions.

    Parameters
    ----------
    preds : numpy.ndarray
        Flat array of per-class scores. It is reshaped to
        ``(n_classes, n_samples)``, i.e. the scores of one class for all
        samples are expected to be stored contiguously.
    data_val : object
        Validation data exposing ``get_label()``, which returns the true
        class label of every sample.

    Returns
    -------
    tuple
        ``('f1 score', score, True)``.
        NOTE(review): this looks like the (name, value, is_higher_better)
        convention of a LightGBM custom metric -- confirm against the caller.
    """
    labels = data_val.get_label()
    # One column per sample; the number of rows (classes) is inferred from the
    # sizes instead of being fixed to 15, so the function works for any class
    # count. argmax over axis 0 picks the highest-scoring class per sample.
    preds = np.argmax(preds.reshape(-1, len(labels)), axis=0)
    score_val = f1_score(y_true=labels, y_pred=preds, average='weighted')
    return 'f1 score', score_val, True

In [22]:
# reference configuration
n_splits = 5  # number of folds passed to StratifiedKFold
seed = 42     # random_state passed to StratifiedKFold
# Features and labels must describe the same number of training samples.
assert len(X_train) == len(y_train), 'X_train and y_train differ in length'
# %-formatting keeps the output "<n> <n>" identical on Python 2 and Python 3.
print('%d %d' % (len(X_train), len(y_train)))

743990 743990


In [23]:
# Convert the feature frames to plain NumPy arrays.
# np.asarray leaves an existing ndarray unchanged, so re-running this cell is
# harmless (calling `.values` on an ndarray would raise AttributeError).
X_test = np.asarray(X_test)
X_train = np.asarray(X_train)

In [27]:
from sklearn.neighbors import KNeighborsClassifier


x_score = []         # weighted F1 on the held-out part of each fold
cv_pred = []         # becomes an (n_test, n_folds_done) array of test predictions
fold_test_pred = []  # test-set predictions of each finished fold

skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)

# The split yields positional indices. Indexing a plain array with them is
# always positional, whereas indexing a pandas Series would look the values up
# by index label.
y_all = np.asarray(y_train)

for index, (train_index, test_index) in enumerate(skf.split(X_train, y_all)):
    print('---------------->', index) # 0-4

    X_tra, X_val = X_train[train_index], X_train[test_index]
    y_tra, y_val = y_all[train_index], y_all[test_index]

    clf = KNeighborsClassifier(n_neighbors=15)
    clf.fit(X_tra, y_tra)

    # predict() already returns one class label per row. Taking np.argmax of
    # each (scalar) label would turn every prediction into 0, so the labels
    # are used as they are.
    y_pred = clf.predict(X_val)
    x_score.append(f1_score(y_val, y_pred, average='weighted'))

    # for whole testing set
    y_test = clf.predict(X_test)
    fold_test_pred.append(y_test)

    # One column per finished fold; rebuilt every iteration so cv_pred stays
    # usable even if the loop is interrupted.
    cv_pred = np.column_stack(fold_test_pred)

('---------------->', 0)


KeyboardInterrupt: 

In [117]:
# vote for the results
# Each row of cv_pred holds the per-fold predictions for one test sample.
# np.bincount counts how often each (non-negative integer) label occurs and
# np.argmax picks the most frequent one; on a tie the smallest label wins.
y_result = [np.argmax(np.bincount(line)) for line in cv_pred]

# save the result
df_summit = pd.DataFrame()
# One id per test row, in test-set order, so ids and predictions stay aligned
# even if an id occurs more than once (de-duplicating the ids would change
# their number and break the row correspondence).
df_summit['user_id'] = test_id.values
df_summit['predict'] = y_result
# Translate the class labels back to the raw service codes.
df_summit['predict'] = df_summit['predict'].map(label2current_service)

df_summit.to_csv('./submission.csv', index=False)

print(x_score, np.mean(x_score))

([0.8572329014288665, 0.8574373279964715, 0.8556611411738219, 0.8560228566949167, 0.8574183506768198], 0.8567545155941794)
