In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


def log_progress(sequence, every=None, size=None):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = size / 200     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{index} / ?'.format(index=index)
                else:
                    progress.value = index
                    label.value = u'{index} / {size}'.format(
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = str(index or '?')

In [2]:
location = ''
events = pd.DataFrame.from_csv(location+"events_modifyed.csv",index_col=None)
structure = pd.DataFrame.from_csv(location+"structure.csv",index_col=None)
targets = pd.DataFrame.from_csv(location+"targets.csv",index_col=None)
events_test = pd.DataFrame.from_csv(location+"events_test_modifyed.csv",index_col=None)

In [3]:
events.sort_values(by='time', inplace=True)

In [None]:
for i, row in events_test.iterrows():
    if events_test.loc[i,'action'] == 'discovered':
        events_test.set_value(i, 'action', 1)
    if events_test.loc[i, 'action'] == 'passed':
        events_test.set_value(i, 'action', 2)
    if events_test.loc[i, 'action'] == 'viewed':
        events_test.set_value(i, 'action', 3)
    if events_test.loc[i, 'action'] == 'started_attempt':
        events_test.set_value(i, 'action', 4)
    if events_test.loc[i, 'step_type'] == 'choice':
        events_test.set_value(i, 'step_type', 1)
    if events_test.loc[i, 'step_type'] == 'code':
        events_test.set_value(i, 'step_type', 2)
    if events_test.loc[i, 'step_type'] == 'number':
        events_test.set_value(i, 'step_type', 3)
    if events_test.loc[i, 'step_type'] == 'string':
        events_test.set_value(i, 'step_type', 4)
    if events_test.loc[i, 'step_type'] == 'text':
        events_test.set_value(i, 'step_type', 5)
    if events_test.loc[i, 'step_type'] == 'video':
        events_test.set_value(i, 'step_type', 6)
        

In [4]:
events.to_csv('events_modifyed.csv')

In [4]:
structure.sort_values(['module_position','lesson_position', 'step_position'], inplace=True)

In [5]:
step_id_position = structure.step_id.values.tolist()

In [6]:
X = []
for us_id in log_progress(set(events.user_id.tolist())):
    temp_us_data = events[events.user_id == us_id]
    temp_us_to_app_vec = []
    
    temp_us_to_app_vec.append(us_id)
    
    temp_us_to_app_vec.append(temp_us_data.step_cost.sum())
    temp_us_to_app_vec.append(temp_us_data.step_cost.mean())
    temp_us_to_app_vec.extend(temp_us_data.step_cost.describe().tolist())
    
    temp_us_to_app_vec.append(temp_us_data.step_type.max())
    temp_us_to_app_vec.append(temp_us_data.step_type.min())
    temp_us_to_app_vec.extend(temp_us_data.step_type.describe().tolist())
    temp_us_to_app_vec.extend([len(temp_us_data[temp_us_data.step_type == i]) for i in range(1,7)])
    
    temp_us_to_app_vec.append(temp_us_data.time.mean())
    temp_us_to_app_vec.append(temp_us_data.time.max() - temp_us_data.time.min())
    temp_us_to_app_vec.extend(temp_us_data.time.describe().values.tolist())
    diff = []
    vec = temp_us_data.time.values.tolist()
    for i in range(len( vec) - 1):
        diff.append(vec[i + 1] - vec[i])
    if (len(diff) > 3):
        diff = pd.DataFrame(diff)
        for elem in diff.describe().values.tolist():
            temp_us_to_app_vec.append(elem[0])
        temp_us_to_app_vec.append(diff.sum())
        temp_us_to_app_vec.append(diff.max() - diff.min())
    else:
        temp_us_to_app_vec.extend([0,0,0,0,0,0,0,0,0,0])
    temp_us_to_app_vec.append(len(temp_us_data.time))

    
    temp_us_to_app_vec.extend(temp_us_data.action.describe().tolist())
    temp_us_to_app_vec.extend([len(temp_us_data[temp_us_data.action == i]) for i in range(1,5)])
    
    pos_frame = pd.DataFrame([step_id_position.index(elem) for elem in temp_us_data.step_id])
    for elem in pos_frame.describe().values.tolist():
        temp_us_to_app_vec.append(elem[0])
    
    
    X.append(temp_us_to_app_vec)
Y = []

for elem in log_progress(X):
    Y.append([targets[targets.user_id == elem[0]].passed.values[0]])

In [7]:
X_test = []
for us_id in log_progress(set(events_test.user_id.tolist())):
    temp_us_data = events_test[events_test.user_id == us_id]
    temp_us_to_app_vec = []
    
    temp_us_to_app_vec.append(us_id)
    
    temp_us_to_app_vec.append(temp_us_data.step_cost.sum())
    temp_us_to_app_vec.append(temp_us_data.step_cost.mean())
    temp_us_to_app_vec.extend(temp_us_data.step_cost.describe().tolist())
    
    temp_us_to_app_vec.append(temp_us_data.step_type.max())
    temp_us_to_app_vec.append(temp_us_data.step_type.min())
    temp_us_to_app_vec.extend(temp_us_data.step_type.describe().tolist())
    temp_us_to_app_vec.extend([len(temp_us_data[temp_us_data.step_type == i]) for i in range(1,7)])
    
    temp_us_to_app_vec.append(temp_us_data.time.mean())
    temp_us_to_app_vec.append(temp_us_data.time.max() - temp_us_data.time.min())
    temp_us_to_app_vec.extend(temp_us_data.time.describe().values.tolist())
    diff = []
    vec = temp_us_data.time.values.tolist()
    for i in range(len( vec) - 1):
        diff.append(vec[i + 1] - vec[i])
    if (len(diff) > 3):
        diff = pd.DataFrame(diff)
        for elem in diff.describe().values.tolist():
            temp_us_to_app_vec.append(elem[0])
        temp_us_to_app_vec.append(diff.sum())
        temp_us_to_app_vec.append(diff.max() - diff.min())
    else:
        temp_us_to_app_vec.extend([0,0,0,0,0,0,0,0,0,0])
    temp_us_to_app_vec.append(len(temp_us_data.time))

    
    temp_us_to_app_vec.extend(temp_us_data.action.describe().tolist())
    temp_us_to_app_vec.extend([len(temp_us_data[temp_us_data.action == i]) for i in range(1,5)])
    
    pos_frame = pd.DataFrame([step_id_position.index(elem) for elem in temp_us_data.step_id])
    for elem in pos_frame.describe().values.tolist():
        temp_us_to_app_vec.append(elem[0])
    
    
    X_test.append(temp_us_to_app_vec)

In [8]:
X_new = pd.DataFrame(X).fillna(method='bfill')
X_test = pd.DataFrame(X_test).fillna(method='bfill')

X_new = X_new.as_matrix()
X_test = X_test.as_matrix()

Y = np.ravel(Y)
Y = Y.astype(np.int32)

X_new = X_new.astype(np.float32)
X_test = X_test.astype(np.float32)

Обучение

In [57]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import f1_score

Xtr,Xval,Ytr,Yval = train_test_split(X_new,Y,test_size=0.1,random_state=128)


In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures

model = RandomForestClassifier(verbose=1, n_jobs = -1, n_estimators = 15000)

X_new3 = PolynomialFeatures(3).fit_transform(X_new)

In [64]:
#Обучим модель
model.fit(X_new3, Y)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 20.4min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 24.7min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 29.1min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 34.0min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 39.9min
[Parallel(n_jobs=-1)]: Done 11234 tasks      | elapsed: 46.4min
[Parallel(n_jobs=-1)]: Done 12784 tasks      | elapsed: 53

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=15000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=1, warm_start=False)

In [60]:
f1_score(Yval, model.predict(Xval))

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:    1.4s
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 6034 tasks      | elapsed:    2.0s
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed:    2.4s
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed:    3.3s
[Parallel(n_jobs=8)]: Done 11234 tasks      | elapsed:    3.7s
[Parallel(n_jobs=8)]: Done 12784 tasks      | elapsed:    4.3s
[Parallel(

0.60000000000000009

In [61]:
pred_prob = model.predict_proba(Xval)


f1_score_temp = []

for i in range(10000):
    ans = []
    eps = i * 0.0001
    for elem in pred_prob:
        if elem[1] > eps:
            ans.append(1)
        else:
            ans.append(0)
    f1_score_temp.append(f1_score(Yval, ans))


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 6034 tasks      | elapsed:    2.0s
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed:    2.4s
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed:    3.2s
[Parallel(n_jobs=8)]: Done 11234 tasks      | elapsed:    3.7s
[Parallel(n_jobs=8)]: Done 12784 tasks      | elapsed:    4.2s
[Parallel(

In [62]:
maxnum = 0
pos = 0
for i in range(len(f1_score_temp)):
    if maxnum < f1_score_temp[i]:
        maxnum = f1_score_temp[i]
        pos = i
print(maxnum)
print(pos)

0.694214876033
4404


In [65]:
ans = []
pred_prob = model.predict_proba(PolynomialFeatures(3).fit_transform(X_test))

for elem in pred_prob:
    if elem[1] > 0.44:
        ans.append(1)
    else:
        ans.append(0)


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:    2.0s
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:    2.6s
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:    3.2s
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:    4.1s
[Parallel(n_jobs=8)]: Done 6034 tasks      | elapsed:    5.1s
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed:    6.1s
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed:    7.2s
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed:    8.4s
[Parallel(n_jobs=8)]: Done 11234 tasks      | elapsed:    9.6s
[Parallel(n_jobs=8)]: Done 12784 tasks      | elapsed:   10.7s
[Parallel(

In [142]:
f1_score(Yval, ans)

0.60000000000000009

In [67]:
ind = []
for elem in X_test:
    ind.append(int(elem[0]))

In [69]:
def create_submission(X, name):
    np.savetxt('%s.csv'%name, X, delimiter=',', fmt="%d", header='user_id,passed', comments='')
    
ans = model.predict(PolynomialFeatures(3).fit_transform(X_test))
#ans = model.predict(X_test)

result = np.concatenate((np.asarray(ind, dtype=int).reshape(-1, 1), np.asarray(ans, dtype=int).reshape(-1, 1)), axis=1)
create_submission(result, '69full_randforest_pol_feat3')