In [10]:
# imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from datetime import datetime
from sklearn import set_config
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
set_config(display='diagram')

In [11]:
def remove_duplicates(data):
    return pd.DataFrame(data).drop_duplicates()

def age_constructor(data):
    return pd.DatetimeIndex(pd.to_datetime(data['start'], format='%Y/%m/%d')).year - pd.DatetimeIndex(pd.to_datetime(data['birth'], format='%Y')).year

def array_reshape(data):
    output = np.reshape(data, (-1, 1))
    return output

def action_time(data):
    ohe_action = OneHotEncoder(sparse = False) # Instanciate encoder
    ohe_action.fit(data[['action']]) # Fit encoder
    action_encoded = ohe_action.transform(data[['action']])
    action_X = pd.DataFrame(action_encoded)
    timediff = list(data['timediff'])
    for row in range(len(action_X.index)):
        action_X.iloc[row,action_X.columns[action_X.iloc[row,] == 1][0]] = timediff[row]
    action_X['username'] = data[['username']]
    action_X['id'] = data[['id']]
    return action_X

def combining_actions(dataframe):
    output_df = dataframe.groupby(['username', 'id']).sum()
    return output_df

def keep_unchanged(data):
    return pd.DataFrame(data[["truth", "username", "id"]])

def calc_percentage_course(df):
    df['course_start'] = pd.to_datetime(df['start'])
    df['course_start'] = df.course_start.values.astype(np.int64) // 10 ** 9
    df['course_end'] = pd.to_datetime(df['end'])
    df['course_end'] = df.course_end.values.astype(np.int64) // 10 ** 9
    #df = df.drop(['start'], axis = 1)
    #df = df.drop(['end'], axis = 1)
    df['percent_course'] = (df['timestamp'] - df['course_start'])/(df['course_end'] - df['course_start'])
    users_min_30per_comp = df[df['percent_course'] >= 0.3]['username'].unique()
    df = df[df['username'].isin(users_min_30per_comp)][df['percent_course'] < 0.3]
    return df

# Reading data

In [12]:
df = pd.read_csv('../raw_data/df_full_withtime.csv') #, nrows=1000
df.head(2)

Unnamed: 0.1,Unnamed: 0,username,session_id,action,truth,id,start,end,category,gender,education,birth,timestamp,timediff
0,7084339,24057,df8ec8baa5af2f61e5426307be9a5e75,load_video,1,809,2015-05-11 20:00:00,2015-07-17 00:00:00,economics,male,Master's,1987.0,1433436084,0.0
1,7084345,24057,df8ec8baa5af2f61e5426307be9a5e75,play_video,1,809,2015-05-11 20:00:00,2015-07-17 00:00:00,economics,male,Master's,1987.0,1433436087,3.0


# Preprocessing pipeline for train data
Keeping target column, for removing duplicates together

In [2]:
# from ChangeDEEPly.feature_encoding import remove_duplicates, age_constructor, array_reshape, time_delta_action

ImportError: cannot import name 'time_delta_action' from 'ChangeDEEPly.feature_encoding' (/Users/zuzanna/code/Zuza-b/ChangeDEEPly/ChangeDEEPly/feature_encoding.py)

In [19]:
filtering_pipe = Pipeline([
    ('first_part_of_the_course', FunctionTransformer(calc_percentage_course))
])

gender_pipe = Pipeline([
    ('ohe_gender', OneHotEncoder(drop='if_binary', sparse = False, handle_unknown='ignore'))
])

category_edu_pipe = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse = False))
])

keep_unchanged_pipe = Pipeline([
    ('keep_unchanged', FunctionTransformer(keep_unchanged))#,
    #('reshape', FunctionTransformer(array_reshape))
])

age_pipe = Pipeline([
    ('age_calc', FunctionTransformer(age_constructor)),
    ('reshape', FunctionTransformer(array_reshape)),
    ('stdscaler', StandardScaler())
])

actions_pipe = Pipeline([
    ('ohe_action', FunctionTransformer(action_time)),
    ('combining_actions', FunctionTransformer(combining_actions)),
    ('stdscaler', StandardScaler())
])

basic_encoding_pipe = ColumnTransformer([
    ('keep_unchanged', keep_unchanged_pipe, ["truth","username", "id"]),
    ('category_edu_pipe', category_edu_pipe, ["category", "education"]),
    ('gender_pipe', gender_pipe, ["gender"])
], remainder="drop")

preprocessing_pipe = FeatureUnion([
    ('basic_encoding_pipe', basic_encoding_pipe),
    ('age', age_pipe)
])

removing_duplicates_pipe = Pipeline([
    ('preprocessing_pipe', preprocessing_pipe),
    ('remove_duplicates', FunctionTransformer(remove_duplicates))
])

merge_pipe = FeatureUnion([
    ('all_without_duplicates', removing_duplicates_pipe),
    ('actions', actions_pipe)
])

# final_pipe = Pipeline([
#     ('merge_pipe', merge_pipe),
#     ('filtering_pipe', filtering_pipe)
# ])
# final_pipe

In [20]:
merge_pipe

In [15]:
df_test = df.iloc[:10000,]

In [22]:
df_trans = pd.DataFrame(merge_pipe.fit_transform(df_test))
df_trans.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,1.0,24057.0,809.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.237806,-0.323738,-0.141407,0.609644,-0.146901,-0.014031,-0.210586,-0.274926,-0.369773,-0.159049
1,1.0,670347.0,34.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,-0.229122,-0.616006,-0.273707,-0.145414,-0.182118,-0.089887,-0.211422,-0.274926,-0.369773,-0.517379


In [23]:
y = df_trans[0]
y

0      1.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
208    0.0
209    0.0
210    0.0
211    1.0
212    1.0
Name: 0, Length: 213, dtype: float64

In [35]:
X = df_trans.drop(columns=[0])
X = X.set_index([1,2])
X.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,3,4,5,6,7,8,9,10,11,12,...,18,19,20,21,22,23,24,25,26,27
1,2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
24057.0,809.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-0.237806,-0.323738,-0.141407,0.609644,-0.146901,-0.014031,-0.210586,-0.274926,-0.369773,-0.159049
670347.0,34.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.229122,-0.616006,-0.273707,-0.145414,-0.182118,-0.089887,-0.211422,-0.274926,-0.369773,-0.517379
628087.0,34.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.232416,-0.078342,-0.245756,0.005932,-0.182118,-0.089527,-0.210586,-0.274926,-0.288739,2.431221
201332.0,34.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.238305,-0.540182,-0.255073,-0.145414,-0.182118,-0.089887,-0.211422,-0.274926,-0.369773,-0.517379
561486.0,34.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,3.011066,-0.429892,-0.187653,0.05861,0.117224,-0.058969,-0.209749,-0.274926,-0.369773,0.488995


# Final pipeline for preprocessing test or any input data
Without processing y,target column, as it will be predicted

In [None]:
filtering_pipe = Pipeline([
    ('first_part_of_the_course', FunctionTransformer(calc_percentage_course))
])

gender_pipe = Pipeline([
    ('ohe_gender', OneHotEncoder(drop='if_binary', sparse = False, handle_unknown='ignore'))
])

category_edu_pipe = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse = False))
])

keep_unchanged_pipe = Pipeline([
    ('keep_unchanged', FunctionTransformer(keep_unchanged))#,
    #('reshape', FunctionTransformer(array_reshape))
])

age_pipe = Pipeline([
    ('age_calc', FunctionTransformer(age_constructor)),
    ('reshape', FunctionTransformer(array_reshape)),
    ('stdscaler', StandardScaler())
])

actions_pipe = Pipeline([
    ('ohe_action', FunctionTransformer(action_time)),
    ('combining_actions', FunctionTransformer(combining_actions)),
    ('stdscaler', StandardScaler())
])

basic_encoding_pipe = ColumnTransformer([
    ('gender_pipe', gender_pipe, ["gender"]),
    ('category_edu_pipe', category_edu_pipe, ["category", "education"]),
    ('keep_unchanged', keep_unchanged_pipe, ["username", "id"])
], remainder="drop")

preprocessing_pipe = FeatureUnion([
    ('basic_encoding_pipe', basic_encoding_pipe),
    ('age', age_pipe)
])

removing_duplicates_pipe = Pipeline([
    ('preprocessing_pipe', preprocessing_pipe),
    ('remove_duplicates', FunctionTransformer(remove_duplicates))
])

merge_pipe = FeatureUnion([
    ('all_without_duplicates', removing_duplicates_pipe),
    ('actions', actions_pipe)
])

final_pipe = Pipeline([
    ('filtering_pipe', filtering_pipe),
    ('merge_pipe', merge_pipe)
])
final_pipe

In [None]:
# function based on Azin's code - not used at the end
# def time_delta_action(data):
#     cur_data = data[['username', 'session_id','time']]
#     as_date_time = pd.to_datetime(cur_data['time'])
#     cur_data['timestamp'] = as_date_time.values.astype(np.int64) // 10 ** 9
#     #cur_data = cur_data.drop(['time'], axis = 1)
#     cur_data_sorted = cur_data.sort_values(['username', 'session_id','timestamp'])
#     timediff = pd.DataFrame(cur_data_sorted.groupby('username').timestamp.diff().fillna(0))
#     timediff = timediff.reindex(cur_data.index)
#     return timediff