In [1]:
!pip install --upgrade scikit-learn



In [2]:
# imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from datetime import datetime
from sklearn import set_config
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
set_config(display='diagram')

# Reading data

In [3]:
df = pd.read_csv('../raw_data/df_full_withtime.csv') #, nrows=1000
df.head(2)

Unnamed: 0.1,Unnamed: 0,username,session_id,action,truth,id,start,end,category,gender,education,birth,timestamp,timediff
0,7084339,24057,df8ec8baa5af2f61e5426307be9a5e75,load_video,1,809,2015-05-11 20:00:00,2015-07-17 00:00:00,economics,male,Master's,1987.0,1433436084,0.0
1,7084345,24057,df8ec8baa5af2f61e5426307be9a5e75,play_video,1,809,2015-05-11 20:00:00,2015-07-17 00:00:00,economics,male,Master's,1987.0,1433436087,3.0


In [4]:
df = df.drop(columns=['Unnamed: 0'])

## Combing user column with session_id

In [5]:
user_course_id = []
for i in range(len(df.iloc[:,0])):
    to_merge = [str(df.iloc[i,0]),str(df.iloc[i,4])]
    user_course_id.append('_'.join(to_merge))
df['user_course_id'] = user_course_id
df.head(2)

Unnamed: 0,username,session_id,action,truth,id,start,end,category,gender,education,birth,timestamp,timediff,user_course_id
0,24057,df8ec8baa5af2f61e5426307be9a5e75,load_video,1,809,2015-05-11 20:00:00,2015-07-17 00:00:00,economics,male,Master's,1987.0,1433436084,0.0,24057_809
1,24057,df8ec8baa5af2f61e5426307be9a5e75,play_video,1,809,2015-05-11 20:00:00,2015-07-17 00:00:00,economics,male,Master's,1987.0,1433436087,3.0,24057_809


In [107]:
X = pd.DataFrame(df[['user_course_id','truth']])
X.head()

Unnamed: 0,user_course_id,truth
0,24057_809,1
1,24057_809,1
2,24057_809,1
3,670347_34,1
4,670347_34,1


# Encoding features

## Encoding gender

In [108]:
ohe_gender = OneHotEncoder(drop='if_binary', sparse = False) # Instanciate encoder
ohe_gender.fit(df[['gender']]) # Fit encoder
X['gender'] = ohe_gender.transform(df[['gender']]) #, X['male']
X.head(3)

Unnamed: 0,user_course_id,truth,gender
0,24057_809,1,1.0
1,24057_809,1,1.0
2,24057_809,1,1.0


## Encoding course category

In [25]:
course_names = list(df['category'].unique())
course_names = sorted(course_names)

In [26]:
ohe_course = OneHotEncoder(sparse = False) # Instanciate encoder
ohe_course.fit(df[['category']]) # Fit encoder
course_encoded = ohe_course.transform(df[['category']]) 
course_X = pd.DataFrame(course_encoded)
course_X.columns = course_names
course_X['user_course_id'] = df[['user_course_id']]
course_X.head()

Unnamed: 0,art,biology,business,chemistry,computer,economics,education,electrical,engineering,foreign language,history,literature,math,medicine,philosophy,physics,social science,user_course_id
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24057_809
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24057_809
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24057_809
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,670347_34
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,670347_34


## Encoding education

In [27]:
edu_names = list(df['education'].unique())
edu_names = sorted(edu_names)

In [28]:
ohe_edu = OneHotEncoder(sparse = False) # Instanciate encoder
ohe_edu.fit(df[['education']]) # Fit encoder
edu_encoded = ohe_edu.transform(df[['education']]) 
edu_X = pd.DataFrame(edu_encoded)
edu_X.columns = edu_names
edu_X['user_course_id'] = df[['user_course_id']]
edu_X.head()

Unnamed: 0,Associate,Bachelor's,Doctorate,High,Master's,Middle,Primary,user_course_id
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,24057_809
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,24057_809
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,24057_809
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,670347_34
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,670347_34


## Encoding actions

In [109]:
action_names = list(df['action'].unique())
action_names = sorted(action_names)

In [110]:
ohe_action = OneHotEncoder(sparse = False) # Instanciate encoder
ohe_action.fit(df[['action']]) # Fit encoder
action_encoded = ohe_action.transform(df[['action']]) 
action_X = pd.DataFrame(action_encoded)
action_X.columns = action_names
action_X['user_course_id'] = df[['user_course_id']]
action_X.head()

Unnamed: 0,click_about,click_courseware,click_forum,click_info,click_progress,close_courseware,close_forum,create_comment,create_thread,delete_comment,...,play_video,problem_check,problem_check_correct,problem_check_incorrect,problem_get,problem_save,reset_problem,seek_video,stop_video,user_course_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24057_809
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24057_809
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24057_809
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,670347_34
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,670347_34


In [111]:
def combining_actions(dataframe, col_name):
    output_df = dataframe.groupby(col_name).sum()
    return output_df

In [112]:
action_X = combining_actions(action_X, 'user_course_id')
action_X.head(3)

Unnamed: 0_level_0,click_about,click_courseware,click_forum,click_info,click_progress,close_courseware,close_forum,create_comment,create_thread,delete_comment,...,pause_video,play_video,problem_check,problem_check_correct,problem_check_incorrect,problem_get,problem_save,reset_problem,seek_video,stop_video
user_course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000607_1926,6.0,40.0,0.0,10.0,0.0,20.0,0.0,0.0,0.0,0.0,...,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000607_2070,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000607_2909,6.0,54.0,0.0,2.0,0.0,27.0,0.0,0.0,0.0,0.0,...,37.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0


In [113]:
list_indexes = list(action_X.index)
action_X['user_course_id'] = list_indexes
action_X.head()

Unnamed: 0_level_0,click_about,click_courseware,click_forum,click_info,click_progress,close_courseware,close_forum,create_comment,create_thread,delete_comment,...,play_video,problem_check,problem_check_correct,problem_check_incorrect,problem_get,problem_save,reset_problem,seek_video,stop_video,user_course_id
user_course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000607_1926,6.0,40.0,0.0,10.0,0.0,20.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000607_1926
1000607_2070,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000607_2070
1000607_2909,6.0,54.0,0.0,2.0,0.0,27.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,1000607_2909
1000607_536,0.0,0.0,0.0,0.0,0.0,63.0,0.0,0.0,0.0,0.0,...,51.0,3.0,3.0,0.0,22.0,4.0,0.0,129.0,5.0,1000607_536
1000607_747,0.0,0.0,0.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,...,49.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,18.0,1000607_747


In [17]:
action_X.shape

(42631, 23)

In [114]:
std_scaler = StandardScaler()
action_X_scaled = pd.DataFrame(std_scaler.fit_transform(action_X), columns=action_X.columns)
action_X_scaled['user_course_id'] = list_indexes
action_X_scaled.head(3)

Unnamed: 0,click_about,click_courseware,click_forum,click_info,click_progress,close_courseware,close_forum,create_comment,create_thread,delete_comment,...,play_video,problem_check,problem_check_correct,problem_check_incorrect,problem_get,problem_save,reset_problem,seek_video,stop_video,user_course_id
0,0.545988,0.277166,-0.182451,0.478221,-0.296769,0.235181,-0.004843,-0.109157,-0.136354,-0.046093,...,-0.306546,-0.307915,-0.339353,-0.238328,-0.056653,-0.188241,-0.114953,-0.243711,-0.050669,1000607_1926
1,0.236112,-0.41969,-0.182451,-0.459058,-0.296769,-0.497146,-0.004843,-0.109157,-0.136354,-0.046093,...,-0.306546,-0.307915,-0.339353,-0.238328,-0.056653,-0.188241,-0.114953,-0.243711,-0.050669,1000607_2070
2,0.545988,0.521065,-0.182451,-0.271602,-0.296769,0.491495,-0.004843,-0.109157,-0.136354,-0.046093,...,-0.306546,-0.307915,-0.339353,-0.238328,-0.02771,-0.188241,-0.114953,-0.243711,-0.050669,1000607_2909


## Encoding age

In [116]:
df.head(2)

Unnamed: 0,username,session_id,action,truth,id,start,end,category,gender,education,birth,timestamp,timediff,user_course_id
0,24057,df8ec8baa5af2f61e5426307be9a5e75,load_video,1,809,2015-05-11 20:00:00,2015-07-17 00:00:00,economics,male,Master's,1987.0,1433436084,0.0,24057_809
1,24057,df8ec8baa5af2f61e5426307be9a5e75,play_video,1,809,2015-05-11 20:00:00,2015-07-17 00:00:00,economics,male,Master's,1987.0,1433436087,3.0,24057_809


In [137]:
test = pd.DatetimeIndex(pd.to_datetime(df['start'], format='%Y-%m-%d')).year
test

Int64Index([2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,
            ...
            2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017],
           dtype='int64', name='start', length=8322174)

In [138]:
def age_calc(birth_column, start_column):
    course_start_year = pd.DatetimeIndex(pd.to_datetime(start_column, format='%Y-%m-%d')).year
    age = course_start_year - birth_column
    return age

In [142]:
#age_constructor = FunctionTransformer(lambda data: pd.DataFrame(data["product_length_cm"] * data["product_height_cm"] * data["product_width_cm"]))
age_constructor = FunctionTransformer(lambda data: pd.DatetimeIndex(pd.to_datetime(df['start'], format='%Y-%m-%d')).year - df['birth'])




In [140]:
X['age'] = age_calc(df['birth'],df['start'])
X.head(3)

Unnamed: 0,user_course_id,truth,gender,age
0,24057_809,1,1.0,28.0
1,24057_809,1,1.0,28.0
2,24057_809,1,1.0,28.0


In [141]:
std_scaler = StandardScaler()
X['age'] = std_scaler.fit_transform(X[['age']])
X.head(3)

Unnamed: 0,user_course_id,truth,gender,age
0,24057_809,1,1.0,0.162372
1,24057_809,1,1.0,0.162372
2,24057_809,1,1.0,0.162372


## merge into one data frame

In [None]:
def remove_duplicates(dataframe):
    return dataframe.drop_duplicates()

In [32]:
display(X.shape)
X = X.drop_duplicates()
edu_X = edu_X.drop_duplicates()
course_X = course_X.drop_duplicates()
display(X.shape)
display(edu_X.shape)
display(course_X.shape)

(8322174, 4)

(42631, 4)

(42631, 8)

(42631, 18)

## making sure data frames are in the same order

In [67]:
X['new_order'] = pd.Categorical(
    X['user_course_id'], 
    categories=list_indexes, 
    ordered=True
)
X = X.sort_values('new_order')
X = X.drop(columns=['new_order'])
X.head()

Unnamed: 0,user_course_id,truth,female,male,age
3559549,1000607_1926,0,0.0,1.0,-0.08537
2978561,1000607_2070,0,0.0,1.0,-0.08537
3577486,1000607_2909,0,0.0,1.0,-0.08537
312370,1000607_536,0,0.0,1.0,-0.08537
2758502,1000607_747,0,0.0,1.0,-0.08537


In [42]:
edu_X['new_order'] = pd.Categorical(
    edu_X['user_course_id'], 
    categories=list_indexes, 
    ordered=True
)
edu_X = edu_X.sort_values('new_order')
edu_X = edu_X.drop(columns=['new_order'])
edu_X.head()

Unnamed: 0,Associate,Bachelor's,Doctorate,High,Master's,Middle,Primary,user_course_id
3559549,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1000607_1926
2978561,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1000607_2070
3577486,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1000607_2909
312370,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1000607_536
2758502,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1000607_747


In [43]:
course_X['new_order'] = pd.Categorical(
    course_X['user_course_id'], 
    categories=list_indexes, 
    ordered=True
)
course_X = course_X.sort_values('new_order')
course_X = course_X.drop(columns=['new_order'])
course_X.head()

Unnamed: 0,art,biology,business,chemistry,computer,economics,education,electrical,engineering,foreign language,history,literature,math,medicine,philosophy,physics,social science,user_course_id
3559549,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000607_1926
2978561,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000607_2070
3577486,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000607_2909
312370,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000607_536
2758502,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000607_747


## merging

In [68]:
list_of_df = [edu_X,course_X,action_X_scaled]

In [69]:
def merging_dfs(core_df, df_list):
    
    core_df = pd.DataFrame(core_df)
    core_df = core_df.reset_index()
    for i in range(len(df_list)):
        cur_df = pd.DataFrame(df_list[i])
        cur_df = cur_df.reset_index()
        column_list = cur_df.columns
        column_list = column_list.drop("user_course_id")
    
        for column in column_list:
            core_df[column] = cur_df[column]
    
    return core_df

In [71]:
df_featured = merging_dfs(X,list_of_df)
df_featured.head()

Unnamed: 0,index,user_course_id,truth,female,male,age,Associate,Bachelor's,Doctorate,High,...,pause_video,play_video,problem_check,problem_check_correct,problem_check_incorrect,problem_get,problem_save,reset_problem,seek_video,stop_video
0,0,1000607_1926,0,0.0,1.0,-0.08537,0.0,0.0,0.0,0.0,...,0.02598,-0.306546,-0.307915,-0.339353,-0.238328,-0.056653,-0.188241,-0.114953,-0.243711,-0.050669
1,1,1000607_2070,0,0.0,1.0,-0.08537,0.0,0.0,0.0,0.0,...,-0.327739,-0.306546,-0.307915,-0.339353,-0.238328,-0.056653,-0.188241,-0.114953,-0.243711,-0.050669
2,2,1000607_2909,0,0.0,1.0,-0.08537,0.0,0.0,0.0,0.0,...,0.094442,-0.306546,-0.307915,-0.339353,-0.238328,-0.02771,-0.188241,-0.114953,-0.243711,-0.050669
3,3,1000607_536,0,0.0,1.0,-0.08537,0.0,0.0,0.0,0.0,...,-0.008251,0.312685,-0.176667,-0.130004,-0.238328,0.007021,0.541546,-0.114953,2.138872,-0.042257
4,4,1000607_747,0,0.0,1.0,-0.08537,0.0,0.0,0.0,0.0,...,-0.031071,0.288401,-0.307915,-0.339353,-0.238328,-0.056653,-0.188241,-0.114953,-0.022075,-0.020386


In [72]:
df_featured = df_featured.drop(columns = ['index'])
df_featured.head(2)

Unnamed: 0,user_course_id,truth,female,male,age,Associate,Bachelor's,Doctorate,High,Master's,...,pause_video,play_video,problem_check,problem_check_correct,problem_check_incorrect,problem_get,problem_save,reset_problem,seek_video,stop_video
0,1000607_1926,0,0.0,1.0,-0.08537,0.0,0.0,0.0,0.0,1.0,...,0.02598,-0.306546,-0.307915,-0.339353,-0.238328,-0.056653,-0.188241,-0.114953,-0.243711,-0.050669
1,1000607_2070,0,0.0,1.0,-0.08537,0.0,0.0,0.0,0.0,1.0,...,-0.327739,-0.306546,-0.307915,-0.339353,-0.238328,-0.056653,-0.188241,-0.114953,-0.243711,-0.050669


In [73]:
df_featured.shape

(42631, 51)

# Encoding in a pipeline

## Let's run it in a pipeline

In [46]:
def remove_duplicates(dataframe):
    return pd.DataFrame(dataframe).drop_duplicates()

age_constructor = FunctionTransformer(lambda data: pd.DatetimeIndex(pd.to_datetime(data['start'], format='%Y-%m-%d')).year - pd.DatetimeIndex(pd.to_datetime(data['birth'], format='%Y')).year)
array_reshape = FunctionTransformer(lambda data: np.reshape(data, (-1, 1)))
rp = FunctionTransformer(lambda data: remove_duplicates(data))

gender_pipe = Pipeline([
    ('ohe', OneHotEncoder(drop='if_binary', sparse = False, handle_unknown='ignore'))
])

category_edu_pipe = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse = False))
])

age_pipe = Pipeline([
    ('age_calc', age_constructor),
    ('reshape', array_reshape),
    ('stdscaler', StandardScaler())
])

basic_encoding_pipe = ColumnTransformer([
    ('gender_pipe', gender_pipe, ["gender"]),
    ('category_edu_pipe', category_edu_pipe, ["category", "education"])
], remainder="drop")

preprocessing_pipe = FeatureUnion([
    ('basic_encoding_pipe', basic_encoding_pipe),
    ('age', age_pipe)
])

final_pipe = Pipeline([
    ('preprocessing_pipe', preprocessing_pipe),
    ('remove_duplicates', rp)
])

final_pipe

In [47]:
X_trans = final_pipe.fit_transform(df)
X_trans

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.162372
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.462038
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.357970
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.474577
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.253902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8304657,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.845452
8306233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.994920
8306751,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-2.335270
8309235,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.411193


# Exporting pipeline as pickle file

In [50]:
import pickle

# Export pipeline as pickle file
with open("../pipeline_features.pkl", "wb") as file:
    pickle.dump(final_pipe, file )

PicklingError: Can't pickle <function <lambda> at 0x1669dc550>: attribute lookup <lambda> on __main__ failed

In [None]:
# def set_pipeline():
#     '''returns a pipelined model'''
#     dist_pipe = Pipeline([
#         ('dist_trans', DistanceTransformer()),
#         ('stdscaler', StandardScaler())
#     ])
#     time_pipe = Pipeline([
#         ('time_enc', TimeFeaturesEncoder('pickup_datetime')),
#         ('ohe', OneHotEncoder(handle_unknown='ignore'))
#     ])
#     preproc_pipe = ColumnTransformer([
#         ('distance', dist_pipe, ["pickup_latitude", "pickup_longitude", 'dropoff_latitude', 'dropoff_longitude']),
#         ('time', time_pipe, ['pickup_datetime'])
#     ], remainder="drop")
#     pipe = Pipeline([
#         ('preproc', preproc_pipe),
#         ('linear_model', LinearRegression())
#     ])
#     return pipe