In [1]:
!pip install --upgrade scikit-learn



In [2]:
# imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from datetime import datetime
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
set_config(display='diagram')

# Reading data

In [3]:
df = pd.read_csv('../raw_data/df_full_withtime.csv') #, nrows=1000
df.head(2)

Unnamed: 0.1,Unnamed: 0,username,session_id,action,truth,id,start,end,category,gender,education,birth,timestamp,timediff
0,7084339,24057,df8ec8baa5af2f61e5426307be9a5e75,load_video,1,809,2015-05-11 20:00:00,2015-07-17 00:00:00,economics,male,Master's,1987.0,1433436084,0.0
1,7084345,24057,df8ec8baa5af2f61e5426307be9a5e75,play_video,1,809,2015-05-11 20:00:00,2015-07-17 00:00:00,economics,male,Master's,1987.0,1433436087,3.0


In [4]:
df = df.drop(columns=['Unnamed: 0'])

## Combing user column with session_id

In [5]:
user_course_id = []
for i in range(len(df.iloc[:,0])):
    to_merge = [str(df.iloc[i,0]),str(df.iloc[i,4])]
    user_course_id.append('_'.join(to_merge))
df['user_course_id'] = user_course_id
df.head(2)

Unnamed: 0,username,session_id,action,truth,id,start,end,category,gender,education,birth,timestamp,timediff,user_course_id
0,24057,df8ec8baa5af2f61e5426307be9a5e75,load_video,1,809,2015-05-11 20:00:00,2015-07-17 00:00:00,economics,male,Master's,1987.0,1433436084,0.0,24057_809
1,24057,df8ec8baa5af2f61e5426307be9a5e75,play_video,1,809,2015-05-11 20:00:00,2015-07-17 00:00:00,economics,male,Master's,1987.0,1433436087,3.0,24057_809


In [6]:
X = pd.DataFrame(df['user_course_id'])
X.head()

Unnamed: 0,user_course_id
0,24057_809
1,24057_809
2,24057_809
3,670347_34
4,670347_34


# Encoding features

## Encoding gender

In [7]:
ohe_gender = OneHotEncoder(sparse = False) # Instanciate encoder
ohe_gender.fit(df[['gender']]) # Fit encoder
gender_encoded = ohe_gender.transform(df[['gender']]) 
X['female'], X['male'] = gender_encoded.T
X.head(3)

Unnamed: 0,user_course_id,female,male
0,24057_809,0.0,1.0
1,24057_809,0.0,1.0
2,24057_809,0.0,1.0


## Encoding course category

In [8]:
course_names = list(df['category'].unique())
course_names = sorted(course_names)

In [9]:
ohe_course = OneHotEncoder(sparse = False) # Instanciate encoder
ohe_course.fit(df[['category']]) # Fit encoder
course_encoded = ohe_course.transform(df[['category']]) 
course_X = pd.DataFrame(course_encoded)
course_X.columns = course_names
course_X['user_course_id'] = df[['user_course_id']]
course_X.head()

Unnamed: 0,art,biology,business,chemistry,computer,economics,education,electrical,engineering,foreign language,history,literature,math,medicine,philosophy,physics,social science,user_course_id
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24057_809
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24057_809
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24057_809
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,670347_34
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,670347_34


## Encoding education

In [10]:
edu_names = list(df['education'].unique())
edu_names = sorted(edu_names)

In [11]:
ohe_edu = OneHotEncoder(sparse = False) # Instanciate encoder
ohe_edu.fit(df[['education']]) # Fit encoder
edu_encoded = ohe_edu.transform(df[['education']]) 
edu_X = pd.DataFrame(edu_encoded)
edu_X.columns = edu_names
edu_X['user_course_id'] = df[['user_course_id']]
edu_X.head()

Unnamed: 0,Associate,Bachelor's,Doctorate,High,Master's,Middle,Primary,user_course_id
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,24057_809
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,24057_809
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,24057_809
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,670347_34
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,670347_34


## Encoding actions

In [12]:
action_names = list(df['action'].unique())
action_names = sorted(action_names)

In [13]:
ohe_action = OneHotEncoder(sparse = False) # Instanciate encoder
ohe_action.fit(df[['action']]) # Fit encoder
action_encoded = ohe_action.transform(df[['action']]) 
action_X = pd.DataFrame(action_encoded)
action_X.columns = action_names
action_X['user_course_id'] = df[['user_course_id']]
action_X.head()

Unnamed: 0,click_about,click_courseware,click_forum,click_info,click_progress,close_courseware,close_forum,create_comment,create_thread,delete_comment,...,play_video,problem_check,problem_check_correct,problem_check_incorrect,problem_get,problem_save,reset_problem,seek_video,stop_video,user_course_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24057_809
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24057_809
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24057_809
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,670347_34
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,670347_34


In [14]:
def combining_actions(dataframe, col_name):
    col_to_pick = col_name
    unique_user_course = dataframe[col_to_pick].unique()
    output = []
    
    for user_course in unique_user_course:
        cur_data = dataframe[dataframe[col_to_pick] == user_course].sum()
        cur_data[col_to_pick] = user_course
        output.append(cur_data)
    output_df = pd.DataFrame(output)
    
    return output_df

In [15]:
action_X = combining_actions(action_X, 'user_course_id')
action_X.head(3)

KeyboardInterrupt: 

In [25]:
action_X.shape

(8322174, 23)

In [None]:
std_scaler = StandardScaler()
action_X_scaled = pd.DataFrame(std_scaler.fit_transform(action_X), columns=action_X.columns)
action_X_scaled['user_course_id'] = action_X['user_course_id']
action_X_scaled.head(3)

## Encoding age

In [16]:
def age_calc(birth_column):
    cur_year = datetime.today().year
    age = cur_year - birth_column
    return age

In [17]:
X['age'] = age_calc(df['birth'])
X.head(3)

Unnamed: 0,user_course_id,female,male,age
0,24057_809,0.0,1.0,35.0
1,24057_809,0.0,1.0,35.0
2,24057_809,0.0,1.0,35.0


In [18]:
std_scaler = StandardScaler()
X['age'] = std_scaler.fit_transform(X[['age']])
X.head(3)

Unnamed: 0,user_course_id,female,male,age
0,24057_809,0.0,1.0,0.225607
1,24057_809,0.0,1.0,0.225607
2,24057_809,0.0,1.0,0.225607


## merge into one data frame

In [None]:
def remove_duplicates(dataframe):
    return dataframe.drop_duplicates()

In [19]:
display(X.shape)
X = X.drop_duplicates()
edu_X = edu_X.drop_duplicates()
course_X = course_X.drop_duplicates()
display(X.shape)
display(edu_X.shape)
display(course_X.shape)

(8322174, 4)

(42631, 4)

(42631, 8)

(42631, 18)

In [20]:
list_of_df = [edu_X,course_X] #,action_X_scaled

In [21]:
def merging_dfs(core_df, df_list):
    
    core_df = pd.DataFrame(core_df)
    for i in range(len(df_list)):
        cur_df = pd.DataFrame(df_list[i])
        column_list = cur_df.columns
        column_list = column_list.drop("user_course_id")
    
        for column in column_list:
            core_df[column] = cur_df[column]
    
    return core_df

In [22]:
X_featured = merging_dfs(X,list_of_df)
X_featured.head()

Unnamed: 0,user_course_id,female,male,age,Associate,Bachelor's,Doctorate,High,Master's,Middle,...,electrical,engineering,foreign language,history,literature,math,medicine,philosophy,physics,social science
0,24057_809,0.0,1.0,0.225607,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,670347_34,0.0,1.0,-0.396347,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,628087_34,1.0,0.0,-0.292688,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,201332_34,0.0,1.0,0.536584,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,561486_34,0.0,1.0,-0.189029,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [23]:
X_featured.shape

(42631, 28)

In [24]:
df.to_csv('../raw_data/X_featured_part1.csv')

In [None]:
## pipes not working....

# Encoding in a pipeline

In [None]:
# gender_category_edu_pipe = Pipeline([
#     ('ohe', OneHotEncoder(handle_unknown='ignore'))
# ])

In [None]:
# age_pipe = Pipeline([
#     ('age_calc', age_calc('birth')),
#     ('stdscaler', StandardScaler())
# ])

In [None]:
# gender_category_edu_pipe = Pipeline([
#     ('ohe', OneHotEncoder(handle_unknown='ignore')),
#     ('removing_duplicates', remove_duplicates('ohe'))
# ])

# action_pipe = Pipeline([
#     ('ohe', OneHotEncoder(handle_unknown='ignore')),
#     ('combining_actions', combining_actions()),
#     ('stdscaler', StandardScaler())
# ])

# age_pipe = Pipeline([
#     ('age_calc', age_calc()),
#     ('stdscaler', StandardScaler()),
#     ('removing_duplicates', remove_duplicates())
# ])

# encoding_pipe = ColumnTransformer([
#     ('gender_category_edu_pipe', gender_category_edu_pipe, ["gender", "category", "education"]),
#     ('actions', action_pipe, ["action"]),
#     ('age', age_pipe, ["birth"])
# ], remainder="drop")

In [None]:
# def set_pipeline():
#     '''returns a pipelined model'''
#     dist_pipe = Pipeline([
#         ('dist_trans', DistanceTransformer()),
#         ('stdscaler', StandardScaler())
#     ])
#     time_pipe = Pipeline([
#         ('time_enc', TimeFeaturesEncoder('pickup_datetime')),
#         ('ohe', OneHotEncoder(handle_unknown='ignore'))
#     ])
#     preproc_pipe = ColumnTransformer([
#         ('distance', dist_pipe, ["pickup_latitude", "pickup_longitude", 'dropoff_latitude', 'dropoff_longitude']),
#         ('time', time_pipe, ['pickup_datetime'])
#     ], remainder="drop")
#     pipe = Pipeline([
#         ('preproc', preproc_pipe),
#         ('linear_model', LinearRegression())
#     ])
#     return pipe