### Goal: to prepare the data for exploration

In [1]:
# General Imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Warnings 
import warnings
warnings.filterwarnings("ignore")

# Monitor memory use and time
import psutil
from tqdm import tqdm

import prepare

### Reading Files

In [2]:
# Reading in data from local files
df_train = pd.read_csv('data_2000users/train.csv')
df_validate = pd.read_csv('data_2000users/validate.csv')
df_test = pd.read_csv('data_2000users/test.csv')

print("Momoery usage: ", psutil.virtual_memory().percent)

df_train.shape, df_validate.shape, df_test.shape

Momoery usage:  42.0


((411517, 18), (50842, 18), (52868, 18))

In [3]:
# Reading in data from local files
df_ques = pd.read_csv('questions_with_tag_counts.csv', index_col=0)
df_lects = pd.read_csv('lectures_with_part_name.csv', index_col=0)

print("Momoery usage: ", psutil.virtual_memory().percent)

df_ques.shape, df_lects.shape

Momoery usage:  42.3


((13523, 6), (418, 5))

### Drop columns from Kaggle questions.csv and lectures.csv

In [4]:
# Drop the columns merged from questions.csv and lectures.csv
cols = ['lecture_id', 'tag', 'lecture_part', 'type_of', 'question_id',
        'bundle_id', 'correct_answer', 'question_part', 'tags']

df_train = df_train.drop(columns = cols)
df_validate = df_validate.drop(columns = cols)
df_test = df_test.drop(columns = cols)

print("Momoery usage: ", psutil.virtual_memory().percent)

df_train.shape, df_validate.shape, df_test.shape

Momoery usage:  42.2


((411517, 9), (50842, 9), (52868, 9))

### Add features:
- user_acc_mean
- user_lectures_running_total
- avg_user_q_time

In [41]:
%%time

train = prepare.sam_train_features(df_train)
validate = prepare.sam_valtest_features(train, df_validate)
test = prepare.sam_valtest_features(train, df_test)

print("Momoery usage: ", psutil.virtual_memory().percent)

train.shape, validate.shape, test.shape

Momoery usage:  62.3
CPU times: user 2.18 s, sys: 181 ms, total: 2.36 s
Wall time: 2.35 s


((411517, 13), (50842, 12), (52868, 12))

In [42]:
train.head(1)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,q_time,avg_user_q_time
0,0,1864702,5720,0,0,1,1,,,0.630049,0,45951.0,11917302.0


In [43]:
# # Drop user ids to save memory

# train.drop(columns='user_id', inplace=True)
# validate.drop(columns='user_id', inplace=True)
# test.drop(columns='user_id', inplace=True)

# train.head(1)

### Handle nulls and the np.inf

In [44]:
train.head(2)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,q_time,avg_user_q_time
0,0,1864702,5720,0,0,1,1,,,0.630049,0,45951.0,11917302.0
1,45951,1864702,5204,0,1,1,0,inf,False,0.630049,0,28391.0,11917302.0


In [45]:
# handle nulls
train = prepare.handle_null(train)
validate = prepare.handle_null(validate)
test = prepare.handle_null(test)
    
# Handle the inf values
train = prepare.handle_inf(train)
validate = prepare.handle_inf(validate)
test = prepare.handle_inf(test)

train.head(2)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,q_time,avg_user_q_time
0,0,1864702,5720,0,0,1,1,0.0,False,0.630049,0,45951.0,11917302.0
1,45951,1864702,5204,0,1,1,0,0.0,False,0.630049,0,28391.0,11917302.0


### Seperate lecture rows and question rows

In [46]:
# Define a function to separate the lecture rows and question rows

def seperate_rows(df):
    '''
    separate the lecture rows and question rows
    '''
    mask_question = (df['answered_correctly'] != -1)
    mask_lecture = (df['answered_correctly'] == -1)
    df_question = df[mask_question]
    df_lecture = df[mask_lecture]
    return df_question, df_lecture

In [47]:
# Apply the function on train, validate, and test

train, train_lects = seperate_rows(train)
validate, validate_lects = seperate_rows(validate)
test, test_lects = seperate_rows(test)

train.shape, train_lects.shape

((403377, 13), (8140, 13))

### Merge: 
- train, validate and test with df_ques
- train_lects with df_lects

In [48]:
train.head(2)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,q_time,avg_user_q_time
0,0,1864702,5720,0,0,1,1,0.0,False,0.630049,0,45951.0,11917302.0
1,45951,1864702,5204,0,1,1,0,0.0,False,0.630049,0,28391.0,11917302.0


In [49]:
# Merge train/validate/test with df_ques

train = train.merge(df_ques, how='left', left_on='content_id', right_on='question_id')
validate = validate.merge(df_ques, how='left', left_on='content_id', right_on='question_id')
test = test.merge(df_ques, how='left', left_on='content_id', right_on='question_id')

print("Momoery usage: ", psutil.virtual_memory().percent)

train.shape, validate.shape, test.shape

Momoery usage:  62.6


((403377, 19), (49945, 18), (51971, 18))

In [50]:
# Inspect train
train.head(1)

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,q_time,avg_user_q_time,question_id,bundle_id,correct_answer,part,tags,tag_count
0,0,1864702,5720,0,0,1,1,0.0,False,0.630049,0,45951.0,11917302.0,5720,5720,1,5,115,1


**Columns to drop**
- timestamp: used to compute avg time each user to answer 1 question
- content_id (question_id): after drop the lecture rows, used to compute average question mean
- content_type_id: **drop**
- task_container_id: used to compute avereage task mean
- user_answer: **drop**
- answered_correctly: target varibale
- prior_question_elapsed_time: **drop**
- prior_question_had_explanation: used for question_had_explanation
- correct_answer: **drop**

In [51]:
# Drop the redundant column to save memory

train.drop(columns=['content_type_id', 'user_answer', 
                    'prior_question_elapsed_time', 'correct_answer'], inplace=True)
validate.drop(columns=['content_type_id', 'user_answer', 
                       'prior_question_elapsed_time', 'correct_answer'], inplace=True)
test.drop(columns=['content_type_id', 'user_answer', 
                   'prior_question_elapsed_time', 'correct_answer'], inplace=True)

print("Momoery usage: ", psutil.virtual_memory().percent)

train.shape, validate.shape, test.shape

Momoery usage:  63.4


((403377, 15), (49945, 14), (51971, 14))

In [52]:
# Inspect train dataset
train.head(1)

Unnamed: 0,timestamp,user_id,content_id,task_container_id,answered_correctly,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,q_time,avg_user_q_time,question_id,bundle_id,part,tags,tag_count
0,0,1864702,5720,0,1,False,0.630049,0,45951.0,11917302.0,5720,5720,5,115,1


### Add features:
- part accuracy
- bundle accuray

In [53]:
def part_bundle_features(train, validate, test):
    
    # Calculate the average accuracy for each unique bundle id
    bundle_accuracy = train.groupby(['bundle_id'])['answered_correctly'].mean().round(2).to_frame().reset_index()
    bundle_accuracy.columns = ['bundle_id', 'mean_bundle_accuracy']
    
    # Add bundle mean accuracy as a feature to train, validate, and test
    merged_train = train.merge(bundle_accuracy, left_on='bundle_id', right_on='bundle_id', how='left')
    merged_validate = validate.merge(bundle_accuracy, left_on='bundle_id', right_on='bundle_id', how='left')
    merged_test = test.merge(bundle_accuracy, left_on='bundle_id', right_on='bundle_id', how='left')
    
    # Calculate the average part accuracy
    tag_accuracy = train.groupby(['part'])['answered_correctly'].agg(['mean']).round(2).reset_index()
    tag_accuracy.columns = ['part', 'mean_part_accuracy']
    
    # Add average part accuracy
    train_df = merged_train.merge(tag_accuracy, left_on='part', right_on='part')
    validate_df = merged_validate.merge(tag_accuracy, left_on='part', right_on='part')
    test_df = merged_test.merge(tag_accuracy, left_on='part', right_on='part')
    
    # Calculate the mean container accuracy for each part
    tag_bundles = train.groupby(['question_id', 'task_container_id', 'part'])['answered_correctly'].mean().round(2).reset_index()
    tag_bundles.rename(columns={'answered_correctly': 'mean_container_part_accuracy'}, inplace=True)
    tag_bundles.drop(columns='question_id', inplace=True)
    
#     # Add mean container part accuracy
#     train_set = train_df.merge(tag_bundles, how='left', 
#                                left_on=['task_container_id', 'part'], 
#                                right_on=['task_container_id', 'part'])
    
#     validate_set = validate_df.merge(tag_bundles, how='left', 
#                                      left_on=['task_container_id', 'part'], 
#                                      right_on=['task_container_id', 'part'])
    
#     test_set = test_df.merge(tag_bundles, how='left', 
#                              left_on=['task_container_id', 'part'], 
#                              right_on=['task_container_id', 'part'])

    
    return train_df, validate_df, test_df

In [54]:
%%time

train, validate, test = part_bundle_features(train, validate, test)

print("Momoery usage: ", psutil.virtual_memory().percent)

train.shape, validate.shape, test.shape

Momoery usage:  64.5
CPU times: user 338 ms, sys: 93.4 ms, total: 431 ms
Wall time: 431 ms


((403377, 17), (49945, 16), (51971, 16))

In [55]:
# Inspect train
train.head(1)

Unnamed: 0,timestamp,user_id,content_id,task_container_id,answered_correctly,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,q_time,avg_user_q_time,question_id,bundle_id,part,tags,tag_count,mean_bundle_accuracy,mean_part_accuracy
0,0,1864702,5720,0,1,False,0.630049,0,45951.0,11917302.0,5720,5720,5,115,1,0.82,0.61


### Add features
- content accuracy
- task accuracy

In [56]:
%%time

train = prepare.merge_with_stats_train(train)
validate = prepare.merge_with_stats_valortest(train, validate)
test = prepare.merge_with_stats_valortest(train, test)

print("Momoery usage: ", psutil.virtual_memory().percent)

train.shape, validate.shape, test.shape

Momoery usage:  64.6
CPU times: user 7.07 s, sys: 180 ms, total: 7.25 s
Wall time: 7.31 s


((403377, 19), (49945, 18), (51971, 18))

In [57]:
# Inspect train
train.head(1)

Unnamed: 0,timestamp,user_id,content_id,task_container_id,answered_correctly,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,q_time,avg_user_q_time,question_id,bundle_id,part,tags,tag_count,mean_bundle_accuracy,mean_part_accuracy,mean_content_accuracy,mean_task_accuracy
0,0,1864702,5720,0,1,False,0.630049,0,45951.0,11917302.0,5720,5720,5,115,1,0.82,0.61,0.818182,0.682248


### Add features:
- mean tagcount accuracy
- mean tags accuracy

In [58]:
# Compute the accuracy on eveary part

mean_tagcount_accuracy = train.groupby('tag_count').answered_correctly.mean().round(2).rename('mean_tagcount_accuracy')
mean_tagcount_accuracy

tag_count
1    0.62
2    0.68
3    0.67
4    0.69
5    0.71
6    0.76
Name: mean_tagcount_accuracy, dtype: float64

In [59]:
# Compute the accuracy on eveary part

mean_tags_accuracy = train.groupby('tags').answered_correctly.mean().round(2).rename('mean_tags_accuracy')
mean_tags_accuracy.head()

tags
1                0.60
1 162            0.62
10 111 92        0.70
10 164 102       0.79
10 164 162 29    0.94
Name: mean_tags_accuracy, dtype: float64

In [60]:
# Test on train

train = train.merge(mean_tagcount_accuracy, how='left', on='tag_count')
train = train.merge(mean_tags_accuracy, how='left', on='tags')

validate = validate.merge(mean_tagcount_accuracy, how='left', on='tag_count')
validate = validate.merge(mean_tags_accuracy, how='left', on='tags')

test = test.merge(mean_tagcount_accuracy, how='left', on='tag_count')
test = test.merge(mean_tags_accuracy, how='left', on='tags')

In [61]:
train.head()

Unnamed: 0,timestamp,user_id,content_id,task_container_id,answered_correctly,prior_question_had_explanation,user_acc_mean,user_lectures_running_total,q_time,avg_user_q_time,...,bundle_id,part,tags,tag_count,mean_bundle_accuracy,mean_part_accuracy,mean_content_accuracy,mean_task_accuracy,mean_tagcount_accuracy,mean_tags_accuracy
0,0,1864702,5720,0,1,False,0.630049,0,45951.0,11917302.0,...,5720,5,115,1,0.82,0.61,0.818182,0.682248,0.62,0.79
1,45951,1864702,5204,1,0,False,0.630049,0,28391.0,11917302.0,...,5204,5,173,1,0.55,0.61,0.55,0.534988,0.62,0.65
2,74342,1864702,4094,2,1,False,0.630049,0,22436.0,11917302.0,...,4094,5,1,1,0.44,0.61,0.444444,0.445216,0.62,0.6
3,96778,1864702,9699,3,1,False,0.630049,0,36191.0,11917302.0,...,9699,5,55,1,0.41,0.61,0.40625,0.544008,0.62,0.62
4,132969,1864702,5889,4,0,False,0.630049,0,24322.0,11917302.0,...,5889,5,89,1,0.69,0.61,0.6875,0.485282,0.62,0.62


### Fill the Nulls

In [64]:
validate.fillna(0.5, inplace=True)
test.fillna(0.5, inplace=True)

validate.isnull().sum(axis=0)

timestamp                         0
user_id                           0
content_id                        0
task_container_id                 0
answered_correctly                0
prior_question_had_explanation    0
user_acc_mean                     0
user_lectures_running_total       0
avg_user_q_time                   0
question_id                       0
bundle_id                         0
part                              0
tags                              0
tag_count                         0
mean_bundle_accuracy              0
mean_part_accuracy                0
mean_content_accuracy             0
mean_task_accuracy                0
mean_tagcount_accuracy            0
mean_tags_accuracy                0
dtype: int64

### Shift prior question had explanation to current question

In [70]:
# shift prior question had explanation to current question

train.prior_question_had_explanation = train.prior_question_had_explanation.shift(-1)
validate.prior_question_had_explanation = validate.prior_question_had_explanation.shift(-1)
test.prior_question_had_explanation = test.prior_question_had_explanation.shift(-1)

train = train.rename(columns={"prior_question_had_explanation": "question_had_explanation"})
validate = validate.rename(columns={"prior_question_had_explanation": "question_had_explanation"})
test = test.rename(columns={"prior_question_had_explanation": "question_had_explanation"})

In [74]:
train.to_csv('train_exploration.csv')
validate.to_csv('validate_exploration.csv')
test.to_csv('test_exploration.csv')

In [75]:
train.columns

Index(['timestamp', 'user_id', 'content_id', 'task_container_id',
       'answered_correctly', 'question_had_explanation', 'user_acc_mean',
       'user_lectures_running_total', 'q_time', 'avg_user_q_time',
       'question_id', 'bundle_id', 'part', 'tags', 'tag_count',
       'mean_bundle_accuracy', 'mean_part_accuracy', 'mean_content_accuracy',
       'mean_task_accuracy', 'mean_tagcount_accuracy', 'mean_tags_accuracy'],
      dtype='object')

### Drop the q_time in the train

In [76]:
train_s = train.drop(columns='q_time')
train_s.head(1)

Unnamed: 0,timestamp,user_id,content_id,task_container_id,answered_correctly,question_had_explanation,user_acc_mean,user_lectures_running_total,avg_user_q_time,question_id,bundle_id,part,tags,tag_count,mean_bundle_accuracy,mean_part_accuracy,mean_content_accuracy,mean_task_accuracy,mean_tagcount_accuracy,mean_tags_accuracy
0,0,1864702,5720,0,1,False,0.630049,0,11917302.0,5720,5720,5,115,1,0.82,0.61,0.818182,0.682248,0.62,0.79


### Drop columns not needed for modeling

In [77]:
cols = ['timestamp', 'user_id', 'content_id', 'task_container_id',
        'question_id', 'bundle_id', 'part', 'tags', 'tag_count']

train_s = train_s.drop(columns=cols)
validate_s = train.drop(columns=cols)
test_s = train.drop(columns=cols)

### Convert boolean to num

In [83]:
def boolean_to_num(df):
    """
    Accepts df. Converts True and False values into 1's and 0's resepectively, within the 
    question_had_explanation column.
    """
    df = df.fillna(False)
    m = df.question_had_explanation.apply(lambda i: 1 if i == True else 0)
    df.question_had_explanation = m
    return df

In [100]:
train_s = boolean_to_num(train_s)
validate_s = boolean_to_num(validate_s)
test_s = boolean_to_num(test_s)

In [101]:
test_s.isnull().sum(axis=0)

answered_correctly                    0
question_had_explanation              0
user_acc_mean                         0
q_time                                0
mean_bundle_accuracy                  0
mean_part_accuracy                    0
mean_content_accuracy                 0
mean_task_accuracy                    0
mean_tagcount_accuracy                0
mean_tags_accuracy                    0
user_lectures_running_total_scaled    0
avg_user_q_time_scaled                0
dtype: int64

### Scale before modeling

In [91]:
def scale(train, validate, test, columns_to_scale):
    '''
    Accepts train, validate, test and list of columns to scale. Scales listed columns.
    '''
    new_column_names = [c + '_scaled' for c in columns_to_scale]
    
    scaler = MinMaxScaler()
    scaler = scaler.fit(train[columns_to_scale])

    train = pd.concat([
        train,
        pd.DataFrame(scaler.transform(train[columns_to_scale]), columns=new_column_names, index=train.index),
    ], axis=1)

    validate = pd.concat([
        validate,
        pd.DataFrame(scaler.transform(validate[columns_to_scale]), columns=new_column_names, index=validate.index),
    ], axis=1)

    test = pd.concat([
        test,
        pd.DataFrame(scaler.transform(test[columns_to_scale]), columns=new_column_names, index=test.index),
    ], axis=1)
    
    train.drop(columns=columns_to_scale, inplace=True)
    validate.drop(columns=columns_to_scale, inplace=True)
    test.drop(columns=columns_to_scale, inplace=True)
    
    return train, validate, test

In [95]:
from sklearn.preprocessing import MinMaxScaler

columns_to_scale = ['user_lectures_running_total', 'avg_user_q_time']

train_s, validate_s, test_s = scale(train_s, validate_s, test_s, columns_to_scale)

In [96]:
# Inspect train_s
train_s.head(1)

Unnamed: 0,answered_correctly,question_had_explanation,user_acc_mean,mean_bundle_accuracy,mean_part_accuracy,mean_content_accuracy,mean_task_accuracy,mean_tagcount_accuracy,mean_tags_accuracy,user_lectures_running_total_scaled,avg_user_q_time_scaled
0,1,False,0.630049,0.82,0.61,0.818182,0.682248,0.62,0.79,0.0,0.001202


### Select K Best

In [103]:
# Creating train, validate, test DFs that only include non-target variables
X_train = train_s.drop(columns='answered_correctly')
y_train = train_s['answered_correctly']

X_validate = validate_s.drop(columns='answered_correctly')
y_validate = validate_s['answered_correctly']

X_test = test_s.drop(columns='answered_correctly')
y_test = test_s['answered_correctly']

In [104]:
X_train.isnull().sum()

question_had_explanation              0
user_acc_mean                         0
mean_bundle_accuracy                  0
mean_part_accuracy                    0
mean_content_accuracy                 0
mean_task_accuracy                    0
mean_tagcount_accuracy                0
mean_tags_accuracy                    0
user_lectures_running_total_scaled    0
avg_user_q_time_scaled                0
dtype: int64

In [110]:
def KBest_ranker(X, y, n):
    '''
    Returns the top n selected features with their scores based on the SelectKBest calss
    Parameters: scaled predictors(X) in df, target(y) in df, the number of features to select(n)
    '''

    # parameters: f_regression stats test, give me 5 features
    f_selector = SelectKBest(f_classif, k=n)

    # Fit on X and y
    f_selector.fit(X, y)

    # boolean mask of whether the column was selected or not. 
    feature_score = f_selector.scores_.round(2)

    # Put the features in a dataframe
    df_features = pd.DataFrame({'features': X.columns, 
                                'score': feature_score})

    # Sort the features based on their score
    df_features.sort_values(by="score", ascending=False, inplace=True, ignore_index=True)

    # Compute how many features in X
    m = X.shape[1]
    
    # Add a rank column
    df_features['rank'] = range(1, m+1)
    
    return df_features[:n]

In [115]:
from sklearn.feature_selection import SelectKBest, f_classif

df = KBest_ranker(X_train, y_train, 10)
df

Unnamed: 0,features,score,rank
0,mean_content_accuracy,76690.75,1
1,mean_bundle_accuracy,55971.0,2
2,user_acc_mean,22419.1,3
3,mean_tags_accuracy,21599.32,4
4,mean_task_accuracy,15910.84,5
5,question_had_explanation,3954.75,6
6,mean_part_accuracy,3397.75,7
7,mean_tagcount_accuracy,2389.41,8
8,user_lectures_running_total_scaled,211.82,9
9,avg_user_q_time_scaled,116.95,10
