In [2]:
import sys
import os
import pandas as pd
# Put the parent of the current working directory first on the import path.
# The path is updated in place and de-duplicated so that re-running this cell
# does not keep prepending the same entry.
PROJECT_ROOT = os.path.abspath('..')
sys.path[:] = [PROJECT_ROOT] + [p for p in sys.path if p != PROJECT_ROOT]

In [11]:
from utils import settings

In [15]:
def format_file_name(file_name):
    r"""Convert a raw camelCase file name into a snake_case data name.

    Everything from the first '.' onwards is discarded. Every character that
    changes under ``str.lower()`` is replaced by an underscore followed by its
    lowercase form, e.g. ``'studentInfo.csv'`` -> ``'student_info'``. No
    underscore is inserted at the very start, so ``'StudentInfo.csv'`` also
    gives ``'student_info'``.

    :param file_name (str): raw file name
    :return: formatted file name (str)
    """
    # keep only the part before the first '.'
    stem = file_name.split('.', 1)[0]

    pieces = []
    for ch in stem:
        lowered = ch.lower()
        if lowered != ch:
            # camelCase boundary: separate words with an underscore, but do
            # not emit a leading underscore for names that start uppercase
            if pieces:
                pieces.append('_')
            pieces.append(lowered)
        else:
            pieces.append(ch)
    return ''.join(pieces)

In [16]:
def get_raw_data(file_names=None):
    r"""Load raw CSV files from ``settings.DATA_DIR`` into data frames.

        Argument:
            file_names (list): a list of file names (relative to
                ``settings.DATA_DIR``) to load. When omitted, None or empty,
                every file in ``settings.DATA_DIR`` whose extension is
                ``.csv`` is loaded.
        Return:
            data (dict): {formatted file name: data as a pandas.DataFrame},
                where the key is ``format_file_name(file_name)``
    """
    # materialise the argument so None, generators and other iterables work
    file_names = [] if file_names is None else list(file_names)

    if not file_names:  # load all files if not specified
        # compare the real extension; splitting on '.' would also accept a
        # file that is literally named 'csv'
        file_names = [
            f for f in os.listdir(settings.DATA_DIR)
            if os.path.splitext(f)[1] == '.csv'
        ]

    data = {}
    for file_name in file_names:
        file_path = os.path.join(settings.DATA_DIR, file_name)
        data_name = format_file_name(file_name)
        data[data_name] = pd.read_csv(file_path)

    return data

In [44]:
def get_assessment_by_year(years=[]):
    r"""Look up the assessments whose 'code_presentation' is in ``years``.

    :param years (list): year codes
    :return: a list of the unique 'id_assessment' values found for these years
    """
    # the assessments table maps each assessment id to its 'code_presentation'
    table = get_raw_data(['assessments.csv'])['assessments']
    in_requested_years = table['code_presentation'].isin(years)
    matching_ids = table.loc[in_requested_years, 'id_assessment']
    return list(matching_ids.unique())

In [53]:
def train_test_split(df, train_ratio=None, train_size=None, train_years=('2013B', '2013J', '2014B')):
    r"""
        Split a data frame into train and test parts by year.

        Rows belonging to ``train_years`` go to the training frame, all other
        rows go to the test frame. If ``df`` has a 'code_presentation' column
        it is matched against ``train_years`` directly; otherwise the
        'id_assessment' column is matched against the assessment ids returned
        by ``get_assessment_by_year(train_years)``.

    :param df (pd.DataFrame): dataframe to be split
    :param train_ratio: train ratio (not implemented, must be None)
    :param train_size: train size (not implemented, must be None)
    :param train_years: iterable of year codes to be included in the training set
    :return: (train_df, test_df), two independent copies of the selected rows
    :raises NotImplementedError: if train_ratio or train_size is given
    :raises ValueError: if train_years is None
    :raises KeyError: if df has neither 'code_presentation' nor 'id_assessment'

        Note: only train_years are implemented
    """

    if train_ratio is not None:
        raise NotImplementedError
    if train_size is not None:
        raise NotImplementedError
    if train_years is None:
        # ValueError is a subclass of Exception, so existing handlers still work
        raise ValueError('the training years cannot be missing')

    # the default is an immutable tuple; hand a plain list to pandas
    years = list(train_years)

    if 'code_presentation' in df:  # the tables call year as 'code_presentation'
        col = 'code_presentation'
        val = years
    elif 'id_assessment' in df:
        col = 'id_assessment'
        val = get_assessment_by_year(years)
    else:
        # fail early with a clear message instead of loading the assessments
        # table and then hitting an opaque lookup error
        raise KeyError(
            "df must contain a 'code_presentation' or 'id_assessment' column")

    train_mask = df[col].isin(val)
    train_df = df[train_mask].copy()
    test_df = df[~train_mask].copy()

    return train_df, test_df

In [54]:
# Load the tables explicitly so this cell works on a fresh kernel; called
# with no arguments, get_raw_data loads every csv file in settings.DATA_DIR.
data = get_raw_data()
data.keys()

dict_keys(['student_assessment', 'student_info', 'student_vle', 'courses', 'vle', 'student_registration', 'assessments'])

In [55]:
# Split the 'student_assessment' table using the default train_years;
# train_test_split decides internally which column the split is keyed on.
train, test = train_test_split(data['student_assessment'])

In [59]:
# (rows, columns) of the training split
train.shape

(121738, 5)

In [60]:
# (rows, columns) of the test split
test.shape

(52174, 5)