In [1]:
def split_csv(src_csv, n_row, target_dir, prefix='part'):
    """
    Splice `src_csv` into several smaller csv. Each csv contains `n_row` except for the last csv.
    This implementation uses `pandas`.
    @param      src_csv (string): source path.
    @param      n_row (int): how many row for each csv part.
    @param      target_dir (string): path directory to store all file parts.
    @return     (boolean): True if success
    """
    import os
    import pandas as pd
    from tqdm import tqdm
    if n_row <= 0:
        return False
    if not os.path.exists(src_csv):
        raise FileNotFoundError()
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    _counter = 0
    _part_counter = 0
    df = pd.read_csv(src_csv)
    _length = len(df)
    _columns = df.columns.tolist()
    temp_df = pd.DataFrame(columns=_columns)
    for i, r in tqdm(df.iterrows(), total=df.shape[0]):
        _counter += 1
        temp_df = pd.concat([temp_df, r])
        if _counter % n_row == 0:
            _part = _counter / n_row
            _part_counter += 1
            _path = "{}/{}_{}.csv".format(target_dir, prefix, _part)
            temp_df.to_csv(_path, index=False)
            temp_df = pd.DataFrame(columns=_columns) # Reinitialize temp dataframe.
        else:
            # Reach the last element.
            if _counter == _length:
                _part_counter += 1
                temp_df.to_csv("{}/{}_{}.csv".format(target_dir, prefix, _part_counter))
    #endfor
    return True



In [3]:
from data_dir import workspace_dir
parts_dir = "{}/parts".format(workspace_dir)
src_csv = "{}/train.csv".format(workspace_dir)
print("Splitting data {}: {}".format(src_csv, split_csv(src_csv, 750000, parts_dir)))

  7%|▋         | 749999/10234978 [15:28:22<346:18:23,  7.61it/s] 