In [10]:
from typing import List, Optional

import numpy as np
import pandas as pd

In [11]:
# Columns requested from the training file.
traincols = [
    'date_time',
    'site_name',
    'posa_continent',
    'user_location_country',
    'user_location_region',
    'user_location_city',
    'orig_destination_distance',
    'user_id',
    'is_mobile',
    'is_package',
    'channel',
    'srch_ci',
    'srch_co',
    'srch_adults_cnt',
    'srch_children_cnt',
    'srch_rm_cnt',
    'srch_destination_id',
    'srch_destination_type_id',
    'is_booking',
    'cnt',
    'hotel_continent',
    'hotel_country',
    'hotel_market',
    'hotel_cluster',
]

# Columns requested from the test file: 'id' first, followed by every training
# column except 'is_booking', 'cnt' and 'hotel_cluster' (same relative order).
testcols = ['id'] + [
    col for col in traincols
    if col not in ('is_booking', 'cnt', 'hotel_cluster')
]

In [12]:
def read_csv(filename: str, cols: List[str], nrows: Optional[int] = None) -> pd.DataFrame:
    """Load selected columns of a CSV file with compact dtypes and parsed dates.

    Args:
        filename: Path of the CSV file to read.
        cols: Names of the columns to load (forwarded to ``usecols``).
        nrows: Maximum number of data rows to read; ``None`` reads the whole file.

    Returns:
        A DataFrame with the requested columns. Columns named in the dtype
        table below are read with those dtypes. Any of ``date_time``,
        ``srch_ci`` and ``srch_co`` that were requested are converted to
        datetimes, with values that cannot be parsed replaced by ``NaT``.
    """
    datecols = ['date_time', 'srch_ci', 'srch_co']

    dtypes = {
        'id': np.uint32,
        'site_name': np.uint8,
        'posa_continent': np.uint8,
        'user_location_country': np.uint16,
        'user_location_region': np.uint16,
        'user_location_city': np.uint16,
        'orig_destination_distance': np.float32,
        'user_id': np.uint32,
        'is_mobile': bool,
        'is_package': bool,
        'channel': np.uint8,
        'srch_adults_cnt': np.uint8,
        'srch_children_cnt': np.uint8,
        'srch_rm_cnt': np.uint8,
        'srch_destination_id': np.uint32,
        'srch_destination_type_id': np.uint8,
        'is_booking': bool,
        'cnt': np.uint64,
        'hotel_continent': np.uint8,
        'hotel_country': np.uint16,
        'hotel_market': np.uint16,
        'hotel_cluster': np.uint8,
    }

    # Only date columns that were actually requested can be converted.
    requested_dates = [col for col in datecols if col in cols]

    df = pd.read_csv(
        filename,
        nrows=nrows,
        usecols=cols,
        # dtype may also name columns that do not exist in this particular file.
        # Date columns are read as text here and converted below.
        dtype={**dtypes, **{col: str for col in requested_dates}},
    )

    # Dates are converted after loading instead of through read_csv's
    # `date_parser`, which is deprecated since pandas 2.0. No single format is
    # forced on all three columns: pandas >= 2 applies an explicit format
    # strictly, so forcing '%Y-%m-%d %H:%M:%S' would turn date-only values into
    # NaT. NOTE(review): presumably srch_ci/srch_co are date-only -- confirm
    # against the data files.
    for col in requested_dates:
        df[col] = pd.to_datetime(df[col], errors='coerce')

    return df

In [14]:
# destinations.csv is read with pandas defaults (no dtype overrides).
destinations = pd.read_csv("../data/destinations.csv")

# Only the first 10 million training rows are loaded; the test file is read in
# full. NOTE(review): the row cap is presumably there to limit memory -- confirm.
train = read_csv(filename="../data/train.csv", cols=traincols, nrows=10_000_000)
test = read_csv(filename="../data/test.csv", cols=testcols)

In [15]:
# Print the column dtypes of the loaded training sample to confirm that the
# dtypes requested in read_csv were applied.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 24 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   date_time                  datetime64[ns]
 1   site_name                  uint8         
 2   posa_continent             uint8         
 3   user_location_country      uint16        
 4   user_location_region       uint16        
 5   user_location_city         uint16        
 6   orig_destination_distance  float32       
 7   user_id                    uint32        
 8   is_mobile                  bool          
 9   is_package                 bool          
 10  channel                    uint8         
 11  srch_ci                    datetime64[ns]
 12  srch_co                    datetime64[ns]
 13  srch_adults_cnt            uint8         
 14  srch_children_cnt          uint8         
 15  srch_rm_cnt                uint8         
 16  srch_destination_id        uint32  

In [16]:
# Persist the loaded frames as pickles next to the source CSVs. The training
# file name records that it holds only the 10M-row sample.
# Pickle files should only ever be loaded back from trusted locations.
train.to_pickle('../data/train_10m.pkl')
test.to_pickle('../data/test.pkl')
destinations.to_pickle('../data/destinations.pkl')