In [1]:
import csv
import pandas as pd
from datetime import datetime

In [2]:
# Load only the first 500,000 rows of the raw training file and save them,
# without the DataFrame index, as a smaller working copy.
train_df = pd.read_csv(filepath_or_buffer='train.csv', nrows=500000)
train_df.to_csv(path_or_buf='train_500k.csv', mode='w', index=False)

In [3]:
def clean_data(input_data_path='train_500k.csv', output_data_path='train_cleaned.csv'):
    """Copy the rows of a raw CSV that pass basic sanity checks into a new CSV.

    The first line of the input is treated as a header and dropped, so the
    output file has no header. A row is written, unmodified, only when all of
    the following hold:

    * it has exactly eight fields;
    * fields 1 and 3-7 (fare, pickup lon/lat, dropoff lon/lat, passenger
      count) parse as floats;
    * both longitudes lie in [-76, -72] and both latitudes in [38, 42];
    * the passenger count lies in [1, 6] and the fare in (0, 300];
    * pickup and dropoff differ in longitude and also differ in latitude.

    Args:
        input_data_path: Path of the CSV file to read.
        output_data_path: Path of the CSV file to create or overwrite.
    """
    with open(input_data_path, 'r', newline='') as inp, \
            open(output_data_path, 'w', newline='') as out:
        writer = csv.writer(out)
        csv_reader = csv.reader(inp)
        # Skip header; the default keeps an empty input file from raising
        # StopIteration.
        next(csv_reader, None)
        for row in csv_reader:
            # Rows with a missing or extra field are discarded outright.
            if len(row) != 8:
                continue
            try:
                fare_amount = float(row[1])
                pickup_longitude = float(row[3])
                pickup_latitude = float(row[4])
                dropoff_longitude = float(row[5])
                dropoff_latitude = float(row[6])
                passenger_count = float(row[7])
            except ValueError:
                # Empty or non-numeric field: drop the row. Only parse errors
                # are swallowed, so write failures and interrupts still surface.
                continue
            # NOTE(review): the last two conditions drop a trip when *either*
            # coordinate is unchanged, not only when both are. Confirm whether
            # "pickup == dropoff" was the intended filter.
            if ((-76 <= pickup_longitude <= -72) and (-76 <= dropoff_longitude <= -72) and
                    (38 <= pickup_latitude <= 42) and (38 <= dropoff_latitude <= 42) and
                    (1 <= passenger_count <= 6) and (0 < fare_amount <= 300) and
                    (pickup_longitude != dropoff_longitude) and (pickup_latitude != dropoff_latitude)):
                writer.writerow(row)

In [4]:
def pre_process_train_data(input_data_path, output_data_path):
    """Append pickup-time features to every row of a header-less CSV.

    Every line of the input is processed as data (no header is skipped).
    Field 2 of each row is parsed with the format '%Y-%m-%d %H:%M:%S %Z' and
    seven values are appended to the row, in this order: year, month, day,
    hour, weekday (Monday is 0), night, late_night.

    * night is 1 when the hour is in 16..20 and the day is Monday-Friday.
    * late_night is 1 when the hour is <= 6 or >= 20.

    Args:
        input_data_path: Path of the CSV file to read.
        output_data_path: Path of the CSV file to create or overwrite.

    Raises:
        ValueError: If a timestamp does not match the expected format.
        IndexError: If a row has fewer than three fields.
    """
    with open(input_data_path, 'r', newline='') as inp, \
            open(output_data_path, 'w', newline='') as out:
        writer = csv.writer(out)
        for row in csv.reader(inp):
            pickup_datetime = datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S %Z')
            hour = pickup_datetime.hour
            weekday = pickup_datetime.weekday()
            # The previous test `(hour <= 20) or (hour >= 16)` was true for
            # every hour, so the flag reduced to "is a weekday". Both bounds
            # must hold together to describe the 16..20 window.
            night = int(16 <= hour <= 20 and weekday < 5)
            late_night = int(hour <= 6 or hour >= 20)
            row.extend([
                pickup_datetime.year,
                pickup_datetime.month,
                pickup_datetime.day,
                hour,
                weekday,
                night,
                late_night,
            ])
            writer.writerow(row)

In [5]:
def pre_process_test_data(input_data_path, output_data_path):
    """Append pickup-time features to every data row of a CSV with a header.

    The first line of the input is treated as a header and dropped, so the
    output file has no header. Field 1 of each remaining row is parsed with
    the format '%Y-%m-%d %H:%M:%S %Z' and seven values are appended to the
    row, in this order: year, month, day, hour, weekday (Monday is 0), night,
    late_night.

    * night is 1 when the hour is in 16..20 and the day is Monday-Friday.
    * late_night is 1 when the hour is <= 6 or >= 20.

    Args:
        input_data_path: Path of the CSV file to read.
        output_data_path: Path of the CSV file to create or overwrite.

    Raises:
        ValueError: If a timestamp does not match the expected format.
        IndexError: If a data row has fewer than two fields.
    """
    with open(input_data_path, 'r', newline='') as inp, \
            open(output_data_path, 'w', newline='') as out:
        writer = csv.writer(out)
        csv_reader = csv.reader(inp)
        # Skip header; the default keeps an empty input file from raising
        # StopIteration.
        next(csv_reader, None)
        for row in csv_reader:
            pickup_datetime = datetime.strptime(row[1], '%Y-%m-%d %H:%M:%S %Z')
            hour = pickup_datetime.hour
            weekday = pickup_datetime.weekday()
            # The previous test `(hour <= 20) or (hour >= 16)` was true for
            # every hour, so the flag reduced to "is a weekday". Both bounds
            # must hold together to describe the 16..20 window.
            night = int(16 <= hour <= 20 and weekday < 5)
            late_night = int(hour <= 6 or hour >= 20)
            row.extend([
                pickup_datetime.year,
                pickup_datetime.month,
                pickup_datetime.day,
                hour,
                weekday,
                night,
                late_night,
            ])
            writer.writerow(row)

In [6]:
def split_data(input_data_path, train_data_path,
               validation_data_path, ratio=30):
    """Distribute the rows of one CSV file over a training and a validation CSV.

    Rows are numbered from zero in input order. A row whose number is a
    multiple of ``ratio`` is written to the validation file; every other row
    is written to the training file. All lines are treated as data, so no
    header is skipped.

    Args:
        input_data_path: Path of the CSV file to read.
        train_data_path: Path of the training CSV to create or overwrite.
        validation_data_path: Path of the validation CSV to create or overwrite.
        ratio: Every ``ratio``-th row (starting with the first) is sent to
            the validation file.
    """
    with open(input_data_path, 'r') as source, \
            open(train_data_path, 'w', newline='') as train_file, \
            open(validation_data_path, 'w', newline='') as validation_file:
        train_writer = csv.writer(train_file)
        validation_writer = csv.writer(validation_file)
        for index, record in enumerate(csv.reader(source)):
            # Pick the destination first, then emit the row exactly once.
            target = validation_writer if index % ratio == 0 else train_writer
            target.writerow(record)

In [7]:
# Run the stages in order; each one reads the file written by the previous one
# (the test file is processed independently of the training chain).
clean_data(input_data_path='train_500k.csv', output_data_path='train_cleaned.csv')
pre_process_train_data(input_data_path='train_cleaned.csv',
                       output_data_path='train_processed.csv')
pre_process_test_data(input_data_path='test.csv',
                      output_data_path='test_processed.csv')
split_data(
    input_data_path='train_processed.csv',
    train_data_path='tf_train.csv',
    validation_data_path='tf_validation.csv',
    ratio=30,
)