In [2]:
!pip install pandas pyarrow fastavro pyyaml dicttoxml --break-system-packages

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting pyarrow
  Downloading pyarrow-18.1.0-cp311-cp311-manylinux_2_28_x86_64.whl (40.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
Collecting fastavro
  Downloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0mm
Collecting dicttoxml
  Downloading dicttoxml-1.7.16-py3-none-any.whl (24 kB)
Collecting tzdata>=2

In [11]:
import pandas as pd
import pyarrow as pa
import pyarrow.orc as orc
import pyarrow.parquet as pq
import json
import fastavro
import yaml
from dicttoxml import dicttoxml
import xml.dom.minidom

# Load the source CSV into a pandas DataFrame. `filename` is the shared stem
# (no extension) that the output files written at the end of this cell reuse.
# 'Zip Code' is read as text so pandas does not infer a numeric dtype for it:
# numeric inference drops leading zeros and, when values are missing, turns
# the column into floats that later stringify as e.g. '94107.0'.
filename = 'trips'
df = pd.read_csv(f'{filename}.csv', dtype={'Zip Code': str})

# Convert to ORC
def convert_to_orc(df, filename):
    """Write ``df`` to ``filename`` in ORC format.

    The frame is converted to a pyarrow Table, which is then handed to
    pyarrow's ORC writer.

    Args:
        df: pandas DataFrame to export.
        filename: Destination path of the ORC file.
    """
    orc.write_table(pa.Table.from_pandas(df), filename)

# Prepare 'Zip Code' for export: the Avro schema below declares it as a plain
# string, so missing values are replaced with '' and every value is coerced to
# str. This mutates `df` in place, so every output format sees the result.
df['Zip Code'] = df['Zip Code'].fillna('').map(str)

# Convert to Avro
def convert_to_avro(df, filename):
    """Write ``df`` to ``filename`` as Avro, one "Trip" record per row.

    The schema is fixed: it names eleven columns and types each one as either
    "int" or "string". Rows are passed to fastavro as plain dicts, so the
    frame's values must already match those types (the cell stringifies
    'Zip Code' before this function is called).

    Args:
        df: pandas DataFrame holding the trip columns named in the schema.
        filename: Destination path of the Avro file (opened in binary mode).
    """
    # (column name, Avro type) pairs, in the order they appear in the schema.
    column_types = [
        ("Trip ID", "int"),
        ("Duration", "int"),
        ("Start Date", "string"),
        ("Start Station", "string"),
        ("Start Terminal", "int"),
        ("End Date", "string"),
        ("End Station", "string"),
        ("End Terminal", "int"),
        ("Bike #", "int"),
        ("Subscriber Type", "string"),
        ("Zip Code", "string"),
    ]
    trip_schema = {
        "type": "record",
        "name": "Trip",
        "fields": [
            {"name": column, "type": avro_type}
            for column, avro_type in column_types
        ],
    }
    rows = df.to_dict(orient='records')
    with open(filename, 'wb') as sink:
        fastavro.writer(sink, trip_schema, rows)


# Convert to Parquet
def convert_to_parquet(df, filename):
    """Write ``df`` to ``filename`` in Parquet format.

    The frame is converted to a pyarrow Table, which is then handed to
    pyarrow's Parquet writer.

    Args:
        df: pandas DataFrame to export.
        filename: Destination path of the Parquet file.
    """
    pq.write_table(pa.Table.from_pandas(df), filename)

# Convert to TSV
def convert_to_tsv(df, filename):
    """Write ``df`` to ``filename`` as tab-separated text.

    The header row is written; the DataFrame index is not.

    Args:
        df: pandas DataFrame to export.
        filename: Destination path of the TSV file.
    """
    tsv_options = {'sep': '\t', 'index': False}
    df.to_csv(filename, **tsv_options)

# Convert to YAML
def convert_to_yaml(df, filename):
    """Write ``df`` to ``filename`` as a YAML list with one mapping per row.

    Args:
        df: pandas DataFrame to export.
        filename: Destination path of the YAML file (opened in text mode).
    """
    with open(filename, 'w') as stream:
        rows = df.to_dict(orient='records')
        yaml.dump(rows, stream)

# Convert to JSON
def convert_to_json(df, filename):
    """Write ``df`` to ``filename`` as a single JSON array of row objects.

    The output is one indented (4-space) JSON document, not JSON Lines.

    Args:
        df: pandas DataFrame to export.
        filename: Destination path of the JSON file.
    """
    json_options = {'orient': 'records', 'lines': False, 'indent': 4}
    df.to_json(filename, **json_options)

# Convert to XML
def convert_to_xml(df, filename):
    """Write ``df`` to ``filename`` as pretty-printed XML.

    Rows are serialised by dicttoxml under a ``Trips`` root element with type
    attributes disabled, then re-parsed with minidom so the document can be
    indented before it is written.

    Args:
        df: pandas DataFrame to export.
        filename: Destination path of the XML file.
    """
    records = df.to_dict(orient='records')
    xml_data = dicttoxml(records, custom_root='Trips', attr_type=False)
    dom = xml.dom.minidom.parseString(xml_data)
    pretty_xml_as_string = dom.toprettyxml()

    # toprettyxml() without an encoding returns text whose XML declaration
    # names no encoding, which parsers read as UTF-8. Write UTF-8 explicitly
    # instead of relying on the platform default, which can mis-encode or
    # reject non-ASCII characters.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(pretty_xml_as_string)

# Export the frame once per target format, each file named after the shared
# stem. Converters run in the order listed.
_conversions = (
    (convert_to_orc, f'{filename}.orc'),
    (convert_to_avro, f'{filename}.avro'),
    (convert_to_parquet, f'{filename}.parquet'),
    (convert_to_tsv, f'{filename}.tsv'),
    (convert_to_yaml, f'{filename}.yaml'),
    (convert_to_json, f'{filename}.json'),
    (convert_to_xml, f'{filename}.xml'),
)
for _convert, _target in _conversions:
    _convert(df, _target)

In [3]:
import pandas as pd
import pyarrow as pa
import pyarrow.orc as orc
import pyarrow.parquet as pq
import json
import fastavro
import yaml
from dicttoxml import dicttoxml
import xml.dom.minidom

# Load the trimmed CSV into a pandas DataFrame. `filename` is the shared stem
# (no extension) that the output files written at the end of this cell reuse.
# 'Zip Code' is read as text so pandas does not infer a numeric dtype for it:
# numeric inference drops leading zeros and, when values are missing, turns
# the column into floats that later stringify as e.g. '94107.0'.
filename = './trimmed/trips_trimmed'
df = pd.read_csv(f'{filename}.csv', dtype={'Zip Code': str})

# Convert to ORC
def convert_to_orc(df, filename):
    """Write ``df`` to ``filename`` in ORC format.

    The frame is converted to a pyarrow Table, which is then handed to
    pyarrow's ORC writer.

    Args:
        df: pandas DataFrame to export.
        filename: Destination path of the ORC file.
    """
    orc.write_table(pa.Table.from_pandas(df), filename)

# Prepare 'Zip Code' for export: the Avro schema below declares it as a plain
# string, so missing values are replaced with '' and every value is coerced to
# str. This mutates `df` in place, so every output format sees the result.
df['Zip Code'] = df['Zip Code'].fillna('').map(str)

# Convert to Avro
def convert_to_avro(df, filename):
    """Write ``df`` to ``filename`` as Avro, one "Trip" record per row.

    The schema is fixed: it names eleven columns and types each one as either
    "int" or "string". Rows are passed to fastavro as plain dicts, so the
    frame's values must already match those types (the cell stringifies
    'Zip Code' before this function is called).

    Args:
        df: pandas DataFrame holding the trip columns named in the schema.
        filename: Destination path of the Avro file (opened in binary mode).
    """
    # (column name, Avro type) pairs, in the order they appear in the schema.
    column_types = [
        ("Trip ID", "int"),
        ("Duration", "int"),
        ("Start Date", "string"),
        ("Start Station", "string"),
        ("Start Terminal", "int"),
        ("End Date", "string"),
        ("End Station", "string"),
        ("End Terminal", "int"),
        ("Bike #", "int"),
        ("Subscriber Type", "string"),
        ("Zip Code", "string"),
    ]
    trip_schema = {
        "type": "record",
        "name": "Trip",
        "fields": [
            {"name": column, "type": avro_type}
            for column, avro_type in column_types
        ],
    }
    rows = df.to_dict(orient='records')
    with open(filename, 'wb') as sink:
        fastavro.writer(sink, trip_schema, rows)


# Convert to Parquet
def convert_to_parquet(df, filename):
    """Write ``df`` to ``filename`` in Parquet format.

    The frame is converted to a pyarrow Table, which is then handed to
    pyarrow's Parquet writer.

    Args:
        df: pandas DataFrame to export.
        filename: Destination path of the Parquet file.
    """
    pq.write_table(pa.Table.from_pandas(df), filename)

# Convert to TSV
def convert_to_tsv(df, filename):
    """Write ``df`` to ``filename`` as tab-separated text.

    The header row is written; the DataFrame index is not.

    Args:
        df: pandas DataFrame to export.
        filename: Destination path of the TSV file.
    """
    tsv_options = {'sep': '\t', 'index': False}
    df.to_csv(filename, **tsv_options)

# Convert to YAML
def convert_to_yaml(df, filename):
    """Write ``df`` to ``filename`` as a YAML list with one mapping per row.

    Args:
        df: pandas DataFrame to export.
        filename: Destination path of the YAML file (opened in text mode).
    """
    with open(filename, 'w') as stream:
        rows = df.to_dict(orient='records')
        yaml.dump(rows, stream)

# Convert to JSON
def convert_to_json(df, filename):
    """Write ``df`` to ``filename`` as a single JSON array of row objects.

    The output is one indented (4-space) JSON document, not JSON Lines.

    Args:
        df: pandas DataFrame to export.
        filename: Destination path of the JSON file.
    """
    json_options = {'orient': 'records', 'lines': False, 'indent': 4}
    df.to_json(filename, **json_options)

# Convert to XML
def convert_to_xml(df, filename):
    """Write ``df`` to ``filename`` as pretty-printed XML.

    Rows are serialised by dicttoxml under a ``Trips`` root element with type
    attributes disabled, then re-parsed with minidom so the document can be
    indented before it is written.

    Args:
        df: pandas DataFrame to export.
        filename: Destination path of the XML file.
    """
    records = df.to_dict(orient='records')
    xml_data = dicttoxml(records, custom_root='Trips', attr_type=False)
    dom = xml.dom.minidom.parseString(xml_data)
    pretty_xml_as_string = dom.toprettyxml()

    # toprettyxml() without an encoding returns text whose XML declaration
    # names no encoding, which parsers read as UTF-8. Write UTF-8 explicitly
    # instead of relying on the platform default, which can mis-encode or
    # reject non-ASCII characters.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(pretty_xml_as_string)

# Export the frame once per target format, each file named after the shared
# stem. Converters run in the order listed.
_conversions = (
    (convert_to_orc, f'{filename}.orc'),
    (convert_to_avro, f'{filename}.avro'),
    (convert_to_parquet, f'{filename}.parquet'),
    (convert_to_tsv, f'{filename}.tsv'),
    (convert_to_yaml, f'{filename}.yaml'),
    (convert_to_json, f'{filename}.json'),
    (convert_to_xml, f'{filename}.xml'),
)
for _convert, _target in _conversions:
    _convert(df, _target)