In [1]:
import polars as pl
import pandas as pd

import numpy as np
import uuid
import os
from faker import Faker
from faker.providers import BaseProvider
import random
import csv
from datetime import datetime
import functools
import threading
import time
import codecs
import queue  # or queue in Python 3
import os

In [None]:
#pd.read_parquet('path.parquet', engine='pyarrow')

In [None]:
# df = pl.read_parquet("movie.parquet").lazy()
# sql = pl.SQLContext()
# sql.register("df",df)

# sql.query(
#     """
#     SELECT * FROM df where Genre = 'Horror' AND Language = 'Hindi' and Runtime > 100
#     """
# )

In [5]:
def timer(func):
    """Print the runtime of the decorated function"""

    @functools.wraps(func)
    def wrapper_timer(*args, **kwargs):
        start_time = time.perf_counter()  # 1
        value = func(*args, **kwargs)
        end_time = time.perf_counter()  # 2
        run_time = end_time - start_time  # 3
        print(f"Finished {func.__name__!r} in {run_time:.4f} secs")
        return value

    return wrapper_timer

class GenereProvider(BaseProvider):
    def movie_genre(self):
        return random.choice(['Documentary', 'Thriller', 'Mystery', 'Horror', 'Action', 'Comedy', 'Drama', 'Romance'])

class LanguageProvider(BaseProvider):
    def language(self):
        return random.choice(['English', 'Chinese', 'Italian', 'Spanish', 'Hindi', 'Japanese'])

fake = Faker()

fake.add_provider(GenereProvider)
fake.add_provider(LanguageProvider)

# Some of this is a bit verbose now, but doing so for the sake of completion

def get_movie_name():
    words = fake.words()
    capitalized_words = list(map(str.capitalize, words))
    return ' '.join(capitalized_words)

def get_movie_date():
    return datetime.strftime(fake.date_time_this_decade(), "%B %d, %Y")

def get_movie_len():
    return random.randrange(50, 150)

def get_movie_rating():
    return round(random.uniform(1.0, 5.0), 1)

def generate_movie():
    return [get_movie_name(), fake.movie_genre(), get_movie_date(), get_movie_len(), get_movie_rating(), fake.language()]

@timer
def write_file(count = 10000000):
    with open('movie_data.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Title', 'Genre', 'Premiere', 'Runtime', 'IMDB Score', 'Language'])
        for n in range(1, count):
            writer.writerow(generate_movie())

class WriteThread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def write_files(self, rows):
        with open(output_file_name, "a") as f:
            writer = csv.writer(f)
            for row in rows:
                writer.writerow(row)
                
    def run(self):
        while True:
            result = self.queue.get()
            self.write_files(result)
            self.queue.task_done()

class ProcessThread(threading.Thread):
    def __init__(self, in_queue, out_queue):
        threading.Thread.__init__(self)
        self.in_queue = in_queue
        self.out_queue = out_queue

    def run(self):
        while True:
            size = self.in_queue.get()
            result = self.create_chunk(size)
            self.out_queue.put(result)
            self.in_queue.task_done()

    def process(self, path):
        """
        If you have multiple file_paths you want to append to single file
        Define list of paths and replace "create_chunk" with "process"
        """
        with open(path, "r") as f:
            data = f.read()
        return data

    def create_rows(self,count):
        for i in range(1,count):
            yield generate_movie()

    def create_chunk(self,size):
        return [row for row in self.create_rows(size)]

In [6]:
output_file_name = "test.csv"
row_count = 10000000
chunk_size = 100000

@timer
def write_file_threads(output_file_name,chunk_size,row_count):
    multiplier = int(row_count/chunk_size)
    with open(output_file_name, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Title', 'Genre', 'Premiere', 'Runtime', 'IMDB Score', 'Language'])

    input_queue = queue.Queue()
    result_queue = queue.Queue()

    for i in range(0, 15):
        t = ProcessThread(input_queue, result_queue)
        if i == 1:
            import time
            time.sleep(3)
        t.setDaemon(True)
        t.start()

    t = WriteThread(result_queue)
    t.setDaemon(True)
    t.start()

    for chunk_size in [chunk_size]*multiplier:
        input_queue.put(chunk_size)

    input_queue.join()
    result_queue.join()

In [7]:
write_file(count = 10000000)

Finished 'write_file' in 680.1546 secs


In [8]:
write_file_threads(output_file_name, chunk_size, row_count)

  t.setDaemon(True)
  t.setDaemon(True)


Finished 'write_file_threads' in 684.4237 secs


In [9]:
%%timeit
pl.read_csv("movie_data.csv").write_parquet("movie.parquet")

4.47 s ± 138 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
pd.read_csv("movie_data.csv").to_parquet('movie.parquet', engine='pyarrow')

13.7 s ± 225 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Read Parquet

In [18]:
%%timeit
pl.read_parquet("movie.parquet")

473 ms ± 19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%%timeit
pd.read_parquet("movie.parquet")

5.25 s ± 256 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## More polars documention

- https://www.pola.rs/
- https://pola-rs.github.io/polars-book/user-guide/