# Generate DataFrame

In [None]:
import pandas as pd
import numpy as np

# Dataset sizes to benchmark: 1_000, 11_000, ..., 491_000 rows.
bins = np.arange(1000, 500_000, step=10_000)
for N in bins:
    # Build the rows from scratch for every size so that "<N>.csv" holds
    # exactly N records; accumulating rows across iterations would make each
    # file contain the rows of all smaller sizes as well.
    values = np.arange(N)
    data = {'index': values, 'value': values}
    df = pd.DataFrame(data)
    # Written to the current working directory as "<N>.csv". With the default
    # to_csv settings the DataFrame's own row index is also written as an
    # unnamed first column.
    df.to_csv(str(N) + '.csv')

# Pandas

In [None]:
import time

# One {'time': seconds, 'bin': dataset size} record per benchmarked CSV file.
pandas_results = []

for i in bins:
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short intervals; time.time() follows the wall clock, which can
    # be adjusted while the benchmark runs.
    s = time.perf_counter()
    df = pd.read_csv(str(i) + '.csv')
    e = time.perf_counter()
    pandas_time = e - s
    pandas_results.append({'time': pandas_time, 'bin': i})

# PySpark

In [None]:
import findspark
#import all the libraries of pyspark.sql
from pyspark.sql import *
#import SparkContext and SparkConf
from pyspark import SparkContext, SparkConf

# Describe the Spark application: master URL "local" and application name
# "sparkproject".
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("sparkproject")

# Reuse the SparkContext if one is already running; otherwise start one with
# the configuration above.
sc = SparkContext.getOrCreate(conf=conf)

# Obtain (or create) the SparkSession used for the CSV reads.
spark = SparkSession.builder.getOrCreate()

In [None]:
# One {'time': seconds, 'bin': dataset size} record per benchmarked CSV file.
pyspark_results = []

for i in bins:
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short intervals.
    s = time.perf_counter()
    # NOTE(review): Spark DataFrames are evaluated lazily, so this times the
    # read call (including the schema-inference pass requested through
    # inferSchema=True) rather than a full in-memory load as in pandas --
    # confirm this is the comparison intended.
    df = spark.read.csv(str(i) + '.csv', inferSchema=True, header=True)
    e = time.perf_counter()
    pyspark_time = e - s
    # Record the Spark measurement itself, not the stale `pandas_time` left
    # over from the pandas loop.
    pyspark_results.append({'time': pyspark_time, 'bin': i})

In [None]:
# Stop the SparkContext now that the Spark timings have been collected.
sc.stop()

In [None]:
# Turn each list of {'time', 'bin'} records into a DataFrame for plotting.
df_pandas = pd.DataFrame(data=pandas_results)
df_spark = pd.DataFrame(data=pyspark_results)

In [None]:
import matplotlib.pyplot as plt

# Plot load time against dataset size, one line per library. Each label must
# name the library whose measurements the line shows.
plt.plot(df_pandas['bin'], df_pandas['time'], label='pandas')
plt.plot(df_spark['bin'], df_spark['time'], label='pyspark')
plt.legend()
plt.grid()
plt.xlabel('N')
plt.ylabel('time (sec.)')
plt.title('Time elapsed to load a dataset with N records')
plt.show()