# Dealing with big data
## single node computation - possible approaches

    - generator based data ingestion and processing pipeline
    - pandas chunksize technique
    - disk + memory computing using pandas + sql

In [2]:
# generators - special type of iterators which are lazy
a = [4,5,6]
for i in a:
    print(i)

4
5
6


In [3]:
for i in a:
    print(i)

4
5
6


In [4]:
for i in a:
    print(i)

4
5
6


In [5]:
def myfun(a):
    return a+2,a+3,a+4,a+5,a+6

In [6]:
w = myfun(3)
w

(5, 6, 7, 8, 9)

In [7]:
# generator function

def myfun(a):
    yield a+2
    yield a+3
    yield a+4
    yield a+5
    yield a+6

In [8]:
w = myfun(3)
w

<generator object myfun at 0x0000024EB0027190>

In [17]:
w = myfun(3)
for k in w:
    print(k)

5
6
7
8
9


In [18]:
for k in w:
    print(k)

In [19]:
for k in w:
    print(k)

In [9]:
type(w)

generator

In [10]:
next(w)

5

In [11]:
next(w)

6

In [12]:
next(w)

7

In [13]:
next(w)

8

In [14]:
next(w)

9

In [15]:
next(w)

StopIteration: 

In [20]:
a = [7,4,2,5,6,3,9]

In [24]:
m = [i**2 for i in a]
type(m)

list

In [25]:
print(m)

[49, 16, 4, 25, 36, 9, 81]


In [21]:
k = (i**2 for i in a)
type(k)

generator

In [22]:
for i in k:
    print(i)

49
16
4
25
36
9
81


In [23]:
for i in k:
    print(i)

# Data ingestion pipeline using generator

In [26]:
file = r"D:\AI\data\datasets-1\Bank_churn_modelling.csv"

In [38]:
# objective: sum of age
gen = (line for line in open(file))
gen = (line.strip().split(",") for line in gen)
col_names = next(gen)
dict_gen = (dict(zip(col_names,row)) for row in gen)

age_gen = (int(k['Age']) for k in dict_gen)

out = 0
size = 0
for k in age_gen:
    out +=k
    size +=1
print(out/size)

38.9218


In [40]:
list(dict_gen)

[]

In [41]:
# objective: sum of age
gen = (line for line in open(file))
gen = (line.strip().split(",") for line in gen)
col_names = next(gen)
dict_gen = (dict(zip(col_names,row)) for row in gen)
list(dict_gen)

[{'RowNumber': '1',
  'CustomerId': '15634602',
  'Surname': 'Hargrave',
  'CreditScore': '619',
  'Geography': 'France',
  'Gender': 'Female',
  'Age': '42',
  'Tenure': '2',
  'Balance': '0',
  'NumOfProducts': '1',
  'HasCrCard': '1',
  'IsActiveMember': '1',
  'EstimatedSalary': '101348.88',
  'Exited': '1'},
 {'RowNumber': '2',
  'CustomerId': '15647311',
  'Surname': 'Hill',
  'CreditScore': '608',
  'Geography': 'Spain',
  'Gender': 'Female',
  'Age': '41',
  'Tenure': '1',
  'Balance': '83807.86',
  'NumOfProducts': '1',
  'HasCrCard': '0',
  'IsActiveMember': '1',
  'EstimatedSalary': '112542.58',
  'Exited': '0'},
 {'RowNumber': '3',
  'CustomerId': '15619304',
  'Surname': 'Onio',
  'CreditScore': '502',
  'Geography': 'France',
  'Gender': 'Female',
  'Age': '42',
  'Tenure': '8',
  'Balance': '159660.8',
  'NumOfProducts': '3',
  'HasCrCard': '1',
  'IsActiveMember': '0',
  'EstimatedSalary': '113931.57',
  'Exited': '1'},
 {'RowNumber': '4',
  'CustomerId': '15701354',
  

In [42]:
def data_impute(x):
    if x=="":
        return 0
    else:
        return x

In [None]:
age_gen = (int(data_impute(k['Age'])) for k in dict_gen)

## Chunksize

In [43]:
import pandas as pd

In [44]:
help(pd.read_csv)

Help on function read_csv in module pandas.io.parsers.readers:

read_csv(filepath_or_buffer: 'FilePathOrBuffer', sep=<no_default>, delimiter=None, header='infer', names=<no_default>, index_col=None, usecols=None, squeeze=False, prefix=<no_default>, mangle_dupe_cols=True, dtype: 'DtypeArg | None' = None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, cache_dates=True, iterator=False, chunksize=None, compression='infer', thousands=None, decimal: 'str' = '.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, encoding_errors: 'str | None' = 'strict', dialect=None, error_bad_lines=None, warn_bad_lines=None, on_bad_lines=None, delim_whitespace=False, low_me

In [45]:
file = r"D:\AI\data\datasets-1\Bank_churn_modelling.csv"

for df in pd.read_csv(file,chunksize=1000):
    print(df.shape)

(1000, 14)
(1000, 14)
(1000, 14)
(1000, 14)
(1000, 14)
(1000, 14)
(1000, 14)
(1000, 14)
(1000, 14)
(1000, 14)


In [47]:
file = r"D:\AI\data\datasets-1\Bank_churn_modelling.csv"

out = 0
count = 0
for df in pd.read_csv(file,chunksize=1000):
    out += df.Age.mean()
    count +=1
print(out/count)

38.921800000000005


## SQL + Pandas

In [48]:
file = r"D:\AI\data\datasets-1\Bank_churn_modelling.csv"
import sqlite3
import pandas as pd

In [50]:
conn = sqlite3.connect("bankchurn.db")

In [51]:
for df in pd.read_csv(file,chunksize=1000):
    df.to_sql(name="bank",con=conn,index=False,if_exists='append')

In [55]:
df = pd.read_sql_query("select Age from bank where Geography='France';",con=conn)
df

Unnamed: 0,Age
0,42
1,42
2,39
3,50
4,44
...,...
5009,29
5010,39
5011,35
5012,36
