# DataScience Toolbox 2

___

# 1.2 Playing with iterators

In [5]:
mutants = ('charles xavier',
 'bobby drake',
 'kurt wagner',
 'max eisenhardt',
 'kitty pryde')

names = ['barton', 'stark', 'odinson', 'maximoff']

display(mutants)
display(names)

('charles xavier',
 'bobby drake',
 'kurt wagner',
 'max eisenhardt',
 'kitty pryde')

['barton', 'stark', 'odinson', 'maximoff']

In [2]:
mutants_zip = zip(mutants)
mutants_zip

<zip at 0x1b3dd7a2ec8>

In [14]:
display(*mutants_zip)

('charles xavier',)

('bobby drake',)

('kurt wagner',)

('max eisenhardt',)

('kitty pryde',)

In [16]:
*mutants_zip

SyntaxError: can't use starred expression here (<ipython-input-16-9c41d4017b5a>, line 4)

In [4]:
mutants_unzip = list(zip(mutants))
mutants_unzip

[('charles xavier',),
 ('bobby drake',),
 ('kurt wagner',),
 ('max eisenhardt',),
 ('kitty pryde',)]

In [10]:
for z1, z2 in zip(names, mutants):
    print(z1 + " - " + z2)

barton - charles xavier
stark - bobby drake
odinson - kurt wagner
maximoff - max eisenhardt


___

# 1.3 Using iterators to load large files into memory

### Loading data in chunks

- There can be too much data to hold in memory
- Solution: load data in chunks!
- Pandas function:     read_csv()
    - Specify the chunk: chunk_size

### Iterating over data

In [19]:
mutants = ('charles xavier',
 'bobby drake',
 'kurt wagner',
 'max eisenhardt',
 'kitty pryde')
powers = ('telepathy',
 'thermokinesis',
 'teleportation',
 'magnetokinesis',
 'intangibility')

# Create a zip object from mutants and powers: z1
z1 = zip(mutants, powers)

# Print the tuples in z1 by unpacking with *
print(*z1)

# Re-create a zip object from mutants and powers: z1
z1 = zip(mutants, powers)

# 'Unzip' the tuples in z1 by unpacking with * and zip(): result1, result2
result1, result2 = zip(*z1)

# Check if unpacked tuples are equivalent to original tuples
print(result1 == mutants)
print(result2 == powers)

('charles xavier', 'telepathy') ('bobby drake', 'thermokinesis') ('kurt wagner', 'teleportation') ('max eisenhardt', 'magnetokinesis') ('kitty pryde', 'intangibility')
True
True


In [28]:
mutants = ('charles xavier',
 'bobby drake',
 'kurt wagner',
 'max eisenhardt',
 'kitty pryde')
powers = ('telepathy',
 'thermokinesis',
 'teleportation',
 'magnetokinesis',
 'intangibility')

z1 = zip(mutants, powers) #create zip object
z1_list = list(z1)
number1, number2 = zip(*z1)
print(number1)
print(number2)

ValueError: not enough values to unpack (expected 2, got 0)

In [18]:
import pandas as pd

f500 = []

for chunk in pd.read_csv('F500.csv', chunksize = 250, delimiter = ';'):
    f500.append(sum(chunk['id']))
total = sum(f500)
display(total)



125750

In [17]:
import pandas as pd

total = 0 

for chunk in pd.read_csv('F500.csv', chunksize = 100, delimiter = ';'):
    total += sum(chunk['id'])

display(total)



125750

In [8]:
fortune = pd.read_csv('F500.csv', delimiter = ';')
fortune.head()

Unnamed: 0,id,rank,name,employees,previousrank,revenues,revenuechange,profits,profitschange,assets,marketvalue
0,2,1,Walmart,2300000,1.0,"$485,873",0.8%,"$13,643.0",-7.2%,"$198,825","$218,619"
1,3,2,Berkshire Hathaway,367700,4.0,"$223,604",6.1%,"$24,074.0",0.0%,"$620,854","$411,035"
2,4,3,Apple,116000,3.0,"$215,639",-7.7%,"$45,687.0",-14.4%,"$321,686","$753,718"
3,5,4,Exxon Mobil,72700,2.0,"$205,004",-16.7%,"$7,840.0",-51.5%,"$330,314","$340,056"
4,6,5,McKesson,68000,5.0,"$192,487",6.2%,"$2,258.0",53.0%,"$56,563","$31,439"


In [19]:
def new_function(data, col_name, chunk_size, dlr = ';'):
    
    total = 0
       
    for entry in pd.read_csv(data, chunksize = chunk_size, delimiter = dlr):
        total += sum(entry[col_name])
    return total

x = new_function('F500.csv', 'id', 100)
x

125750

In [21]:
def find_len_of_table(data, col_name, chunk_size, dlr =';'):
    total = 0
    for entry in pd.read_csv(data, chunksize = chunk_size, delimiter = dlr):
        total += len(entry[col_name])
    return total

y = find_len_of_table('F500.csv', 'id', 10)
y

500

___

# List compehensions

In [24]:
nums = [12, 8, 21, 3, 16]
new_nums = []
for num in nums:
    new_nums.append(num + 1)
    
new_nums

[13, 9, 22, 4, 17]

In [26]:
new_nums_lt = [num * 10 for num in nums]
new_nums_lt

[120, 80, 210, 30, 160]

In [27]:
avengers = ['hawkeye', 'iron man', 'thor', 'quicksilver']
new_avengers = [avg + ' i am alive' for avg in avengers]
new_avengers

['hawkeye i am alive',
 'iron man i am alive',
 'thor i am alive',
 'quicksilver i am alive']

In [51]:
count_avengers = [(str(x) + " " + y) for x in range(3) for y in avengers]
count_avengers

['0 hawkeye',
 '0 iron man',
 '0 thor',
 '0 quicksilver',
 '1 hawkeye',
 '1 iron man',
 '1 thor',
 '1 quicksilver',
 '2 hawkeye',
 '2 iron man',
 '2 thor',
 '2 quicksilver']

In [42]:
result = [num for num in range(1, 10, 2)]
result

[1, 3, 5, 7, 9]

In [48]:
pairs = []
for num1 in range(10, 20):
    for num2 in range(25, 50):
      pairs.append(num1, num2)
pairs

TypeError: append() takes exactly one argument (2 given)

In [50]:
pairs_1 = []
for num1 in range(0, 2):
    for num2 in range(6, 8):
        pairs_1.append(num1, num2)
print(pairs_1)

TypeError: append() takes exactly one argument (2 given)

In [53]:
pairs_2 = [(num5, num6) for num5 in range(50, 52) for num6 in range(100, 103)]
pairs_2

[(50, 100), (50, 101), (50, 102), (51, 100), (51, 101), (51, 102)]

In [55]:
x = [num * 2 for num in range(4, 50) if num%4 == 0]
x

[8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96]

In [58]:
avengers = ['hawkeye', 'iron man', 'thor', 'quicksilver']
les_av = [num for num in avengers if len(num) <=5]
les_av

['thor']

In [63]:
american_names = {'address': 'Downtown', 'age': 27, 'name': 'Jack'}

amer_dct = {x:  y for x, y in american_names}
amer_dct


ValueError: too many values to unpack (expected 2)

In [3]:
doctor = ['house', 'cuddy', 'chase', 'thirteen', 'wilson']
flash = ['jay garrick', 'barry allen', 'wally west', 'bart allen']
x = [[dt + " - доктор" for dt in doctor] for y in range(0,3)]
x

[['house - доктор',
  'cuddy - доктор',
  'chase - доктор',
  'thirteen - доктор',
  'wilson - доктор'],
 ['house - доктор',
  'cuddy - доктор',
  'chase - доктор',
  'thirteen - доктор',
  'wilson - доктор'],
 ['house - доктор',
  'cuddy - доктор',
  'chase - доктор',
  'thirteen - доктор',
  'wilson - доктор']]

___

# Introduction to generator expressions

In [4]:
x = (y for y in range(4))
x

<generator object <genexpr> at 0x0000024464D8A5E8>

In [6]:
z = list(x)
z

[0, 1, 2, 3]

In [13]:
xy = zip(x)
xy
xyz = list(xy)
xyz

[]

In [14]:
x = (y for y in range(4))

for num in x:
    print(num)

0
1
2
3


In [22]:
x = (y for y in range(4))

print(next(x))
print(next(x))
print(next(x))
print(next(x))

0
1
2
3


In [26]:
x = (y for y in range(5,45) if y%3 ==0 )
list(x)

[6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42]

In [34]:
def num_sequence(x):
    i = 0
    while i < x:
        yield i
        i += 3
    return i

list(num_sequence(10))

[0, 3, 6, 9]

In [39]:
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']

def get_length(lt):
    for x in lt:
        yield len(x)
 
list(get_length(lannister))

[6, 5, 5, 6, 7]

In [48]:
import pandas as pd

df = pd.read_csv('F500.csv', delimiter = ';')
df.head()

df_revenue = df['revenues']
df_revenue.head()

df_rev_new = [x[1:4] for x in df_revenue]
df_rev_new[0:10]

['485', '223', '215', '205', '192', '184', '177', '166', '163', '151']

In [50]:
df.head()

Unnamed: 0,id,rank,name,employees,previousrank,revenues,revenuechange,profits,profitschange,assets,marketvalue
0,2,1,Walmart,2300000,1.0,"$485,873",0.8%,"$13,643.0",-7.2%,"$198,825","$218,619"
1,3,2,Berkshire Hathaway,367700,4.0,"$223,604",6.1%,"$24,074.0",0.0%,"$620,854","$411,035"
2,4,3,Apple,116000,3.0,"$215,639",-7.7%,"$45,687.0",-14.4%,"$321,686","$753,718"
3,5,4,Exxon Mobil,72700,2.0,"$205,004",-16.7%,"$7,840.0",-51.5%,"$330,314","$340,056"
4,6,5,McKesson,68000,5.0,"$192,487",6.2%,"$2,258.0",53.0%,"$56,563","$31,439"


In [54]:
df_rev_dol = [x[0]==' 'for x in df_revenue]
df_rev_dol[0:10]

[False, False, False, False, False, False, False, False, False, False]

___

# Bringing it all together

In [11]:
avengers = ['hawkeye', 'iron man', 'thor', 'quicksilver']
names = ['barton', 'stark', 'odinson', 'maximoff']

def list2dict(list1, list2):
    dict={}
    for x in list1:
        dict[x] = x
        for y in list2:
             dict.val[y] = y
    return dict

z = list2dict(avengers, names)
z

{'hawkeye': 'hawkeye',
 'barton': 'barton',
 'stark': 'stark',
 'odinson': 'odinson',
 'maximoff': 'maximoff',
 'iron man': 'iron man',
 'thor': 'thor',
 'quicksilver': 'quicksilver'}

In [3]:
with open ('F500.csv') as file:
    file.readline()
    counts_dict = {}
    for j in range(0,100):
        line = file.readline().split(',')
        first_col = line[0]
        if first_col in counts_dict.keys():
            counts_dict[first_col] += 1
        else:
            counts_dict[first_col] = 1

print(counts_dict)

{'"2";"1";"Walmart";"2': 1, '"3";"2";"Berkshire Hathaway";"367': 1, '"4";"3";"Apple";"116': 1, '"5";"4";"Exxon Mobil";"72': 1, '"6";"5";"McKesson";"68': 1, '"7";"6";"UnitedHealth Group";"230': 1, '"8";"7";"CVS Health";"204': 1, '"9";"8";"General Motors";"225': 1, '"10";"9";"AT&T";"268': 1, '"11";"10";"Ford Motor";"201': 1, '"12";"11";"AmerisourceBergen";"18': 1, '"13";"12";"Amazon.com";"341': 1, '"14";"13";"General Electric";"295': 1, '"15";"14";"Verizon";"160': 1, '"16";"15";"Cardinal Health";"37': 1, '"17";"16";"Costco";"172': 1, '"18";"17";"Walgreens Boots Alliance";"300': 1, '"19";"18";"Kroger";"443': 1, '"20";"19";"Chevron";"55': 1, '"21";"20";"Fannie Mae";"7': 1, '"22";"21";"J.P. Morgan Chase";"243': 1, '"23";"22";"Express Scripts Holding";"25': 1, '"24";"23";"Home Depot";"406': 1, '"25";"24";"Boeing";"150': 1, '"26";"25";"Wells Fargo";"269': 1, '"27";"26";"Bank of America Corp.";"208': 1, '"28";"27";"Alphabet";"72': 1, '"29";"28";"Microsoft";"114': 1, '"30";"29";"Anthem";"53': 1

In [96]:
import pandas as pd

df500 = pd.read_csv('F501.csv', delimiter = ';', chunksize = 250)
first_chunk = next(df500)
first_chunk

y = first_chunk[first_chunk['revenues']>99]
y
z = y[y['marketvalue']>50]
z
#zy = y.replace(to_replace= '    $', value=' ', regex = True)
#zy
    



TypeError: '>' not supported between instances of 'str' and 'int'