In [1]:
# Read the whole CSV into memory as UTF-8 text; the `with` block closes the file.
with open(r"/data/stocks.small.csv", mode="r", encoding="utf-8") as f:
    content = f.read()

# Split on "\n" only, so a final newline in the file leaves one empty trailing entry.
lines = content.split("\n")

In [2]:
# Total entries in `lines`, counting the header row and any empty strings left by the split.
len(lines)

1848

In [4]:
# Positions of empty entries in `lines` (an empty string is falsy).
[idx for idx, text in enumerate(lines) if not text]

[1847]

In [5]:
# Peek at the first entries: the header row followed by the first data rows.
lines[:5]

['date,open,high,low,close,volume,adjclose,symbol',
 '2000-07-18,144.8125,144.828125,141.4375,143.0,5.06836E7,50.155473,INTC',
 '2000-07-20,32.93751,34.25001,32.8125,33.75,3288300.0,8.789734,BEN',
 '2000-07-24,64.25,67.312477,64.187523,64.75,948800.0,7.689567,APH',
 '2000-07-26,21.875,22.125,20.9375,20.9375,1464300.0,15.61832,SHW']

In [6]:
# Drop the header (index 0) and keep only the non-empty rows.
data = [row for row in lines[1:] if row]

In [7]:
# Number of data rows left after dropping the header and empty lines.
len(data)

1846

In [8]:
# First few data rows; each one is still a raw comma-separated string.
data[:5]

['2000-07-18,144.8125,144.828125,141.4375,143.0,5.06836E7,50.155473,INTC',
 '2000-07-20,32.93751,34.25001,32.8125,33.75,3288300.0,8.789734,BEN',
 '2000-07-24,64.25,67.312477,64.187523,64.75,948800.0,7.689567,APH',
 '2000-07-26,21.875,22.125,20.9375,20.9375,1464300.0,15.61832,SHW',
 '2000-07-26,42.0,42.312481,41.625,41.875,1397600.0,9.402721,STJ']

In [10]:
# The symbol is the last comma-separated field of each row; a set keeps one copy of each.
unique_symbols = {row.rsplit(",", 1)[-1] for row in data}

In [12]:
# How many distinct values appear in the last (symbol) column.
len(unique_symbols)

471

In [13]:
# Count how many rows each symbol has: symbol -> number of rows.
frequencies = {}
for row in data:
    ticker = row.rsplit(",", 1)[-1]  # last field of the row is the symbol
    frequencies[ticker] = frequencies.get(ticker, 0) + 1

In [20]:
# Sample three (symbol, count) pairs; dicts keep insertion order, so these are the first symbols inserted.
list(frequencies.items())[:3]

[('INTC', 4), ('BEN', 6), ('APH', 3)]

In [17]:
# Five symbols with the most rows: sorting on the negated count puts the largest first,
# and the stable sort keeps equal counts in their original dict order.
sorted(frequencies.items(), key=lambda pair: -pair[1])[:5]

[('UPS', 11), ('ZBH', 11), ('K', 10), ('FLIR', 9), ('KR', 9)]

In [23]:
# Group the volume column by symbol: symbol -> list of volumes, in row order.
volumes_by_symbol = {}
for row in data:
    fields = row.split(",")
    ticker = fields[-1]
    # Column 5 is `volume`; convert before touching the dict so a bad value adds nothing.
    traded = float(fields[5])
    volumes_by_symbol.setdefault(ticker, []).append(traded)

In [24]:
# Arithmetic mean of the collected volumes for every symbol: symbol -> average volume.
avg_volume_by_symbol = {
    ticker: sum(volumes) / len(volumes)
    for ticker, volumes in volumes_by_symbol.items()
}

In [27]:
# Print the full symbol -> average volume mapping (one entry per symbol, so the output is long).
print(avg_volume_by_symbol)

{'INTC': 74368550.0, 'BEN': 3317050.0, 'APH': 1573366.6666666667, 'SHW': 1028760.0, 'STJ': 1341333.3333333333, 'GGP': 961020.0, 'SBUX': 21040233.333333332, 'EQT': 988628.5714285715, 'BCR': 1557300.0, 'NKE': 13730375.0, 'IPG': 3774960.0, 'EXC': 4272625.0, 'BXP': 561860.0, 'BIIB': 2678871.4285714286, 'DD': 6005600.0, 'NUE': 2655587.5, 'ALK': 2896700.0, 'COG': 4203033.333333333, 'ETN': 2120087.5, 'DO': 1929440.0, 'BSX': 4953450.0, 'UPS': 3431600.0, 'ENDP': 1364540.0, 'WBA': 4418425.0, 'PXD': 1205650.0, 'STI': 4397600.0, 'FDX': 2076366.6666666667, 'LLTC': 4596966.666666667, 'RHT': 2198866.6666666665, 'O': 923700.0, 'RAI': 9824660.0, 'BWA': 1606150.0, 'FLIR': 1176022.2222222222, 'MET': 5447520.0, 'XLNX': 6288650.0, 'FCX': 9204975.0, 'FOX': 2188475.0, 'EMC': 15612966.666666666, 'RHI': 1558700.0, 'FISV': 2424133.3333333335, 'CINF': 511871.4285714286, 'AGN': 1079380.0, 'IP': 3418514.285714286, 'KSS': 3495000.0, 'EBAY': 37022033.333333336, 'CMS': 1328642.857142857, 'TROW': 1943600.0, 'KR': 7185

In [26]:
# Spot-check the average volume of a single symbol.
avg_volume_by_symbol["INTC"]

74368550.0

In [31]:
# The date is the first comma-separated field of each row; keep it as a string for now.
dates = [row.partition(",")[0] for row in data]
dates[:5]

['2000-07-18', '2000-07-20', '2000-07-24', '2000-07-26', '2000-07-26']

In [32]:
from datetime import datetime

In [34]:
# Parse a YYYY-MM-DD string into a datetime, using a single fixed date as a trial value.
d = datetime.strptime("2019-09-23", "%Y-%m-%d")

In [35]:
# Confirm that strptime returned a datetime object rather than a string.
type(d)

datetime.datetime

In [36]:
# weekday() numbers the days from Monday=0 through Sunday=6.
d.weekday()

0

In [37]:
# Rebuild `dates` from the raw rows as weekday numbers (Monday=0 ... Sunday=6):
# take the first field of each row, parse it as YYYY-MM-DD, and ask for its weekday.
dates = [
    datetime.strptime(row.split(",")[0], "%Y-%m-%d").weekday()
    for row in data
]
dates[:5]

[1, 3, 0, 2, 2]

In [38]:
from collections import Counter

In [39]:
# Tally how often each value in `dates` occurs (at this point `dates` holds weekday numbers).
Counter(dates)

Counter({1: 348, 3: 374, 0: 361, 2: 393, 4: 370})

In [42]:
# Header row, shown again as a reference for the column order.
lines[0]

'date,open,high,low,close,volume,adjclose,symbol'

In [43]:
# Open-to-close change for every row whose date starts with "2016".
# Each entry is (symbol, fractional change); the value is not multiplied by 100.
pct_changes = []
for row in data:
    if not row.startswith("2016"):
        continue
    fields = row.split(",")
    # Columns 1 and 4 are `open` and `close`.
    opened, closed = float(fields[1]), float(fields[4])
    pct_changes.append((fields[-1], (closed - opened) / opened))
    

In [46]:
# Five largest (symbol, change) pairs: negating the change makes the ascending sort return them largest-first.
sorted(pct_changes, key = lambda p: -p[1])[:5]

[('WYNN', 0.05208330121966219),
 ('ALK', 0.03867928727813826),
 ('CAT', 0.03235981077745779),
 ('COP', 0.031759743852349545),
 ('LYB', 0.029605263157894735)]