In [21]:
import os
import numpy as np
import pandas as pd

In [22]:
def dir_files(dir_path):
    """Return the full paths of the regular files directly inside ``dir_path``.

    Sub-directories are left out and the listing is not recursive. Paths are
    built with ``os.path.join(dir_path, name)`` and come back in the order
    ``os.listdir`` yields them.
    """
    file_paths = []
    for entry in os.listdir(dir_path):
        candidate = os.path.join(dir_path, entry)
        if os.path.isfile(candidate):
            file_paths.append(candidate)
    return file_paths

In [44]:
# Folder that is scanned for the input CSV files.
data_dir = r'C:\Users\asaf\Desktop\DS_workshop-project\stock_market_neural_nets\data\relevant\dukascopy\BID'
# CSV report listing the files that were skipped during conversion.
save_skipped = r'C:\Users\asaf\Desktop\DS_workshop-project\stock_market_neural_nets\data\relevant\dukascopy\feather\skipped.csv'
# Output folder for the converted feather files. Later cells read `save_dir`,
# but no cell defined it, so a fresh kernel stopped with a NameError.
# NOTE(review): assumed to be the folder that holds skipped.csv - confirm.
save_dir = os.path.dirname(save_skipped)
# Full paths of every regular file found directly in data_dir.
csv_files = dir_files(data_dir)

In [45]:
# Clean every file in `csv_files` and store the result as a feather file.
# Files whose data contradicts the cleaning assumptions are not written;
# they are recorded in `skipped` as (path, reason) tuples instead.
# `save_dir` (output folder for the feather files) must be defined before
# this cell runs.
skipped = []
for count, file in enumerate(csv_files, start=1):
    print(count, os.path.basename(file))

    df = pd.read_csv(file)

    # rename columns: lower-case every name, 'Local time' becomes 'datetime'
    new_cols_map = {col: col.lower() for col in df}
    new_cols_map['Local time'] = "datetime"
    df = df.rename(columns=new_cols_map)

    # convert to datetime: strip the trailing ":00.000 GMT+0X00" suffix first.
    # regex=True is explicit because pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave the suffix in place and
    # make the fixed-format parse below fail.
    df['datetime'] = df['datetime'].str.replace(r"\:00\.000 GMT\+0\d00", "", regex=True)
    df['datetime'] = pd.to_datetime(df['datetime'], format='%d.%m.%Y %H:%M')

    # drop duplicates (daylight saving time)
    df = df.drop_duplicates()

    has_volume = df['volume'] != 0

    # drop saturdays and sundays (dayofweek 5 and 6); a file with traded
    # volume on a weekend is skipped entirely
    weekend = df['datetime'].dt.dayofweek >= 5
    if (weekend & has_volume).any():
        skipped.append((file, "weekends with none 0 volume"))
        continue
    df = df[~weekend]
    has_volume = has_volume[~weekend]

    # drop inactive hours: rows before 15:30 or after 23:00 (clock time as
    # written in the file, the GMT offset was stripped above)
    time_of_day = df['datetime'].dt.hour * 100 + df['datetime'].dt.minute
    inactive = (time_of_day < 1530) | (time_of_day > 2300)
    # The comparison must be parenthesised: `mask & volume != 0` parses as
    # `(mask & volume) != 0` because & binds tighter than !=.
    if (inactive & has_volume).any():
        skipped.append((file, "non trading hours with volume different than 0"))
        continue
    # keep only rows inside the active window that also have non-zero volume
    df = df[~inactive & has_volume]

    # sort by datetime
    df = df.sort_values(by=['datetime'])

    # reset index, more convenient (to_feather also expects a default index)
    df = df.reset_index(drop=True)

    # save data in feather format
    file_name = os.path.basename(file).replace(".csv", ".feather")
    df.to_feather(os.path.join(save_dir, file_name))

1 A.USUSD.csv
2 AA.USUSD.csv
3 AABA.USUSD.csv
4 AAL.GBGBX.csv
5 AAL.USUSD.csv
6 AAPL.USUSD.csv
7 ABBN.CHCHF.csv
8 ABC.USUSD.csv
9 ABEV.USUSD.csv
10 ABF.GBGBX.csv
11 ABI.BEEUR.csv
12 ABT.USUSD.csv
13 AC.FREUR.csv
14 ACA.FREUR.csv
15 ACS.ESEUR.csv
16 ACX.ESEUR.csv
17 ADBE.USUSD.csv
18 ADEN.CHCHF.csv
19 ADI.USUSD.csv
20 ADM.GBGBX.csv
21 ADP.USUSD.csv
22 ADS.DEEUR.csv
23 ADSK.USUSD.csv
24 AENA.ESEUR.csv
25 AET.USUSD.csv
26 AF.FREUR.csv
27 AGK.GBGBX.csv
28 AGN.NLEUR.csv
29 AGS.BEEUR.csv
30 AH.NLEUR.csv
31 AHT.GBGBX.csv
32 AI.FREUR.csv
33 AIG.USUSD.csv
34 AIR.FREUR.csv
35 AKZA.NLEUR.csv
36 ALFA.SESEK.csv
37 ALL.USUSD.csv
38 ALO.FREUR.csv
39 ALV.DEEUR.csv
40 ALXN.USUSD.csv
41 AMAT.USUSD.csv
42 AMD.USUSD.csv
43 AMGN.USUSD.csv
44 AMS.ESEUR.csv
45 AMT.USUSD.csv
46 AMZN.USUSD.csv
47 ANTM.USUSD.csv
48 ANTO.GBGBX.csv
49 APA.USUSD.csv
50 APC.USUSD.csv
51 APD.USUSD.csv
52 ASML.NLEUR.csv
53 ATCOA.SESEK.csv
54 ATVI.USUSD.csv
55 AUDCAD.csv
56 AUDCHF.csv
57 AUDJPY.csv
58 AUDNZD.csv
59 AUDSGD.csv
60 AUDUS

463 RDSA.NLEUR.csv
464 RDSB.GBGBX.csv
465 REE.ESEUR.csv
466 REGN.USUSD.csv
467 REL.GBGBX.csv
468 REN.NLEUR.csv
469 REP.ESEUR.csv
470 RF.USUSD.csv
471 RI.FREUR.csv
472 RIO.GBGBX.csv
473 RMG.GBGBX.csv
474 RNO.FREUR.csv
475 ROG.CHCHF.csv
476 ROST.USUSD.csv
477 RR.GBGBX.csv
478 RRC.USUSD.csv
479 RRS.GBGBX.csv
480 RSA.GBGBX.csv
481 RTN.USUSD.csv
482 RWE.DEEUR.csv
483 SAF.FREUR.csv
484 SAN.ESEUR.csv
485 SAN.FREUR.csv
486 SAND.SESEK.csv
487 SAP.DEEUR.csv
488 SBRY.GBGBX.csv
489 SBUX.USUSD.csv
490 SCAB.SESEK.csv
491 SCHW.USUSD.csv
492 SCMN.CHCHF.csv
493 SDF.DEEUR.csv
494 SEBA.SESEK.csv
495 SECUB.SESEK.csv
496 SGD.IDXSGD.csv
497 SGDJPY.csv
498 SGE.GBGBX.csv
499 SGO.FREUR.csv
500 SGSN.CHCHF.csv
501 SHP.GBGBX.csv
502 SHW.USUSD.csv
503 SIE.DEEUR.csv
504 SJM.USUSD.csv
505 SKAB.SESEK.csv
506 SKFB.SESEK.csv
507 SKY.GBGBX.csv
508 SLHN.CHCHF.csv
509 SLV.USUSD.csv
510 SMIN.GBGBX.csv
511 SN.GBGBX.csv
512 SNAP.USUSD.csv
513 SO.USUSD.csv
514 SOLB.BEEUR.csv
515 SOON.CHCHF.csv
516 SOYBEAN.CMDUSX.csv
517 SPG.U

In [69]:
# Replace the full path of each skipped file with its bare file name,
# keeping the recorded reason unchanged.
skipped = [
    (os.path.basename(path), reason)
    for path, reason in skipped
]

In [74]:
import csv

# Persist the (file name, reason) pairs of the skipped files as CSV.
# The report is written to `save_skipped` (the skipped.csv path); the cell
# previously opened `save_dir`, which is the name the conversion loop uses
# as the feather output folder.
# newline='' lets the csv module control line endings; without it every
# row is followed by an empty line on Windows.
with open(save_skipped, 'w', newline='') as out:
    csv_out = csv.writer(out)
    csv_out.writerows(skipped)