# CSV consolidation

## Description

The purpose of this notebook is to list all the CSV files in a directory and to find out for which hours some files are missing.

In [105]:
# Library importation
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
import math

In [155]:
# Declaration of variables
# Directory whose files are listed and checked for missing minutes.
# NOTE(review): this is an absolute, machine-specific path — edit it to point
# at the folder to analyse before running the notebook.
import_path = "/home/exalis/VELIB-DATA/TO CONSOLIDATE/2020_01_27"

In [156]:
# List of files
# Keep only regular files whose name ends in ".csv" (case-insensitive). Any
# other directory entry (sub-directory, hidden or temporary file) would be
# counted as a data file and would not follow the "YYYY_MM_DD_HH:MM:SS.csv"
# naming seen in the listing, which the date parsing further down relies on.
# The names are sorted alphabetically; with zero-padded timestamp fields this
# is also chronological order.
list_of_files = sorted(
    f
    for f in listdir(import_path)
    if f.lower().endswith(".csv") and isfile(join(import_path, f))
)
print("number of files:", len(list_of_files))

number of files: 1440


In [157]:
# Build a one-column frame holding one file name per row.
df = pd.DataFrame(
    data=list_of_files,
    columns=["list of files"],
)

In [158]:
# Preview the first rows to check that the file names were loaded as expected.
df.head()

Unnamed: 0,list of files
0,2020_01_27_00:00:01.csv
1,2020_01_27_00:01:01.csv
2,2020_01_27_00:02:01.csv
3,2020_01_27_00:03:01.csv
4,2020_01_27_00:04:01.csv


In [159]:
# First processing: derive a "YYYY-MM-DD HH:MM" date string, plus the hour and
# the minute, from each file name.
# File names look like "2020_01_27_00:00:01.csv": replace "_" with "-" and drop
# the 4-character ".csv" suffix.
stem = df["list of files"].str.replace("_", "-", regex=False).str[:-4]
# Characters 0-9 hold the day and characters 11-15 hold HH:MM; the seconds are
# deliberately left out so that each file maps to one minute of the day.
df["date"] = stem.str[0:10] + " " + stem.str[11:16]

# Parse the whole column once with an explicit format instead of building a
# pd.Timestamp twice per row; a name that does not match the format raises
# instead of being silently guessed.
parsed = pd.to_datetime(df["date"], format="%Y-%m-%d %H:%M")
# Cast to int64 so the columns keep the dtype the per-row version produced.
df["hour"] = parsed.dt.hour.astype("int64")
df["minute"] = parsed.dt.minute.astype("int64")

In [160]:
# Display the frame with the derived columns (long frames are shown truncated).
df

Unnamed: 0,list of files,date,hour,minute
0,2020_01_27_00:00:01.csv,2020-01-27 00:00,0,0
1,2020_01_27_00:01:01.csv,2020-01-27 00:01,0,1
2,2020_01_27_00:02:01.csv,2020-01-27 00:02,0,2
3,2020_01_27_00:03:01.csv,2020-01-27 00:03,0,3
4,2020_01_27_00:04:01.csv,2020-01-27 00:04,0,4
...,...,...,...,...
1435,2020_01_27_23:55:01.csv,2020-01-27 23:55,23,55
1436,2020_01_27_23:56:01.csv,2020-01-27 23:56,23,56
1437,2020_01_27_23:57:01.csv,2020-01-27 23:57,23,57
1438,2020_01_27_23:58:01.csv,2020-01-27 23:58,23,58


In [161]:
# Preview the first rows, now including the date, hour and minute columns.
df.head()

Unnamed: 0,list of files,date,hour,minute
0,2020_01_27_00:00:01.csv,2020-01-27 00:00,0,0
1,2020_01_27_00:01:01.csv,2020-01-27 00:01,0,1
2,2020_01_27_00:02:01.csv,2020-01-27 00:02,0,2
3,2020_01_27_00:03:01.csv,2020-01-27 00:03,0,3
4,2020_01_27_00:04:01.csv,2020-01-27 00:04,0,4


In [162]:
# Displaying the hours with missing files
#
# For each hour of the day, compare the number of distinct minutes found with
# the number of files recorded for that hour. A complete hour has 60 of each,
# i.e. one file per minute.

for hour in range(24):
    # Filter once per hour and reuse the result for both counts.
    minutes = df.loc[df["hour"] == hour, "minute"]
    unique_count = len(minutes.unique())
    file_count = len(minutes)
    # An hour without any file has no meaningful ratio: report "n/a" rather
    # than dividing by zero (and without a catch-all except that would also
    # hide unrelated errors or a keyboard interrupt).
    ratio = round(unique_count / file_count, 3) if file_count else "n/a"
    print("{}: {} unique values over {}. Ratio : {}".format(hour, unique_count, file_count, ratio))
    # Fewer than 60 distinct minutes (or fewer than 60 files) means at least
    # one minute is missing. An hour that only has duplicates (60 distinct
    # minutes over more than 60 files) is not flagged; it shows as a ratio
    # below 1.0 instead.
    if unique_count < 60 or file_count < 60:
        print("ERROR")

0: 60 unique values over 60. Ratio : 1.0
1: 60 unique values over 60. Ratio : 1.0
2: 60 unique values over 60. Ratio : 1.0
3: 60 unique values over 60. Ratio : 1.0
4: 60 unique values over 60. Ratio : 1.0
5: 60 unique values over 60. Ratio : 1.0
6: 60 unique values over 60. Ratio : 1.0
7: 60 unique values over 60. Ratio : 1.0
8: 60 unique values over 60. Ratio : 1.0
9: 60 unique values over 60. Ratio : 1.0
10: 60 unique values over 60. Ratio : 1.0
11: 60 unique values over 60. Ratio : 1.0
12: 60 unique values over 60. Ratio : 1.0
13: 60 unique values over 60. Ratio : 1.0
14: 60 unique values over 60. Ratio : 1.0
15: 60 unique values over 60. Ratio : 1.0
16: 60 unique values over 60. Ratio : 1.0
17: 60 unique values over 60. Ratio : 1.0
18: 60 unique values over 60. Ratio : 1.0
19: 60 unique values over 60. Ratio : 1.0
20: 60 unique values over 60. Ratio : 1.0
21: 60 unique values over 60. Ratio : 1.0
22: 60 unique values over 60. Ratio : 1.0
23: 60 unique values over 60. Ratio : 1.0


#  Error analysis

In [103]:
# Count how many files exist for each minute of hour 6; any count above 1 is a duplicate.
df.loc[df["hour"] == 6, "minute"].value_counts()

59    1
27    1
25    1
24    1
23    1
22    1
21    1
20    1
19    1
18    1
17    1
16    1
15    1
14    1
13    1
12    1
11    1
10    1
9     1
8     1
7     1
6     1
5     1
4     1
3     1
2     1
1     1
26    1
28    1
58    1
29    1
57    1
56    1
55    1
54    1
53    1
52    1
51    1
50    1
49    1
48    1
47    1
46    1
45    1
41    1
40    1
39    1
38    1
37    1
36    1
35    1
34    1
33    1
32    1
31    1
30    1
0     1
Name: minute, dtype: int64

In [104]:
# List the minutes present for hour 6 in file order, so gaps in the sequence stand out.
df["minute"][df["hour"] == 6]

360     0
361     1
362     2
363     3
364     4
365     5
366     6
367     7
368     8
369     9
370    10
371    11
372    12
373    13
374    14
375    15
376    16
377    17
378    18
379    19
380    20
381    21
382    22
383    23
384    24
385    25
386    26
387    27
388    28
389    29
390    30
391    31
392    32
393    33
394    34
395    35
396    36
397    37
398    38
399    39
400    40
401    41
402    45
403    46
404    47
405    48
406    49
407    50
408    51
409    52
410    53
411    54
412    55
413    56
414    57
415    58
416    59
Name: minute, dtype: int64

Here, in the 15:00 hour, minute 17 appears twice, and minutes 14, 15 and 16 are missing.