# CSV consolidation

## Description

The purpose of this notebook is to list all the CSV files in a directory and find out which hours have missing files.

In [1]:
# Library importation
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
import math

In [2]:
# Declaration of variables
# Directory that is scanned for the files to check.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
import_path = "/home/exalis/Téléchargements/BACKUP/2020_01_16"

In [3]:
# List of files
# Keep only regular files whose name ends in ".csv": the date-parsing step further down
# strips a 4-character extension and parses the rest of the name as a timestamp, so any
# other file in the directory would either break the parsing or be miscounted.
list_of_files = sorted(
    f
    for f in listdir(import_path)
    if f.lower().endswith(".csv") and isfile(join(import_path, f))
)
print("number of files:", len(list_of_files))

number of files: 1438


In [4]:
# One row per file name, in the order of list_of_files; the single column is "list of files".
df =  pd.DataFrame(list_of_files, columns=["list of files"])

In [5]:
# Preview the first rows to check the file-name pattern.
df.head()

Unnamed: 0,list of files
0,2020_01_16_00:00:01.csv
1,2020_01_16_00:01:01.csv
2,2020_01_16_00:02:01.csv
3,2020_01_16_00:03:01.csv
4,2020_01_16_00:04:01.csv


In [6]:
# First processing: build a "YYYY-MM-DD HH:MM" date string from each file name
# (e.g. "2020_01_16_00:01:01.csv" -> "2020-01-16 00:01"), then extract hour and minute.
# Vectorised .str operations replace the row-wise lambdas.
file_stems = df["list of files"].str.replace("_", "-").str[:-4]  # drop the ".csv" suffix
df["date"] = file_stems.str[0:10] + " " + file_stems.str[11:16]  # seconds are discarded

# Parse every date string once (instead of twice per row) with an explicit format, so a
# file name that does not follow the expected pattern raises instead of being guessed at.
timestamps = pd.to_datetime(df["date"], format="%Y-%m-%d %H:%M")
df["hour"] = timestamps.dt.hour.astype("int64")  # keep int64 on every pandas version
df["minute"] = timestamps.dt.minute.astype("int64")

In [7]:
# Display the frame (pandas truncates the middle rows) to check the derived columns.
df

Unnamed: 0,list of files,date,hour,minute
0,2020_01_16_00:00:01.csv,2020-01-16 00:00,0,0
1,2020_01_16_00:01:01.csv,2020-01-16 00:01,0,1
2,2020_01_16_00:02:01.csv,2020-01-16 00:02,0,2
3,2020_01_16_00:03:01.csv,2020-01-16 00:03,0,3
4,2020_01_16_00:04:01.csv,2020-01-16 00:04,0,4
...,...,...,...,...
1433,2020_01_16_23:55:01.csv,2020-01-16 23:55,23,55
1434,2020_01_16_23:56:01.csv,2020-01-16 23:56,23,56
1435,2020_01_16_23:57:01.csv,2020-01-16 23:57,23,57
1436,2020_01_16_23:58:01.csv,2020-01-16 23:58,23,58


In [8]:
# First rows again, now including the derived date, hour and minute columns.
df.head()

Unnamed: 0,list of files,date,hour,minute
0,2020_01_16_00:00:01.csv,2020-01-16 00:00,0,0
1,2020_01_16_00:01:01.csv,2020-01-16 00:01,0,1
2,2020_01_16_00:02:01.csv,2020-01-16 00:02,0,2
3,2020_01_16_00:03:01.csv,2020-01-16 00:03,0,3
4,2020_01_16_00:04:01.csv,2020-01-16 00:04,0,4


In [9]:
# Displaying the hours with missing files

EXPECTED_MINUTES = 60  # one file is expected for every minute of an hour

for hour in range(24):
    minutes = df.loc[df["hour"] == hour, "minute"]  # filter once per hour
    unique_count = minutes.nunique()  # distinct minutes that have at least one file
    file_count = len(minutes)  # total files for this hour, duplicates included
    # An hour with no file at all has no meaningful ratio; test for that case explicitly
    # rather than hiding every possible error behind a bare `except`.
    ratio = round(unique_count / file_count, 3) if file_count else "n/a"
    print("{}: {} unique values over {}. Ratio : {}".format(hour, unique_count, file_count, ratio))
    # unique_count <= file_count, so this single test also covers file_count < 60.
    if unique_count < EXPECTED_MINUTES:
        print("ERROR")

0: 60 unique values over 60. Ratio : 1.0
1: 60 unique values over 60. Ratio : 1.0
2: 60 unique values over 60. Ratio : 1.0
3: 60 unique values over 60. Ratio : 1.0
4: 60 unique values over 60. Ratio : 1.0
5: 60 unique values over 60. Ratio : 1.0
6: 60 unique values over 60. Ratio : 1.0
7: 60 unique values over 60. Ratio : 1.0
8: 60 unique values over 60. Ratio : 1.0
9: 60 unique values over 60. Ratio : 1.0
10: 60 unique values over 60. Ratio : 1.0
11: 60 unique values over 60. Ratio : 1.0
12: 60 unique values over 60. Ratio : 1.0
13: 60 unique values over 60. Ratio : 1.0
14: 60 unique values over 60. Ratio : 1.0
15: 57 unique values over 58. Ratio : 0.983
ERROR
16: 60 unique values over 60. Ratio : 1.0
17: 60 unique values over 60. Ratio : 1.0
18: 60 unique values over 60. Ratio : 1.0
19: 60 unique values over 60. Ratio : 1.0
20: 60 unique values over 60. Ratio : 1.0
21: 60 unique values over 60. Ratio : 1.0
22: 60 unique values over 60. Ratio : 1.0
23: 60 unique values over 60. Ratio 

#  Error analysis

In [12]:
# Count how many files exist for each minute of hour 15; a count above 1 is a duplicate.
df[df["hour"]==15]["minute"].value_counts()

17    2
59    1
30    1
28    1
27    1
26    1
25    1
24    1
23    1
22    1
21    1
20    1
19    1
18    1
13    1
12    1
11    1
10    1
9     1
8     1
7     1
6     1
5     1
4     1
3     1
2     1
1     1
29    1
31    1
58    1
32    1
57    1
56    1
55    1
54    1
53    1
52    1
51    1
50    1
49    1
48    1
47    1
46    1
45    1
44    1
43    1
42    1
41    1
40    1
39    1
38    1
37    1
36    1
35    1
34    1
33    1
0     1
Name: minute, dtype: int64

In [13]:
# List the minute of every file in hour 15, in row order, to see which minutes are absent.
df[df["hour"]==15]["minute"]

900     0
901     1
902     2
903     3
904     4
905     5
906     6
907     7
908     8
909     9
910    10
911    11
912    12
913    13
914    17
915    17
916    18
917    19
918    20
919    21
920    22
921    23
922    24
923    25
924    26
925    27
926    28
927    29
928    30
929    31
930    32
931    33
932    34
933    35
934    36
935    37
936    38
937    39
938    40
939    41
940    42
941    43
942    44
943    45
944    46
945    47
946    48
947    49
948    50
949    51
950    52
951    53
952    54
953    55
954    56
955    57
956    58
957    59
Name: minute, dtype: int64

Here, during hour 15 (15:00–15:59), minute 17 appears twice, and minutes 14, 15 and 16 are missing.