-
Notifications
You must be signed in to change notification settings - Fork 0
/
combine-and-merge-all.py
113 lines (91 loc) · 5.37 KB
/
combine-and-merge-all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import csv
import numpy as np
dept_table = ["EDA", "EFD", "UVEK", "EJPD", "VBS", "BK", "WBF", "EDI"] # list all the departments
years_table = ["2011", "2012", "2013", "2014"] # list the avaiable years
# removes previous output files
files = os.listdir("combined/")
for f in files:
os.remove("combined/"+f)
for dept in dept_table: # this loop will apply for each department...
for year in years_table: # ... and for each year
biggest = "biggest/"+dept+" - "+year+".csv" # path to CSV with the 40 biggest data
categories = "categories/"+dept+" - "+year+".csv" # path to CSV with the categories data
outputfile = "combined/"+dept+" - "+year+".csv" # path for output file
array_biggest = []
array_categories = []
array_unknown = []
table_unknown = {} # dictionnary
####################################
#### CONVERT THE 40 BIGGEST FILE ###
####################################
with open(biggest, 'rt') as csvfile:
reader = csv.reader(csvfile, delimiter=',', quotechar='"')
# put the csv into an array
array_in = [row for row in reader]
# headers for the output array
array_biggest.append(["deptDe", "fullDeptDe", "year", "idSupplier", "supplier", "idCategory", "intCategory", "fullCategoryDe", "officeDe", "amount"])
for y in range(1, len(array_in)): # go through the table line by line starting with line 1
for x in range(7, len(array_in[0])): # go through the line starting with column 7
if array_in[y][x]: # if cell is not empty
array_biggest.append([array_in[y][0], array_in[y][1], array_in[y][2], array_in[y][3], array_in[y][4], str(array_in[y][5]).replace(".0",""), int(float(array_in[y][5])), array_in[y][6], array_in[0][x], array_in[y][x]]) # write a new line in the output array
#####################################
#### CONVERT THE CATEGORIES FILE ###
#####################################
with open(categories, 'rt') as csvfile:
reader = csv.reader(csvfile, delimiter=',', quotechar='"')
# put the csv into an array
array_in = [row for row in reader]
# headers for the output array
array_categories.append(["deptDe", "fullDeptDe", "year", "idSupplier", "supplier", "idCategory", "fullCategoryDe", "officeDe", "amount"])
for y in range(1, len(array_in)): # go through the table line by line starting with line 1
for x in range(7, len(array_in[0])): # go through the line starting with column 7
if array_in[y][x]: # if cell is not empty
if float(array_in[y][5]).is_integer(): # only if it's a main category (filter out the subcategories)
array_categories.append([array_in[y][0], array_in[y][1], array_in[y][2], "0", "Unknown", array_in[y][5], array_in[y][6], array_in[0][x], array_in[y][x]]) # write a new line in the output array
#########################################
#### COMBINE BIGGEST AND CATEGORIES ###
#########################################
# fetch the total amount for each category and office
for y in range(1, len(array_categories)): # go through the table line by line
catoffice = str(array_categories[y][5]) + "|" + str(array_categories[y][7]) # create a unique key combinining the category number and the office (category_nb|offce)
table_unknown[catoffice] = array_categories[y][8] # put the key in the dictionnary and add the total amount for this category and office as value
# fetch the 40biggest data
for x in range(1, len(array_biggest)): # go through the table
z = str(array_biggest[x][5]) + "|" + array_biggest[x][8] # generate unique key for this line with subcategories (decimal)
if not z in table_unknown: # if subcategory not in the categories data
z = str(array_biggest[x][6]) + "|" + array_biggest[x][8] # unique key with main categorie
res = round(round(float(table_unknown[z]),2) - round(float(array_biggest[x][9]),2),2) # make the substraction
if float(res) == 0.0: # if amount after substraction equals 0
table_unknown.pop(z, None) # delete the key from the dictionnary
else:
table_unknown[z] = res # if not, the new amount is set
# convert the table_unknown results into the same format as the 40biggest data (same columns)
# dept,fullDept,year,idSupplier,supplier,idCategory,intCategory,fullCategory,office,amount
for k,v in table_unknown.items():
categorie = k.split("|")
array_unknown.append([array_biggest[1][0], array_biggest[1][1], array_biggest[1][2], "0", "Unknown", categorie[0], categorie[0], "", categorie[1], v])
# combine the 40 biggest data and the unknown data in a new csv file
with open(outputfile, 'wb') as output:
writer = csv.writer(output, delimiter=',', lineterminator='\n') # , quoting=csv.QUOTE_NONNUMERIC
writer.writerows(array_biggest)
writer.writerows(array_unknown)
###############################
#### MERGE ALL DEPT ###
###############################
array = []
i = 1
for file in os.listdir("combined/"):
if file.endswith(".csv"):
with open("combined/"+file) as csvfile:
reader = csv.reader(csvfile, delimiter=',', quotechar='"')
if i == 1:
array += [row for row in reader] # put all the csv into an array
else:
array += [row for row in reader if row[0] != "deptDe"] # put the csv into an array but excluding the headers
i += 1
arraynp = np.asarray(array)
exportshort = np.delete(arraynp, [1,4,5,7], 1) # delete unnecessary colums
with open("master_export_short.csv", 'wb') as output:
writer = csv.writer(output, delimiter=',', lineterminator='\n')
writer.writerows(exportshort)