/
main.go
145 lines (128 loc) · 3 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
package main
import (
"bufio"
"bytes"
"errors"
"flag"
"fmt"
"io"
"os"
"runtime"
"runtime/pprof"
"sync"
)
// chunkSize is the unit, in bytes, in which the input file is divided among
// the workers and read from disk (64 MiB).
const chunkSize int64 = 64 * 1024 * 1024

// Command-line flags.
var (
	// input is the path of the file to read.
	input = flag.String("input", "", "file to read")
	// jobs is the number of concurrent workers; it defaults to the number
	// of logical CPUs.
	jobs = flag.Int("jobs", runtime.NumCPU(), "number of concurrent jobs")
	// cpuprofile, when non-empty, is the path a CPU profile is written to.
	cpuprofile = flag.String("cpuprofile", "", "file to write cpu profile to")
)
func main() {
flag.Parse()
// Profiling
if *cpuprofile != "" {
f, err := os.Create(*cpuprofile)
if err != nil {
fmt.Printf("could not create CPU profile: %v", err)
}
defer f.Close()
if err := pprof.StartCPUProfile(f); err != nil {
fmt.Printf("could not start CPU profile: %v", err)
}
defer pprof.StopCPUProfile()
}
// Open the file
file, err := os.Open(*input)
if err != nil {
fmt.Println("Error:", err)
return
}
defer file.Close()
// Get the file size
fileInfo, err := file.Stat()
if err != nil {
fmt.Println("Error getting file info:", err)
return
}
fileSize := fileInfo.Size()
// Get number of chunks and chunks per reader
chunks := (fileSize + chunkSize - 1) / chunkSize
chunksPerReader := (chunks + int64(*jobs) - 1) / int64(*jobs)
out := make(chan []byte)
var wg sync.WaitGroup
for i := 0; i < *jobs; i++ {
wg.Add(1)
go func(i int) {
defer wg.Done()
start := int64(i) * chunksPerReader * chunkSize
end := start + chunksPerReader*chunkSize
end = min(end, fileSize)
// Buffer to read chunks into
buf := make([]byte, chunkSize)
// Read file by chunks
for pos, n := start, 0; pos < end; pos += int64(n) {
// Read a chunk
n, err = file.ReadAt(buf, pos)
if err != nil && !errors.Is(err, io.EOF) {
panic(err)
}
// Don't read past chunk limits
n = min(n, int(end-pos))
buf = buf[:n]
// If no bytes were read, break the loop
if n == 0 {
break
}
// If not the first chunk in the file and is
// first chunk in this worker, read from after
// the first new line
//
// aaa;1.2
// bbb;3.4
// ccc;5.6
//
// The above example may be split into chunks
// as follows below
//
// aaa;1.2
// +--- chunk split here
// |
// v
// bbb;3.4
// ccc;5.6
//
// worker 1 | worker 2
// [(chunk1, chunk2) | (chunk3, chunk4)]
// ...aaa;1.2\nbb | b;3.4\nccc;...
//
// In this case, we want worker 1 to read the
// full line of bbb and worker 2 to start
// reading at ccc.
if pos != 0 && pos == start {
i := bytes.Index(buf, []byte{'\n'})
buf = buf[i+1:]
}
_, err := file.Seek(pos+int64(n), 0)
reader := bufio.NewReader(file)
overflow, err := reader.ReadBytes('\n')
if err != nil && !errors.Is(err, io.EOF) {
panic(err)
}
send := make([]byte, len(buf))
copy(send, buf)
send = append(send, overflow...)
n += len(overflow)
out <- send
}
}(i)
}
done := make(chan bool)
go func() {
for data := range out {
fmt.Print(string(data))
}
done <- true
}()
wg.Wait()
close(out)
<-done
}