-
Notifications
You must be signed in to change notification settings - Fork 26
/
main.go
147 lines (132 loc) · 3.54 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
package main
import (
"bufio"
"encoding/hex"
"flag"
"fmt"
"github.com/boltdb/bolt"
"github.com/cespare/xxhash"
"github.com/mitchellh/go-homedir"
"io"
"log"
"os"
"os/exec"
"path/filepath"
)
// hash_file_xxhash returns the hex-encoded xxhash digest of the contents of
// the file at filePath.
//
// The file is streamed through the hasher with io.Copy rather than being read
// into memory first. If the file cannot be opened or read, the empty string is
// returned together with the underlying error.
func hash_file_xxhash(filePath string) (string, error) {
	f, err := os.Open(filePath)
	if err != nil {
		return "", err
	}
	defer f.Close()

	digest := xxhash.New()
	_, err = io.Copy(digest, f)
	if err != nil {
		return "", err
	}

	sum := digest.Sum(nil)
	return hex.EncodeToString(sum), nil
}
// main implements the fast-p command line tool.
//
// It reads one filename per line from standard input. For each distinct file
// content (identified by its xxhash digest) it prints
// "filename<TAB>text<NUL>" followed by a newline to standard output, where
// text is the combined output of `pdftotext -l 2 <file> -`.
//
// Extracted text is cached in a BoltDB file under
// ~/.cache/fast-p-pdftotext-output/, keyed by content hash, so pdftotext only
// runs for content that is not yet in the cache. Cached entries are printed
// while the input is still being read; the remaining files are converted
// afterwards, in the order they appeared on standard input.
//
// Flags:
//
//	-version      print the program version and exit
//	-clear-cache  delete the cache database and exit
func main() {
	const (
		cacheDir   = "~/.cache/fast-p-pdftotext-output/"
		cacheFile  = "fast-p_cached_pdftotext_output.db"
		bucketName = "fast-p_bucket_for_cached_pdftotext_output"
		nullByte   = "\u0000"
	)

	flag.Usage = func() {
		fmt.Printf(`Usage: fast-p [OPTIONS]
Reads a list of PDF filenames from STDIN and returns a list of null-byte
separated items of the form
filename[TAB]text
where "text" is the text extracted from the first two pages of the PDF
by pdftotext and [TAB] denotes a tab character "\t".
Common usage of this tool is to pipe the result to FZF with a command in
your .bashrc as explained in https://github.com/bellecp/fast-p.
`)
		flag.PrintDefaults()
	}
	version := flag.Bool("version", false, "Display program version")
	clearCache := flag.Bool("clear-cache", false, "Delete cache file located at: \n~/.cache/fast-p-pdftotext-output/fast-p_cached_pdftotext_output.db")
	flag.Parse()

	if *version {
		fmt.Printf("v.0.2.5 \nhttps://github.com/bellecp/fast-p\n")
		os.Exit(0)
	}

	// Resolve the cache location once; both -clear-cache and the normal run
	// operate on the same database file.
	cachePath, err := homedir.Expand(cacheDir)
	if err != nil {
		log.Fatal(err)
	}
	boltDbFilepath := filepath.Join(cachePath, cacheFile)

	if *clearCache {
		// A cache file that is already absent counts as cleared.
		if err := os.Remove(boltDbFilepath); err != nil && !os.IsNotExist(err) {
			log.Fatal(err)
		}
		os.Exit(0)
	}

	// Create the cache folder if it does not exist.
	if err := os.MkdirAll(cachePath, os.ModePerm); err != nil {
		log.Fatal(err)
	}

	// Open the BoltDB cache database and make sure the bucket exists, so the
	// transactions below can rely on tx.Bucket returning a non-nil bucket.
	db, err := bolt.Open(boltDbFilepath, 0600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	err = db.Update(func(tx *bolt.Tx) error {
		if _, err := tx.CreateBucketIfNotExists([]byte(bucketName)); err != nil {
			return fmt.Errorf("create bucket: %s", err)
		}
		return nil
	})
	if err != nil {
		// log.Fatal exits without running deferred calls, so close explicitly.
		db.Close()
		log.Fatal(err)
	}

	// pendingFile is a file whose content hash has no cache entry yet.
	type pendingFile struct {
		hash string
		path string
	}
	var missing []pendingFile
	alreadySeen := make(map[string]bool)

	// First pass: print everything that is already cached and remember the
	// rest. Files with identical content are reported only once.
	scanner := bufio.NewScanner(os.Stdin)
	for scanner.Scan() {
		path := scanner.Text()
		hash, err := hash_file_xxhash(path)
		if err != nil {
			// Without a hash there is no usable cache key; skip the file
			// instead of processing it under the empty key.
			log.Printf("hashing %q: %v", path, err)
			continue
		}
		if alreadySeen[hash] {
			continue
		}
		alreadySeen[hash] = true

		var content string
		found := false
		err = db.View(func(tx *bolt.Tx) error {
			// Copy the value into a string inside the transaction, before
			// the read transaction ends.
			if v := tx.Bucket([]byte(bucketName)).Get([]byte(hash)); v != nil {
				found = true
				content = string(v)
			}
			return nil
		})
		if err != nil {
			log.Println(err)
		}

		if found {
			fmt.Println(path + "\t" + content + nullByte)
		} else {
			missing = append(missing, pendingFile{hash: hash, path: path})
		}
	}
	if err := scanner.Err(); err != nil {
		log.Println(err)
	}

	// Second pass: run pdftotext on the uncached files, print the result and
	// store it in the cache.
	for _, m := range missing {
		out, err := exec.Command("pdftotext", "-l", "2", m.path, "-").CombinedOutput()
		content := string(out)

		cacheable := true
		if err != nil {
			log.Println(err)
			// An *exec.ExitError means pdftotext ran and exited unsuccessfully;
			// its output is still cached so the file is not retried on every
			// run. Any other error means the command did not run to
			// completion (e.g. pdftotext is not installed), and caching that
			// result would permanently hide the file's text.
			_, cacheable = err.(*exec.ExitError)
		}

		fmt.Println(m.path + "\t" + content + nullByte)
		if !cacheable {
			continue
		}

		err = db.Update(func(tx *bolt.Tx) error {
			return tx.Bucket([]byte(bucketName)).Put([]byte(m.hash), []byte(content))
		})
		if err != nil {
			// Report on the log, not on stdout, which carries the result stream.
			log.Println(err)
		}
	}
}