Skip to content

Commit

Permalink
WIP pdf extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
joepio committed Feb 11, 2023
1 parent 6fc885e commit e717a40
Show file tree
Hide file tree
Showing 8 changed files with 460 additions and 23 deletions.
399 changes: 377 additions & 22 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[workspace]
members = [
"atomizer",
"server",
"cli",
"lib",
Expand Down
12 changes: 12 additions & 0 deletions atomizer/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[package]
description = "Turn files into Atomic Data."
edition = "2021"
name = "atomizer"
version = "0.1.0"

[dependencies]
atomic_lib = {version = "0.34.0", path = "../lib"}
# Should match the lopdf version that pdf-extract depends on
lopdf = "0.26"
mime_guess = "2.0.4"
pdf-extract = {git = "https://github.com/Hessesian/pdf-extract"}
38 changes: 38 additions & 0 deletions atomizer/src/file.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
use std::{collections::HashMap, error::Error};

use atomic_lib::resources::PropVals;

use crate::pdf;

/// A file that has been read fully into memory, together with its MIME type.
///
/// Instances are created with [`File::open`].
#[derive(Debug, Clone)]
pub struct File {
    /// Path the file was opened from, exactly as passed by the caller.
    filename: String,
    /// MIME type in string form, e.g. `application/pdf`.
    mime: String,
    /// Raw contents of the file.
    bytes: Vec<u8>,
}

impl File {
    /// Reads the file at `filename` into memory and guesses its MIME type
    /// from the path.
    ///
    /// # Errors
    ///
    /// Returns the underlying I/O error if the file cannot be read.
    pub fn open(filename: &str) -> Result<File, Box<dyn Error>> {
        let contents = std::fs::read(filename)?;
        let guessed = mime_guess::from_path(filename).first_or_octet_stream();

        Ok(Self {
            filename: String::from(filename),
            mime: guessed.to_string(),
            bytes: contents,
        })
    }

    /// Converts this file into Atomic Data property-value pairs.
    ///
    /// Only `application/pdf` is handled (delegated to `pdf::atomize`);
    /// every other MIME type yields an empty map.
    pub fn atomize(&self) -> PropVals {
        if self.mime == "application/pdf" {
            pdf::atomize(self)
        } else {
            HashMap::new()
        }
    }

    /// Returns the raw contents of the file.
    pub fn bytes(&self) -> &[u8] {
        self.bytes.as_slice()
    }
}
4 changes: 4 additions & 0 deletions atomizer/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
mod file;
mod pdf;

use atomic_lib::Resource;
27 changes: 27 additions & 0 deletions atomizer/src/pdf.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
use atomic_lib::resources::PropVals;
use lopdf::content;

// NOTE(review): Rust convention is SCREAMING_SNAKE_CASE for constants
// (`CONTENT_PROP`); renaming requires updating every use in this file.
/// Key under which the extracted PDF text is stored in the resulting `PropVals`.
const content_prop: &str = "content";

/// Extracts the plain text of a PDF file and returns it as property-value pairs.
///
/// The returned map holds a single entry: the extracted text as a
/// `Value::String`, keyed by `content_prop`.
///
/// # Panics
///
/// Panics if text cannot be extracted from the bytes of `file`.
pub fn atomize(file: &crate::file::File) -> PropVals {
    // `extract_text_mem` returns an owned `String`, so no separate output
    // buffer is needed.
    let text = pdf_extract::extract_text_mem(file.bytes())
        .expect("failed to extract text from PDF");

    let mut props = PropVals::new();
    props.insert(content_prop.into(), atomic_lib::Value::String(text));
    props
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::file::File;

    /// The demo PDF should yield text mentioning "Atomic Data" under the
    /// content key.
    #[test]
    fn load_pdf() {
        let pdf = File::open("./test/docs-demo.pdf").unwrap();
        let extracted = pdf.atomize();
        let text = extracted.get(content_prop).unwrap().to_string();
        assert!(text.contains("Atomic Data"));
    }
}
Binary file added atomizer/test/docs-demo.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion lib/src/populate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::{
parse::ParseOpts,
schema::{Class, Property},
storelike::Query,
urls, Resource, Storelike, Value,
urls, Storelike, Value,
};

/// Populates a store with some of the most fundamental Properties and Classes needed to bootstrap the whole.
Expand Down

0 comments on commit e717a40

Please sign in to comment.