// Copyright (c) Carl-Erwin Griffith
//
// Permission is hereby granted, free of charge, to any
// person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the
// Software without restriction, including without
// limitation the rights to use, copy, modify, merge,
// publish, distribute, sublicense, and/or sell copies of
// the Software, and to permit persons to whom the Software
// is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice
// shall be included in all copies or substantial portions
// of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
// SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
// This is a simple text "parser" that looks for "almost" common ranges of lines.
// It only supports ASCII/UTF-8 files.
extern crate byteorder;
extern crate clap;
extern crate crc;
extern crate num_cpus;

use byteorder::{BigEndian, WriteBytesExt};
use clap::{App, Arg};
use crc::{crc64, Hasher64};
use std::collections::HashMap;
use std::collections::HashSet;
use std::fs::File;
use std::io;
use std::io::prelude::*;
use std::io::BufReader;
use std::io::BufWriter;
use std::mem;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, RwLock};
use std::thread;
const DEFAULT_MIN_WINDOW_SIZE: usize = 3;
const VERSION: &str = env!("CARGO_PKG_VERSION");
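/// Runtime configuration assembled from the command line: number of worker
/// threads, worker stack size, minimum duplicate-window size, the list of
/// files to scan, and whether to print the results.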
struct Config {
    nr_threads: usize,
    stack_size: usize,
    min_window_size: usize,
    files: Vec<String>,
    print_results: bool,
}
// 0-based index into the global file list
type FileIndex = u32;
// 0-based index into a file's vector of non-blank lines
type LineIndex = u32;
// 1-based line number in the original file (as stored in LineInfo)
type LineNumber = u32;
// 64-bit CRC of a line's content, or of a whole window of line hashes
type CpdHash = u64;
#[derive(Debug, Copy, Clone)]
struct WindowSize(usize);

#[derive(Debug, Copy, Clone)]
struct CallDepth(u32);

#[derive(Debug, Copy, Clone)]
struct LineInfo {
    line_number: usize,
    hash: CpdHash,
}

impl LineInfo {
    fn new(line_number: usize, hash: CpdHash) -> LineInfo {
        LineInfo { line_number, hash }
    }
}

#[derive(Debug)]
struct FileInfo {
    file_index: usize,
    filename: String,
    lines: Vec<LineInfo>,
}
/// Aggregated detection results shared by all worker threads.
struct CpdResults {
    // window hash -> occurrences as (file_index, index of the window's first non-blank line)
    window_to_files_range: HashMap<CpdHash, HashSet<(FileIndex, LineIndex)>>,
    // window size -> set of window hashes of that size
    window_size_map: HashMap<u32, HashSet<CpdHash>>,
}

impl CpdResults {
    fn new() -> CpdResults {
        CpdResults {
            window_to_files_range: HashMap::new(),
            window_size_map: HashMap::new(),
        }
    }
}
/// This function computes a 64-bit CRC for each line of the file.
/// It ignores empty lines and some blank characters (' ', '\t', '\n', '\r').
/// Each CRC serves as a key in a shared map; the (file_index, line_index)
/// tuples are collected in a hash set stored under that key in the global map.
///
/// Summary: hashmap[crc] -> hashset<(file_index, line_index)>
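///
/// For example (illustrative values): if the third non-blank line of file 0
/// and the seventh non-blank line of file 1 hash to the same CRC `h`, the
/// shared map ends up containing:
///
/// ```text
/// h -> {(0, 2), (1, 6)}    // (file_index, line_index), both 0-based
/// ```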
fn parse_file(
    filename: &str,
    file_index: usize,
    map: &RwLock<HashMap<CpdHash, HashSet<(FileIndex, LineIndex)>>>,
) -> Option<Vec<LineInfo>> {
    let file = File::open(filename);
    let file = match file {
        Ok(file) => file,
        _ => {
            eprintln!("cannot parse {}", filename);
            return None;
        }
    };
    let file_index = file_index as FileIndex;
    let mut line_info = Vec::<LineInfo>::new();
    // 1 - read lines
    let buf_reader = BufReader::new(file);
    let lines = buf_reader.lines();
    for (line_number, l) in lines.enumerate() {
        let l = match l {
            Ok(l) => l,
            Err(_) => break,
        };
        let mut digest = crc64::Digest::new(crc64::ECMA);
        // NB: a String's internal representation is UTF-8,
        // so we can filter the raw bytes directly
        // and skip spaces, tabs and newlines
        let mut slen = 0;
        for c in l.into_bytes() {
            match c {
                b' ' | b'\t' | b'\n' | b'\r' => {}
                _ => {
                    slen += 1;
                    digest.write(&[c]);
                }
            }
        }
        // ignore empty lines
        if slen == 0 {
            continue;
        }
        // compute the line checksum
        let hash = digest.sum64();
        // build the line info
        let line_index = line_info.len() as LineIndex;
        line_info.push(LineInfo::new(line_number + 1, hash));
        // insert the (file_index, line_index) pair in the global table
        map.write()
            .unwrap()
            .entry(hash)
            .or_insert_with(HashSet::new)
            .insert((file_index, line_index));
    }
    Some(line_info)
}
/// This function spawns multiple threads that each call parse_file(...).
/// It returns all line hashes and the shared map that holds the
/// crc -> (file_index, line_index) mapping.
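///
/// Work distribution sketch: a shared `AtomicUsize` is used as a cursor into
/// the file list; each worker repeatedly claims the next index with
/// `fetch_add(1, SeqCst)` and parses that file, until the cursor passes
/// `file_max`.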
fn parse_files(
    n_jobs: usize,
    file_max: usize,
    files_inf: &Arc<RwLock<Vec<FileInfo>>>,
) -> (
    Vec<CpdHash>,
    Arc<RwLock<HashMap<CpdHash, HashSet<(FileIndex, LineIndex)>>>>,
) {
    // hash_graph: HashMap<CpdHash, HashSet<(u32, u32)>>
    let hash_graph = Arc::new(RwLock::new(HashMap::new()));
    let mut handles = vec![];
    let idx = Arc::new(AtomicUsize::new(0));
    for _ in 0..n_jobs {
        let files_inf = Arc::clone(&files_inf);
        let idx = Arc::clone(&idx);
        let hash_graph = Arc::clone(&hash_graph);
        let handle = thread::spawn(move || loop {
            let i = idx.fetch_add(1, Ordering::SeqCst);
            if i >= file_max {
                return;
            }
            let filename = {
                let vec = files_inf.read().unwrap();
                vec[i].filename.clone()
            };
            eprint!("\rparse files : {}/{} ", i + 1, file_max);
            if let Some(lines) = parse_file(&filename, i, &hash_graph) {
                let mut vec = files_inf.write().unwrap();
                vec[i].lines = lines;
            }
        });
        handles.push(handle);
    }
    for handle in handles {
        handle.join().unwrap();
    }
    eprintln!();
    // keys to vec
    let mut hash_vec = Vec::<CpdHash>::new();
    {
        let hash_graph = hash_graph.read().unwrap();
        hash_vec.extend((*hash_graph).keys());
    }
    (hash_vec, hash_graph)
}
/// This function prints a specific range of lines for a given file.
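///
/// For example, `print_file_lines("src/main.rs", 10, 12)` (hypothetical path)
/// writes lines 10 through 12 of that file to stdout. `start_line` and
/// `end_line` are 1-based and inclusive.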
fn print_file_lines(filename: &str, start_line: usize, end_line: usize) {
    let file = File::open(filename);
    let file = match file {
        Ok(file) => file,
        _ => {
            eprintln!("cannot print {}", filename);
            return;
        }
    };
    let reader = BufReader::new(file);
    let mut writer = BufWriter::new(io::stdout());
    for line in reader
        .lines()
        .skip(start_line - 1)
        .take(end_line - start_line + 1)
    {
        let l = line.unwrap();
        writeln!(writer, "{}", l).unwrap();
    }
}
/// This (recursive) function compares consecutive CRCs to detect cut/paste code.
/// For a given starting CRC there is a list of (file_index, line_index) tuples.
/// We check all (file_index, line_index + 1).
/// If they all point to the same CRC we loop (cut/paste detected).
/// If any next line diverges, we store the current results (if the minimum
/// number of lines is reached).
/// We recurse with all "new" CRCs until there is no next line.
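///
/// Worked example (illustrative): suppose files A and B share the line-hash
/// sequence `h1 h2 h3` and then diverge (A continues with `h4`, B with `h5`).
/// Starting from `h1`, each iteration advances every (file, line) cursor by
/// one line and grows the window: `h1` -> `h1 h2` -> `h1 h2 h3`. At the
/// divergence the window of size 3 is flushed (if it meets
/// `min_window_size`), keyed by a CRC of the concatenated line hashes, and
/// the walk recurses separately into the `h4` and `h5` branches.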
fn walk_graph(
    results: &Arc<RwLock<CpdResults>>,
    min_window_size: usize,
    window_size: WindowSize,
    depth: CallDepth,
    files_inf: &[FileInfo],
    hash_graph: &HashMap<CpdHash, HashSet<(u32, u32)>>,
    current_lines: &mut Vec<(CpdHash, FileIndex, LineIndex)>,
) {
    let mut window_size = window_size;
    loop {
        window_size.0 += 1;
        let mut to_flush = false;
        let mut next_lines: Vec<(CpdHash, FileIndex, LineIndex)> =
            Vec::with_capacity(current_lines.len());
        let mut sub_graph: HashMap<CpdHash, HashSet<(u32, u32)>> = HashMap::new();
        // TODO: fn compute_next_lines(&hash_graph, &mut to_flush, &mut next_lines)
        // build vec from current_lines<(fi,li)> -> next_lines<(fi,li+1)>
        for e in current_lines.iter() {
            let finfo = &files_inf[e.1 as usize];
            let next_li = (e.2 + 1) as usize;
            // reached end-of-file ?
            if next_li >= finfo.lines.len() {
                if window_size.0 >= min_window_size && current_lines.len() > 1 {
                    // flush only if there are at least two files
                    to_flush = true;
                }
                // eof reached, process the next cursor
                continue;
            }
            // get the next line's hash
            let next_hash = finfo.lines[next_li].hash;
            if let Some(lines) = hash_graph.get(&next_hash) {
                if lines.len() > 1 {
                    // at least 2 lines
                    next_lines.push((next_hash, e.1, next_li as u32));
                    // we must keep track of the next lines and build a new graph;
                    // if there is more than one hash we will recurse
                    sub_graph
                        .entry(next_hash)
                        .or_insert_with(HashSet::new)
                        .insert((e.1, next_li as u32));
                } else if window_size.0 >= min_window_size && current_lines.len() > 1 {
                    // unique line detected: must flush and restart the window;
                    // when new file(s) are detected a new window must be inserted
                    to_flush = true;
                }
            } else if window_size.0 >= min_window_size && current_lines.len() > 1 {
                // unique line detected: must flush and restart the window;
                // when new file(s) are detected a new window must be inserted
                to_flush = true;
            }
        }
        // will recurse
        if sub_graph.len() > 1 && window_size.0 >= min_window_size {
            to_flush = true;
        }
        // compute the result before recursion
        if to_flush {
            // build a hash of all the window's line hashes and insert it in 2 maps:
            //   window_size_map:       window size (u32) -> hset(window_hash)
            //   window_to_files_range: window_hash -> hset((file_index, start_index))
            let hash_concat = {
                let window_size = window_size.0 as u32;
                let mut window_digest = crc64::Digest::new(crc64::ECMA);
                let e = &current_lines[0];
                let (fi, li) = (e.1 as usize, e.2 as usize);
                let finfo = &files_inf[fi];
                let mut bytes =
                    Vec::with_capacity((window_size as usize) * mem::size_of::<CpdHash>());
                for i in 0..(window_size as usize) {
                    let h = finfo.lines[i + (li + 1) - (window_size as usize)].hash;
                    bytes.write_u64::<BigEndian>(h).unwrap();
                }
                window_digest.write(&bytes);
                window_digest.sum64()
            };
            // insert the window hash
            {
                let window_size = window_size.0 as u32;
                results
                    .write()
                    .unwrap()
                    .window_size_map
                    .entry(window_size)
                    .or_insert_with(HashSet::new)
                    .insert(hash_concat);
            }
            for e in current_lines.iter() {
                let (fi, li) = (e.1 as usize, e.2 as usize);
                // NB: "(li + 1) - window_size.0"
                // and not "li - window_size.0 + 1" to avoid underflow
                let l_start_index = (li + 1) - window_size.0;
                results
                    .write()
                    .unwrap()
                    .window_to_files_range
                    .entry(hash_concat)
                    .or_insert_with(HashSet::new)
                    .insert((fi as FileIndex, l_start_index as LineNumber));
            }
        }
        // unique hash ? loop
        if sub_graph.len() == 1 {
            // prepare the next cursors
            *current_lines = next_lines.clone();
            continue;
        }
        // for each "new" crc
        for (hash, hash_set) in &sub_graph {
            let mut current_lines: Vec<(CpdHash, FileIndex, LineIndex)> = Vec::new();
            for set in hash_set.iter() {
                current_lines.push((*hash, set.0, set.1));
            }
            walk_graph(
                results,
                min_window_size,
                WindowSize(window_size.0),
                CallDepth(depth.0 + 1),
                files_inf,
                hash_graph,
                &mut current_lines,
            );
        }
        break;
    }
}
/// This function detects the starting point of a cut/paste region.
/// It simply compares the CRCs of the previous line of each cursor.
/// If they diverge (or some previous line cannot be computed), you have your
/// starting point.
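///
/// For example (illustrative): if the candidate lines are line 10 of file A
/// and line 20 of file B, and lines 9/19 also share a hash but lines 8/18 do
/// not, the cursors are rewound to lines 9 and 19 and the function returns.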
fn get_first_common_line(
    files_inf: &[FileInfo],
    current_lines: &mut Vec<(CpdHash, FileIndex, LineIndex)>,
) {
    let mut previous_lines: Vec<(CpdHash, FileIndex, LineIndex)> = Vec::new();
    if current_lines.is_empty() {
        return;
    }
    loop {
        let mut prev_hash: CpdHash = 0;
        for (index, l) in current_lines.iter().enumerate() {
            let &(_, fi, li) = l;
            if li == 0 {
                // cannot compute the previous line
                return;
            }
            let finfo = &files_inf[fi as usize];
            let prev_li = li - 1;
            let hash = finfo.lines[prev_li as usize].hash;
            if index > 0 && hash != prev_hash {
                // found another hash
                return;
            }
            prev_hash = hash;
            previous_lines.push((prev_hash, fi, prev_li));
        }
        current_lines.clear();
        current_lines.append(&mut previous_lines);
    }
}
fn filter_common_starting_point(
    n_jobs: usize,
    hash_vec: Vec<CpdHash>,
    files_inf: &Arc<RwLock<Vec<FileInfo>>>,
    hash_graph: &Arc<RwLock<HashMap<CpdHash, HashSet<(FileIndex, LineIndex)>>>>,
) -> Vec<CpdHash> {
    let nr_hash = hash_vec.len();
    let hash_vec = Arc::new(hash_vec);
    let idx = Arc::new(AtomicUsize::new(0));
    let filtered_hash = Arc::new(RwLock::new(HashSet::<CpdHash>::new()));
    let mut handles = vec![];
    eprintln!("filtering starting points...");
    for th_idx in 0..n_jobs {
        let idx = Arc::clone(&idx);
        let files_inf = Arc::clone(&files_inf);
        let hash_vec = Arc::clone(&hash_vec);
        let hash_graph = Arc::clone(&hash_graph);
        let filtered_hash = Arc::clone(&filtered_hash);
        let th_name = format!("hash_filter_{}", th_idx);
        let builder = thread::Builder::new().name(th_name);
        let handle = builder
            .spawn(move || loop {
                let i = idx.fetch_add(1, Ordering::SeqCst);
                if i >= nr_hash {
                    return;
                }
                let files_inf = files_inf.read().unwrap();
                let hash = hash_vec[i];
                let mut current_lines: Vec<(CpdHash, FileIndex, LineIndex)> = Vec::new();
                let hash_graph = hash_graph.read().unwrap();
                let hash_set = hash_graph.get(&hash);
                for set in hash_set.unwrap().iter() {
                    current_lines.push((hash, set.0, set.1));
                }
                get_first_common_line(&files_inf, &mut current_lines);
                // println!("hash {} -> start @ hash {}", hash, current_lines[0].0);
                let hash = current_lines[0].0;
                let mut filtered_hash = filtered_hash.write().unwrap();
                filtered_hash.insert(hash);
            })
            .unwrap();
        handles.push(handle);
    }
    for handle in handles {
        handle.join().unwrap();
    }
    eprintln!("done");
    let filtered_hash = filtered_hash.read().unwrap();
    let mut hash_vec = Vec::with_capacity(filtered_hash.len());
    eprintln!(
        "reduced {} hashes to {} common starting points...",
        nr_hash,
        filtered_hash.len()
    );
    hash_vec.extend(filtered_hash.iter());
    hash_vec
}
fn parse_graph(
    n_jobs: usize,
    stack_size: usize,
    hash_graph: &Arc<RwLock<HashMap<CpdHash, HashSet<(FileIndex, LineIndex)>>>>,
    hash_vec: Vec<CpdHash>,
    files_inf: &Arc<RwLock<Vec<FileInfo>>>,
    results: &Arc<RwLock<CpdResults>>,
    min_window_size: usize,
) {
    eprintln!("parsing graph nodes...");
    let nr_hash = hash_vec.len();
    let hash_vec = Arc::new(RwLock::new(hash_vec));
    let idx = Arc::new(AtomicUsize::new(0));
    let mut handles = vec![];
    for th_idx in 0..n_jobs {
        let results = Arc::clone(&results);
        let idx = Arc::clone(&idx);
        let hash_graph = Arc::clone(hash_graph);
        let hash_vec = Arc::clone(&hash_vec);
        let files_inf = Arc::clone(files_inf);
        let th_name = format!("worker_{}", th_idx);
        let builder = thread::Builder::new().name(th_name).stack_size(stack_size);
        let handle = builder
            .spawn(move || loop {
                let i = idx.fetch_add(1, Ordering::SeqCst);
                if i >= nr_hash {
                    return;
                }
                let hash_graph = hash_graph.read().unwrap();
                let hash_vec = hash_vec.read().unwrap();
                let files_inf = files_inf.read().unwrap();
                let hash = hash_vec[i];
                let hash_set = hash_graph.get(&hash);
                let mut current_lines: Vec<(CpdHash, FileIndex, LineIndex)> = Vec::new();
                // eprintln!("hash 0x{:x} {{", hash);
                for set in hash_set.unwrap().iter() {
                    current_lines.push((hash, set.0, set.1));
                }
                walk_graph(
                    &results,
                    min_window_size,
                    WindowSize(0),
                    CallDepth(1),
                    &files_inf,
                    &hash_graph,
                    &mut current_lines,
                );
            })
            .unwrap();
        handles.push(handle);
    }
    for handle in handles {
        handle.join().unwrap();
    }
    eprintln!("done");
}
fn print_results(files_inf: &RwLock<Vec<FileInfo>>, results: &RwLock<CpdResults>) {
    let files_inf = files_inf.read().unwrap();
    let map = &results.read().unwrap().window_size_map;
    let mut window_hash_vec = Vec::<(u32, &HashSet<CpdHash>)>::new();
    // map -> vec
    for (k, v) in map {
        window_hash_vec.push((*k, v));
    }
    // sort by window size, largest first
    window_hash_vec.sort_unstable_by(|a, b| (b.0).cmp(&a.0));
    let file_map = &results.read().unwrap().window_to_files_range;
    // print all windows
    for w in &window_hash_vec {
        let (window_size, cpd_hashes) = (w.0 as usize, w.1);
        // sort the hashes for a stable output order
        let mut h_vec = Vec::new();
        h_vec.extend(cpd_hashes.iter());
        h_vec.sort();
        for hash in h_vec.iter() {
            println!("\n\nfound window size {}", window_size);
            println!("checking {:?}", hash);
            if let Some(hset) = file_map.get(hash) {
                // sort by file index, then by start index
                let mut v = Vec::new();
                for (fi, li) in hset.iter() {
                    v.push((*fi as usize, *li as usize));
                }
                v.sort_unstable_by(|a, b| {
                    let res = a.0.cmp(&b.0);
                    if res == std::cmp::Ordering::Equal {
                        a.1.cmp(&b.1)
                    } else {
                        res
                    }
                });
                for (index, (fi, li)) in v.iter().enumerate() {
                    let finfo = &files_inf[*fi];
                    let l_start = finfo.lines[*li].line_number;
                    let l_end = finfo.lines[*li + window_size - 1].line_number;
                    println!("{} : {}-{}", finfo.filename, l_start, l_end);
                    // print the duplicated block once, after the last location
                    if index + 1 == v.len() {
                        println!("8<----------------");
                        print_file_lines(&finfo.filename, l_start, l_end);
                        println!("8<----------------");
                    }
                }
            }
        }
    }
}
fn read_file(filename: &str) -> Vec<String> {
    let mut v = Vec::new();
    let file = File::open(filename);
    let file = match file {
        Ok(file) => file,
        _ => {
            eprintln!("cannot open {}", filename);
            return v;
        }
    };
    let buf_reader = BufReader::new(file);
    for line in buf_reader.lines() {
        v.push(line.unwrap());
    }
    v
}
fn parse_command_line() -> Config {
    let matches = App::new("cpd")
        .version(VERSION)
        .author("Carl-Erwin Griffith <carl.erwin@gmail.com>")
        .about("simple cut/paste detector")
        .arg(
            Arg::with_name("THREAD")
                .short("t")
                .long("thread")
                .value_name("NUMBER")
                .help("sets the number of threads used to scan the files")
                .takes_value(true),
        )
        .arg(
            Arg::with_name("STACK_SIZE")
                .short("s")
                .long("stack_size")
                .value_name("STACK_SIZE")
                .help("sets the worker thread stack size in bytes (default 64 MiB)")
                .takes_value(true),
        )
        .arg(
            Arg::with_name("FL")
                .short("l")
                .long("list")
                .value_name("FL")
                .help("this file contains the list of files to scan")
                .takes_value(true),
        )
        .arg(
            Arg::with_name("WINDOW")
                .short("w")
                .long("min_window")
                .value_name("WINDOW")
                .help("sets the minimum number of lines to consider in the results")
                .takes_value(true),
        )
        .arg(
            Arg::with_name("PR")
                .short("r")
                .long("skip-results")
                .help("do not print the results"),
        )
        .arg(
            Arg::with_name("FILES")
                .help("list of the files to scan")
                .required(false)
                .multiple(true),
        )
        .get_matches();
    let nr_threads = if matches.is_present("THREAD") {
        let nr_threads = matches.values_of("THREAD").unwrap().collect::<String>();
        nr_threads.trim_end().parse::<usize>().unwrap_or(1)
    } else {
        num_cpus::get()
    };
    let nr_threads = ::std::cmp::max(1, nr_threads);
    let stack_size = if matches.is_present("STACK_SIZE") {
        let stack_size = matches.values_of("STACK_SIZE").unwrap().collect::<String>();
        stack_size
            .trim_end()
            .parse::<usize>()
            .unwrap_or(64 * 1024 * 1024)
    } else {
        64 * 1024 * 1024
    };
    let stack_size = ::std::cmp::max(8192, stack_size);
    let min_window_size = if matches.is_present("WINDOW") {
        let min_window_size = matches.values_of("WINDOW").unwrap().collect::<String>();
        min_window_size
            .trim_end()
            .parse::<usize>()
            .unwrap_or(DEFAULT_MIN_WINDOW_SIZE)
    } else {
        DEFAULT_MIN_WINDOW_SIZE
    };
    let mut files = Vec::new();
    if matches.is_present("FL") {
        let filename = matches.values_of("FL").unwrap().collect::<String>();
        files.extend(read_file(&filename));
    }
    if matches.is_present("FILES") {
        let list: Vec<String> = matches
            .values_of("FILES")
            .unwrap()
            .map(|x| x.to_owned())
            .collect();
        files.extend(list);
    }
    let print_results = !matches.is_present("PR");
    Config {
        nr_threads,
        files,
        min_window_size,
        stack_size,
        print_results,
    }
}
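// Example invocation (file names are hypothetical):
//
//     cpd -t 4 -w 5 src/main.rs src/lib.rs
//
// scans both files with 4 worker threads and only reports duplicated
// regions of at least 5 non-blank lines.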
fn main() {
    let config = parse_command_line();
    // dedup the file list
    let set: HashSet<String> = config.files.into_iter().collect();
    let mut args: Vec<String> = set.into_iter().collect();
    // sort the file list
    args.sort();
    let results = Arc::new(RwLock::new(CpdResults::new()));
    // prepare the file slots
    let mut v = Vec::<FileInfo>::with_capacity(args.len());
    for (i, item) in args.iter().enumerate() {
        v.push(FileInfo {
            file_index: i,
            filename: item.clone(),
            lines: Vec::new(),
        });
    }
    let files_inf = Arc::new(RwLock::new(v));
    let (hash_vec, hash_graph) = parse_files(config.nr_threads, args.len(), &files_inf);
    let hash_vec =
        filter_common_starting_point(config.nr_threads, hash_vec, &files_inf, &hash_graph);
    parse_graph(
        config.nr_threads,
        config.stack_size,
        &hash_graph,
        hash_vec,
        &files_inf,
        &results,
        config.min_window_size,
    );
    if config.print_results {
        print_results(&files_inf, &results);
    }
}