# Example usage

To use `deduplication` in a project:

In [4]:
import deduplication
from utils.utils import read_tsv
from deduplication.LSH import LSH
from deduplication.LSHImproved import LSHImproved
from deduplication.LSHForest import LSHForest
from deduplication.dedup import Baseline
from utils.use_cases import collection_deduplication, nearest_neighbor_search

print(deduplication.__version__)

0.1.0


## LSH Baseline

In [22]:
tsv_dict = read_tsv('../data/onek.tsv')

num_hashes = 100
num_bands = 20
rows_per_band = 5
k = 10

lsh = LSH(num_hashes=num_hashes, num_bands=num_bands, rows_per_band=rows_per_band, k=k)
signatures = lsh.compute_minhash_signatures(tsv_dict)
clusters = collection_deduplication(lsh)

for key, value in list(clusters.items())[:20]:
    print(key, value)

1 [1]
2 [2]
3 [3]
4 [4]
5 [5]
6 [6]
7 [7]
8 [8]
9 [9]
10 [10]
11 [11]
12 [12]
13 [13]
14 [14]
15 [15]
16 [16, 41, 299, 356, 745]
17 [17]
18 [18]
19 [19]
20 [20, 266, 972]


## LSH Multi Probe

In [23]:
tsv_dict = read_tsv('../data/onek.tsv')

num_hashes = 100
num_bands = 20
rows_per_band = 5
k = 10

lsh = LSHImproved(num_hashes=num_hashes, num_bands=num_bands, rows_per_band=rows_per_band, k=k)
signatures = lsh.compute_minhash_signatures(tsv_dict)
clusters = collection_deduplication(lsh)
for key, value in list(clusters.items())[:20]:
    print(key, value)

1 [1]
2 [2]
3 [3]
4 [4]
5 [5]
6 [6]
7 [7]
8 [8]
9 [9]
10 [10]
11 [11]
12 [12]
13 [13]
14 [14]
15 [15]
16 [16, 41, 299, 356, 745]
17 [17]
18 [18]
19 [19]
20 [20, 266, 972]


## LSH Forest

In [24]:
tsv_dict = read_tsv('../data/onek.tsv')

num_hashes = 200
num_bands = 10
rows_per_band = 5
num_trees = 4
k = 10

lsh = LSHForest(num_hashes=num_hashes, num_bands=num_bands, rows_per_band=rows_per_band, k=k, num_trees=num_trees)
signatures = lsh.compute_minhash_signatures(tsv_dict)
clusters = collection_deduplication(lsh)
for key, value in list(clusters.items())[:20]:
    print(key, value)

1 [1]
2 [2]
3 [3]
4 [4]
5 [5]
6 [6]
7 [7]
8 [8]
9 [9]
10 [10]
11 [11]
12 [12]
13 [13]
14 [14]
15 [15]
16 [16, 41, 299, 356, 745]
17 [17]
18 [18]
19 [19]
20 [20, 266, 972]


## Baseline

In [25]:
tsv_dict = read_tsv('../data/onek.tsv')
base = Baseline()
clusters = base.collection_deduplication(tsv_dict)
for key, value in list(clusters.items())[:20]:
    print(key, value)

1 [1]
2 [2]
3 [3]
4 [4]
5 [5]
6 [6]
7 [7]
8 [8]
9 [9]
10 [10]
11 [11]
12 [12]
13 [13]
14 [14]
15 [15]
16 [16, 41, 299, 356, 745]
17 [17]
18 [18]
19 [19]
20 [20, 266, 972]


## How to run package from a terminal

Arguments:
- -d, --indir (str): Required. Path to the input TSV file (e.g. './data/onek.tsv').
- -t, --case (str): Required. Type of use case, e.g. 'deduplication' or 'ann' (see the example commands below).
- -s, --save (str): Optional. Whether to output results to a text file ('y' or 'n').
- -e, --example (str): Optional. Document to query.
- -n, --numhash (int): Optional. Number of hash functions to use.
- -b, --numband (int): Optional. Number of bands.
- -r, --row (int): Optional. Number of rows per band.
- -k, --shinlen (int): Optional. Length of shingles.
- -c, --treesize (int): Optional. Number of trees (the `num_trees` argument of `LSHForest`).
- -m, --method (str): Optional. Default is 'LSH'. Specifies the method to use. Options: 'baseline', 'LSH', 'LSH_mp', 'LSH_forest'.

Terminal Code:
- python -m deduplication -d './data/onek.tsv' -t 'deduplication' -s 'y'
- python -m deduplication -d './data/threehundred.tsv' -t 'deduplication'
- python -m deduplication -d './data/threehundred.tsv' -t 'deduplication' -m 'baseline'
- python -m deduplication -d './data/hundred.tsv' -t 'ann' -e 'this is a blank statement'
- python -m deduplication -d './data/onek.tsv' -t 'deduplication' -m "LSH_forest"
- python -m deduplication -d './data/onek.tsv' -t 'deduplication' -m "LSH_forest" -n 200 -b 10 -r 5 -c 4