In [None]:
# Informatics code for Younginger, Stewart, Balkan and Ballhorn
## "Stable coexistence or competitive exclusion? 
## Fern endophytes shift our understanding of microbial temporal turnover"

In [None]:
## Software needed for analysis:
FastQC v0.11.5
usearch v10.0.274M_i86linux64
jun_jul_renamer.sh
control_renamer.sh
R1_renamer.sh
cutadapt v1.10 (Python 2.7.7)
udb_cleanup.py


## Sample processing

### Concatenate the R1 reads since they were demultiplexed:

cat *_R1_* > ../FastQC/jun_jul_R1.fastq.gz

##### Total raw R1 reads for each month #####

- Jun_Jul: 2605987
- Aug_Sep: 2440660
- Oct_Nov: 3310806
- Dec_Jan: 2465935
- Apr_May before demultiplexing: 15566066
- demultiplexed Apr_May: 7955646

## Demultiplex April May reads

usearch10.0.274M_i86linux64 -fastx_demux Undetermined_S0_L001_R1_001.fastq -reverse Undetermined_S0_L001_R2_001.fastq -index Undetermined_S0_L001_I1_001.fastq -barcodes barcodes_6.fa -fastqout apr_may_R1.fastq -output2 apr_may_R2.fastq

	03:30 110Mb   100.0% Demuxed 7955646 / 15566066 (51.1%)

## Consider using only R1 reads and also merging, running the analyses side-by-side to see how the results differ...
	## Will merge first though

## Also, need to look at how the files are labeled and rename the control files to match the appropriate formatting
	## Jun_Jul will need to be renamed to match the other months, too!

# Changed the Jun_Jul samples with the following script:

jun_jul_renamer.sh

# Will now change up the names of the controls in the remaining samples

control_renamer.sh

## Merging reads

## Apr_May

usearch10.0.274M_i86linux64 -fastq_mergepairs apr_may_R1.fastq -fastqout ../Merged_reads/apr_may.fastq -fastq_maxdiffs 300


Merging
  Fwd apr_may_R1.fastq
  Rev apr_may_R2.fastq
  Keep read labels

00:01 71Mb   CPU has 28 cores, defaulting to 10 threads
01:14 751Mb   100.0% 91.4% merged

Totals:
   7955646  Pairs (8.0M)
   7274761  Merged (7.3M, 91.44%)
   1423348  Alignments with zero diffs (17.89%)
    641535  Too many diffs (> 300) (8.06%)
     39350  No alignment found (0.49%)
         0  Alignment too short (< 16) (0.00%)
   7469521  Staggered pairs (93.89%) merged & trimmed
    205.78  Mean alignment length
    209.40  Mean merged length
      0.33  Mean fwd expected errors
      1.90  Mean rev expected errors
      0.14  Mean merged expected errors
      
## Jun_Jul

usearch10.0.274M_i86linux64 -fastq_mergepairs *R1*.fastq -fastqout ../Merged_reads/jun_jul.fastq -relabel @ -fastq_maxdiffs 300

00:18 765Mb  CPU has 28 cores, defaulting to 10 threads
00:18 765Mb   100.0% 91.9% merged

Totals:
   2605987  Pairs (2.6M)
   2395636  Merged (2.4M, 91.93%)
    193274  Alignments with zero diffs (7.42%)
    175297  Too many diffs (> 300) (6.73%)
     35054  No alignment found (1.35%)
         0  Alignment too short (< 16) (0.00%)
   2344067  Staggered pairs (89.95%) merged & trimmed
    199.25  Mean alignment length
    216.07  Mean merged length
      0.64  Mean fwd expected errors
      1.70  Mean rev expected errors
      0.19  Mean merged expected errors

#### Wondering if I need to deal with differences in header lines between Apr_May and the rest of the files:

@M02149:71:000000000-AAMFM:1:1101:13889:1730;sample=BY-12-1-5;

@BY-10-1-6.2

## Will try to address it with the following argument: -relabel @

usearch10.0.274M_i86linux64 -fastq_mergepairs apr_may_R1.fastq -fastqout ../Merged_reads/apr_may_V2.fastq -relabel @ -fastq_maxdiffs 300
	# Nope, that didn't do it. Hopefully it will get taken care of later in processing  
    
## Aug_Sep

usearch10.0.274M_i86linux64 -fastq_mergepairs *R1*.fastq -fastqout ../Merged_reads/aug_sep.fastq -relabel @ -fastq_maxdiffs 300

00:15 765Mb  CPU has 28 cores, defaulting to 10 threads
00:15 765Mb   100.0% 54.8% merged

Totals:
   2440660  Pairs (2.4M)
   1337858  Merged (1.3M, 54.82%)
      9220  Alignments with zero diffs (0.38%)
   1083301  Too many diffs (> 300) (44.39%)
         4  Fwd tails Q <= 2 trimmed (0.00%)
        41  Rev tails Q <= 2 trimmed (0.00%)
        87  Fwd too short (< 64) after tail trimming (0.00%)
       309  Rev too short (< 64) after tail trimming (0.01%)
     19105  No alignment found (0.78%)
         0  Alignment too short (< 16) (0.00%)
   2306121  Staggered pairs (94.49%) merged & trimmed
    201.15  Mean alignment length
    210.86  Mean merged length
      1.16  Mean fwd expected errors
      3.87  Mean rev expected errors
      0.62  Mean merged expected errors

# Too many diffs resulted in 44.39% of the reads getting discarded! Did this happen before? 
	# May need to just continue with the R1s...
	# But what if I run it with 400 max diffs? 

usearch10.0.274M_i86linux64 -fastq_mergepairs *R1*.fastq -fastqout ../Merged_reads/aug_sep_V2.fastq -relabel @ -fastq_maxdiffs 400
	# Results in the exact same output. Really weird and an indication that I should be using the R1 reads only

## Oct_Nov

usearch10.0.274M_i86linux64 -fastq_mergepairs *R1*.fastq -fastqout oct_nov.fastq -relabel @ -fastq_maxdiffs 300
	# Need to move this into the Merged_reads directory since I didn't specify

00:21 765Mb  CPU has 28 cores, defaulting to 10 threads
00:21 765Mb   100.0% 90.4% merged

Totals:
   3310806  Pairs (3.3M)
   2992916  Merged (3.0M, 90.40%)
     34999  Alignments with zero diffs (1.06%)
    306191  Too many diffs (> 300) (9.25%)
     11699  No alignment found (0.35%)
         0  Alignment too short (< 16) (0.00%)
   2946717  Staggered pairs (89.00%) merged & trimmed
    197.32  Mean alignment length
    219.33  Mean merged length
      0.64  Mean fwd expected errors
      2.59  Mean rev expected errors
      0.58  Mean merged expected errors

## Dec_Jan

usearch10.0.274M_i86linux64 -fastq_mergepairs *R1*.fastq -fastqout ../Merged_reads/dec_jan.fastq -relabel @ -fastq_maxdiffs 300

00:16 765Mb  CPU has 28 cores, defaulting to 10 threads
00:16 765Mb   100.0% 94.2% merged

Totals:
   2465935  Pairs (2.5M)
   2323617  Merged (2.3M, 94.23%)
     63861  Alignments with zero diffs (2.59%)
    133325  Too many diffs (> 300) (5.41%)
      8993  No alignment found (0.36%)
         0  Alignment too short (< 16) (0.00%)
   2158636  Staggered pairs (87.54%) merged & trimmed
    196.24  Mean alignment length
    221.91  Mean merged length
      0.80  Mean fwd expected errors
      2.38  Mean rev expected errors
      0.33  Mean merged expected errors

## Put them all together

cat *.fastq > temporal_merged.fastq

## Now look into renaming the R1 files and concatenating them

## Need to relabel all the header lines with the fastx_relabel command with usearch
	## This should be easy enough with a shell script, but need to deal with the Jun_Jul samples
	## and the controls. I should make a separate script for each month-group
	
# Used the following script to rename the R1s only:

R1_renamer.sh

# Concerned about the differences in the header lines between Apr_May and the rest, but will have
	# to wait and see how usearch handles it
	
cat * > ../../Forward_reads/jun_jul_R1.fastq
	# Did the same for the remaining months

# Okay, the merged and R1 files are in the following directories, respectively:

/workspace/scratch/obrett/Temporal/Merged_reads/temporal_merged.fastq
	### Read total: 16324788 ###

/workspace/scratch/obrett/Temporal/Forward_reads/temporal_R1.fastq
	### Read total: 18779034 ###

# Will work on Cutadapt next:

## R1 reads only:

cutadapt -a GATCTCTTGGNTCTNGCATCGATGAAGAACG -q 20 -e 0.2 temporal_R1.fastq -o temporal_R1_cutadapt.fastq

This is cutadapt 1.10 with Python 2.7.7
Command line parameters: -a GATCTCTTGGNTCTNGCATCGATGAAGAACG -q 20 -e 0.2 temporal_R1.fastq -o temporal_R1_cutadapt.fastq
Trimming 1 adapter with at most 20.0% errors in single-end mode ...


Finished in 474.03 s (25 us/read; 2.38 M reads/minute).

=== Summary ===

Total reads processed:              18,779,034
Reads with adapters:                17,002,958 (90.5%)
Reads written (passing filters):    18,779,034 (100.0%)

Total basepairs processed: 4,709,405,469 bp
Quality-trimmed:              56,305,856 bp (1.2%)
Total written (filtered):  3,905,674,514 bp (82.9%)

=== Adapter 1 ===

Sequence: GATCTCTTGGNTCTNGCATCGATGAAGAACG; Type: regular 3'; Length: 31; Trimmed: 17002958 times.

No. of allowed errors:
0-4 bp: 0; 5-9 bp: 1; 10-14 bp: 2; 15-19 bp: 3; 20-24 bp: 4; 25-29 bp: 5; 30-31 bp: 6

Bases preceding removed adapters:
  A: 0.3%
  C: 0.3%
  G: 98.6%
  T: 0.9%
  none/other: 0.0%
WARNING:
    The adapter is preceded by "G" extremely often.
    The provided adapter sequence may be incomplete.
    To fix the problem, add "G" to the beginning of the adapter sequence.

Overview of removed sequences
length	count	expect	max.err	error counts
3	31375	293422.4	0	31375
4	28238	73355.6	0	5937 22301
5	22333	18338.9	1	7998 14335
6	6189	4584.7	1	2531 3658
7	22495	1146.2	1	18857 3638
8	47008	286.5	1	40986 5554 468
9	8274	71.6	1	5165 1596 1513
10	7200	17.9	2	3041 1185 2974
11	7357	4.5	2	4212 1968 1177
12	16153	1.1	2	11227 2929 1952 45
13	7455	0.3	2	4252 1870 993 340
14	27889	0.1	2	18397 6911 2338 243
15	17096	0.0	3	10683 4247 1596 570
16	26221	0.0	3	18821 4962 1653 784 1
17	27322	0.0	3	19179 5166 1657 1061 259
18	12899	0.0	3	8406 2558 1119 472 344
19	23484	0.0	3	16584 3758 1673 665 804
20	45336	0.0	4	27816 11078 4132 1527 781 2
21	52374	0.0	4	27228 14602 3381 1243 5917 3
22	21090	0.0	4	6973 6599 1875 904 4643 96
23	50084	0.0	4	31639 11956 3998 1599 870 22
24	73617	0.0	4	48967 15172 6296 2016 1082 78 6
25	50452	0.0	5	35043 9472 3187 1498 718 450 84
26	263347	0.0	5	224053 29662 5620 2044 1133 739 96
27	194213	0.0	5	158479 23443 5682 2284 2518 1373 434
28	32278	0.0	5	14878 9939 4171 1836 978 450 26
29	98665	0.0	5	58998 19428 8475 4134 5962 1630 38
30	53098	0.0	6	21977 12658 6444 3048 7422 1165 384
31	45000	0.0	6	21867 11834 6037 2779 1468 663 352
32	70381	0.0	6	37861 17673 8230 3811 1644 780 382
33	69273	0.0	6	38613 16868 7638 3392 1603 740 419
34	68984	0.0	6	31130 15773 7605 5557 3923 2949 2047
35	87784	0.0	6	47293 20269 8094 5650 3210 2079 1189
36	260133	0.0	6	156882 57877 20757 12729 6437 3657 1794
37	913918	0.0	6	703775 123618 35418 23912 10373 5127 11695
38	407887	0.0	6	315044 53881 15420 10661 4746 2512 5623
39	255709	0.0	6	197932 33309 9528 6523 2912 1697 3808
40	227071	0.0	6	176411 28766 8478 5534 2515 1515 3852
41	226038	0.0	6	174433 28484 8465 5264 2401 1519 5472
42	232753	0.0	6	182365 28607 8154 5525 2642 1690 3770
43	371030	0.0	6	296476 41970 11727 8155 3838 2487 6377
44	572833	0.0	6	457233 61932 18349 11896 5632 3616 14175
45	867901	0.0	6	702918 93397 25678 18176 8691 5868 13173
46	3276664	0.0	6	2772634 293239 72195 50987 23560 16056 47993
47	6077651	0.0	6	5264534 454810 110150 77091 35670 25385 110011
48	35061	0.0	6	20401 5610 1982 1621 1468 1932 2047
49	43796	0.0	6	31231 7787 2047 1155 665 450 461
50	89724	0.0	6	72792 10909 2092 2001 808 535 587
51	831068	0.0	6	748778 62519 7447 8464 1487 865 1508
52	50675	0.0	6	38116 7338 1825 1258 680 711 747
53	36339	0.0	6	30508 3970 817 283 190 136 435
54	32163	0.0	6	25122 4809 1033 379 208 167 445
55	13365	0.0	6	10637 1654 358 265 140 127 184
56	101183	0.0	6	87445 10080 1291 1342 531 218 276
57	15735	0.0	6	11335 3137 567 265 126 191 114
58	99392	0.0	6	87135 8479 1450 915 495 416 502
59	7373	0.0	6	5437 1056 301 210 105 113 151
60	15062	0.0	6	12341 1767 358 226 124 128 118
61	29378	0.0	6	24613 3249 604 342 169 167 234
62	8251	0.0	6	6638 809 237 169 112 116 170
63	9367	0.0	6	7864 819 199 186 107 75 117
64	2628	0.0	6	1326 687 208 142 99 85 81
65	23272	0.0	6	5081 14903 1863 562 471 219 173
66	44629	0.0	6	31068 7110 1902 1920 1315 755 559
67	7952	0.0	6	2744 2080 811 849 649 489 330
68	11784	0.0	6	8341 2245 542 272 161 82 141
69	4656	0.0	6	3109 948 232 142 77 64 84
70	1081	0.0	6	451 285 118 54 50 57 66
71	45798	0.0	6	36741 6419 887 534 470 407 340
72	44880	0.0	6	33993 6193 1815 915 535 425 1004
73	3502	0.0	6	2669 463 162 98 48 33 29
74	1742	0.0	6	1278 234 72 78 30 26 24
75	2289	0.0	6	1709 327 77 76 33 29 38
76	8822	0.0	6	6223 1890 284 175 80 55 115
77	11994	0.0	6	8002 2661 604 262 196 169 100
78	2080	0.0	6	1579 274 118 39 21 23 26
79	18698	0.0	6	13978 3910 443 130 133 61 43
80	2896	0.0	6	2332 376 91 47 24 15 11
81	1384	0.0	6	801 345 156 41 16 11 14
82	3716	0.0	6	2039 1161 355 76 39 29 17
83	10655	0.0	6	6310 3183 874 149 54 27 58
84	6795	0.0	6	4413 1579 573 108 48 31 43
85	2276	0.0	6	1419 579 157 43 48 16 14
86	1187	0.0	6	749 279 95 32 16 5 11
87	803	0.0	6	455 165 77 28 29 20 29
88	1420	0.0	6	761 265 121 97 68 68 40
89	275	0.0	6	175 56 21 7 6 4 6
90	422	0.0	6	299 79 16 10 7 8 3
91	286	0.0	6	192 49 14 13 7 7 4
92	634	0.0	6	492 73 23 13 15 9 9
93	109	0.0	6	62 34 8 2 1 0 2
94	142	0.0	6	84 34 13 2 4 2 3
95	239	0.0	6	130 58 18 9 8 8 8
96	488	0.0	6	308 91 37 19 9 12 12
97	324	0.0	6	152 119 16 10 5 11 11
98	797	0.0	6	475 153 44 50 33 23 19
99	403	0.0	6	32 276 52 15 11 11 6
100	77	0.0	6	29 34 7 5 0 1 1
101	119	0.0	6	45 49 13 3 4 3 2
102	207	0.0	6	126 51 12 5 6 4 3
103	545	0.0	6	388 100 23 8 11 2 13
104	78	0.0	6	36 25 8 4 1 2 2
105	130	0.0	6	52 51 11 6 4 2 4
106	197	0.0	6	68 84 29 4 2 4 6
107	190	0.0	6	87 54 36 4 2 2 5
108	205	0.0	6	95 66 31 5 4 1 3
109	365	0.0	6	85 134 108 16 12 5 5
110	957	0.0	6	169 397 303 48 11 13 16
111	2302	0.0	6	523 1372 248 71 39 26 23
112	1823	0.0	6	1368 240 77 53 18 14 53
113	22	0.0	6	17 2 0 1 1 1
114	15	0.0	6	11 4
115	22	0.0	6	16 4 0 0 0 2
116	31	0.0	6	26 4 0 0 0 0 1
117	40	0.0	6	29 4 1 3 0 0 3
118	127	0.0	6	104 20 1 1 0 1
119	361	0.0	6	296 50 6 5 1 2 1
120	24	0.0	6	21 2 1
121	8	0.0	6	4 2 0 0 1 0 1
122	5	0.0	6	0 2 3
123	6	0.0	6	5 0 0 0 0 1
124	1	0.0	6	1
125	6	0.0	6	3 0 1 0 1 1
126	9	0.0	6	7 1 0 1
127	14	0.0	6	7 5 1 1
128	15	0.0	6	7 2 4 0 0 0 2
129	6	0.0	6	5 0 1
130	14	0.0	6	13 0 1
131	12	0.0	6	8 1 1 0 0 2
132	26	0.0	6	19 2 4 0 0 0 1
133	32	0.0	6	23 4 3 0 0 0 2
134	33	0.0	6	25 4 4
135	115	0.0	6	89 15 4 4 1 1 1
136	393	0.0	6	337 27 16 9 1 1 2
137	17	0.0	6	6 0 11
138	65	0.0	6	13 1 46 4 0 1
139	149	0.0	6	13 0 124 10 1 1
140	31	0.0	6	27 1 0 1 1 1
141	90	0.0	6	83 5 1 0 0 1
142	262	0.0	6	223 25 6 2 1 2 3
143	23	0.0	6	18 4 0 0 1
144	45	0.0	6	33 10 0 0 1 0 1
145	9	0.0	6	7 1 1
146	9	0.0	6	8 1
147	37	0.0	6	34 2 1
148	99	0.0	6	91 3 3 0 0 0 2
149	3	0.0	6	3
151	1	0.0	6	0 1
152	2	0.0	6	0 2
153	1	0.0	6	0 1
156	1	0.0	6	0 1
157	1	0.0	6	0 1
158	2	0.0	6	0 2
159	5	0.0	6	0 5
160	15	0.0	6	0 15
164	1	0.0	6	0 1
176	2	0.0	6	2
192	1	0.0	6	0 1
196	4	0.0	6	0 4
197	8	0.0	6	0 8
209	1	0.0	6	0 0 0 0 0 0 1

WARNING:
    One or more of your adapter sequences may be incomplete.
    Please see the detailed output above.

### For paired reads which always overlap (per usearch docs):
"If the read length is long enough that the longest ITS sequence will give an overlap of at least, say, 
32 bases, then you don't need any additional trimming:  fastq_mergepairs does everything you need. Short amplicons will 
create "staggered" pairs which are correctly truncated during the merging."

/workspace/scratch/obrett/Temporal/Merged_reads

usearch10.0.274M_i86linux64 -fastq_filter temporal_merged.fastq -fastq_maxee 1.0 -fastaout temporal_merged_filtered.fasta

00:00 38Mb   CPU has 28 cores, defaulting to 10 threads
01:37 683Mb   100.0% Filtering, 93.8% passed
  16324788  Reads (16.3M)                   
   1007674  Discarded reads with expected errs > 1.00
  15317114  Filtered reads (15.3M, 93.8%)
  
/workspace/scratch/obrett/Temporal/Forward_reads

usearch10.0.274M_i86linux64 -fastq_filter temporal_R1_cutadapt.fastq -fastq_maxee 1.0 -fastaout temporal_R1_filtered.fasta

00:01 38Mb   CPU has 28 cores, defaulting to 10 threads
02:19 683Mb   100.0% Filtering, 83.9% passed
  18779034  Reads (18.8M)                   
   3031410  Discarded reads with expected errs > 1.00
  15747604  Filtered reads (15.7M, 83.9%)

# Okay, 15.3M reads in the merged dataset, 15.7M reads in the R1 dataset.
	# Looks like those same reads in the Aug_Sep group were kicked out

usearch10.0.274M_i86linux64 -fastx_uniques Merged_reads/temporal_merged_filtered.fasta -fastaout OTUs/merged_uniques.fasta -sizeout -relabel Uniq

00:44 4.7Gb   100.0% Reading Merged_reads/temporal_merged_filtered.fasta
00:44 4.7Gb  CPU has 28 cores, defaulting to 10 threads                 
00:54 8.2Gb   100.0% DF
01:03 8.3Gb  15317114 seqs, 802957 uniques, 660362 singletons (82.2%)
01:03 8.3Gb  Min size 1, median 1, max 9957620, avg 19.08
01:10 5.8Gb   100.0% Writing OTUs/merged_uniques.fasta

usearch10.0.274M_i86linux64 -fastx_uniques Forward_reads/temporal_R1_filtered.fasta -fastaout OTUs/R1_uniques.fasta -sizeout -relabel Uniq

00:46 4.8Gb   100.0% Reading Forward_reads/temporal_R1_filtered.fasta
00:46 4.7Gb  CPU has 28 cores, defaulting to 10 threads              
00:56 8.3Gb   100.0% DF
01:05 8.4Gb  15747604 seqs, 958884 uniques, 757129 singletons (79.0%)
01:05 8.4Gb  Min size 1, median 1, max 9174463, avg 16.42
01:12 5.9Gb   100.0% Writing OTUs/R1_uniques.fasta

## 2020-02-12

# Will cluster OTUs and denoise into ZOTUs for both the merged and R1 reads

usearch10.0.274M_i86linux64 -cluster_otus merged_uniques.fasta -otus merged_otus.fasta -relabel Otu

00:17 61Mb    100.0% 1301 OTUs, 320 chimeras

usearch10.0.274M_i86linux64 -cluster_otus R1_uniques.fasta -otus R1_otus.fasta -relabel Otu

00:19 60Mb    100.0% 1474 OTUs, 670 chimeras
	
# Already 173 more OTUs and 350 more chimeras in the R1 dataset

usearch10.0.274M_i86linux64 -unoise3 merged_uniques.fasta -zotus merged_zotus.fasta

00:03 333Mb   100.0% Reading merged_uniques.fasta
00:04 327Mb   100.0% 992 amplicons, 2636496 bad (size >= 8) 
00:20 338Mb   100.0% 985 good, 7 chimeras                  
00:20 338Mb   100.0% Writing zotus

# Wow, already > 300 fewer ZOTUs than OTUs

usearch10.0.274M_i86linux64 -unoise3 R1_uniques.fasta -zotus R1_zotus.fasta

00:02 316Mb   100.0% Reading R1_uniques.fasta
00:04 316Mb   100.0% 1187 amplicons, 3803606 bad (size >= 8) 
00:21 327Mb   100.0% 1180 good, 7 chimeras                  
00:21 327Mb   100.0% Writing zotus

# For constructing these tables, I'll want to use the following files: 
	temporal_merged.fastq
	temporal_R1_cutadapt.fastq
	
usearch10.0.274M_i86linux64 -otutab ../Merged_reads/temporal_merged.fastq -otus merged_otus.fasta -otutabout merged_otutab.txt -mapout merged_map.txt

00:00 43Mb    100.0% Reading merged_otus.fasta
00:00 9.3Mb   100.0% Masking (fastnucleo)
00:00 10Mb    100.0% Word stats
00:00 10Mb    100.0% Alloc rows
00:00 12Mb    100.0% Build index
00:00 45Mb   CPU has 28 cores, defaulting to 10 threads
52:57 737Mb   100.0% Searching temporal_merged.fastq, 99.8% matched
16291012 / 16324788 mapped to OTUs (99.8%)
52:57 737Mb  Writing merged_otutab.txt
52:57 737Mb  Writing merged_otutab.txt ...done.


usearch10.0.274M_i86linux64 -otutab ../Forward_reads/temporal_R1_cutadapt.fastq -otus R1_otus.fasta -otutabout R1_otutab.txt -mapout R1_map.txt

00:00 43Mb    100.0% Reading R1_otus.fasta
00:00 9.3Mb   100.0% Masking (fastnucleo)
00:00 11Mb    100.0% Word stats
00:00 11Mb    100.0% Alloc rows
00:00 12Mb    100.0% Build index
00:00 45Mb   CPU has 28 cores, defaulting to 10 threads
53:02 736Mb   100.0% Searching temporal_R1_cutadapt.fastq, 96.0% matched
18021157 / 18779034 mapped to OTUs (96.0%)
53:02 736Mb  Writing R1_otutab.txt
53:02 736Mb  Writing R1_otutab.txt ...done.

usearch10.0.274M_i86linux64 -otutab ../Merged_reads/temporal_merged.fastq -otus merged_zotus.fasta -otutabout merged_zotutab.txt -mapout merged_zmap.txt

00:00 43Mb    100.0% Reading merged_zotus.fasta
00:00 9.3Mb   100.0% Masking (fastnucleo)
00:00 10Mb    100.0% Word stats
00:00 10Mb    100.0% Alloc rows
00:00 11Mb    100.0% Build index
00:00 44Mb   CPU has 28 cores, defaulting to 10 threads
53:22 726Mb   100.0% Searching temporal_merged.fastq, 94.8% matched
15471210 / 16324788 mapped to OTUs (94.8%)
53:22 726Mb  Writing merged_zotutab.txt
53:22 726Mb  Writing merged_zotutab.txt ...done.

usearch10.0.274M_i86linux64 -otutab ../Forward_reads/temporal_R1_cutadapt.fastq -otus R1_zotus.fasta -otutabout R1_zotutab.txt -mapout R1_zmap.txt

00:00 43Mb    100.0% Reading R1_zotus.fasta
00:00 9.3Mb   100.0% Masking (fastnucleo)
00:00 10Mb    100.0% Word stats
00:00 10Mb    100.0% Alloc rows
00:00 11Mb    100.0% Build index
00:00 45Mb   CPU has 28 cores, defaulting to 10 threads
56:46 736Mb   100.0% Searching temporal_R1_cutadapt.fastq, 95.7% matched
17979390 / 18779034 mapped to OTUs (95.7%)                              
56:46 736Mb  Writing R1_zotutab.txt
56:46 736Mb  Writing R1_zotutab.txt ...done.

### Okay, think this is done for now. Will want to go back and run some of the QC checks, but will look at the files now:

# Also, should make a tarball of the Temporal/ directory now, too

## Overview of final outputs:

- merged otus: 16291012 / 16324788 mapped to OTUs (99.8%) - 1301 OTUs, 320 chimeras
- R1 otus: 18021157 / 18779034 mapped to OTUs (96.0%) - 1474 OTUs, 670 chimeras
- merged zotus: 15471210 / 16324788 mapped to OTUs (94.8%) - 985 good, 7 chimeras
- R1 zotus: 17979390 / 18779034 mapped to OTUs (95.7%) - 1180 good, 7 chimeras

## Would like to explore the crosstalk option now:
	## Instead, think I'm going to go through the QC pipeline Robert Edgar recommends:
		https://www.drive5.com/usearch/manual/otu_qc.html

/workspace/scratch/obrett/Temporal/OTUs/QC

## Suppose I should download and get the most recent UNITE database, too. 

/workspace/scratch/obrett/UNITE/uchime_reference_dataset_28.06.2017/ITS1_ITS2_datasets

usearch10.0.274M_i86linux64 -makeudb_sintax uchime_reference_dataset_ITS1_28.06.2017.fasta -output ~/UNITE2/ITS1_28_06_2017.udb
	
00:00 55Mb    100.0% Reading uchime_reference_dataset_ITS1_28.06.2017.fasta
00:00 22Mb    100.0% Converting to upper case                              
00:00 23Mb    100.0% Word stats              
00:00 23Mb    100.0% Alloc rows
00:01 48Mb    100.0% Build index
00:01 50Mb    100.0% Initialize taxonomy data
00:01 50Mb    100.0% Building name table     
00:01 50Mb   1 names, tax levels min 0, avg 0.0, max 0


usearch10.0.274M_i86linux64 -makeudb_sintax uchime_reference_dataset_ITS1_28.06.2017.fasta -output /home/obrett/UNITE2/ITS1_28_06_2017.udb

---Fatal error---
../utaxdata.cpp(949) assert failed: Size > 0 && Ids != 0

usearch10.0.274M_i86linux64 -makeudb_sintax uchime_reference_dataset_28.06.2017.fasta -output ~/UNITE2/ITS_2017_06_28.udb

00:00 65Mb    100.0% Reading uchime_reference_dataset_28.06.2017.fasta
00:00 32Mb    100.0% Converting to upper case                         
00:01 33Mb    100.0% Word stats              
00:01 33Mb    100.0% Alloc rows
00:02 98Mb    100.0% Build index
00:02 100Mb   100.0% Initialize taxonomy data
00:02 100Mb   100.0% Building name table     
00:02 100Mb  1 names, tax levels min 0, avg 0.0, max 0


usearch10.0.274M_i86linux64 -makeudb_sintax uchime_reference_dataset_28.06.2017.fasta -output /home/obrett/UNITE2/ITS_2017_06_28.udb

---Fatal error---
../utaxdata.cpp(949) assert failed: Size > 0 && Ids != 0

# Need to figure out the error and get a new DB formatted and in the correct directory

# 2020-02-13

# Downloaded a different copy of an ITS dataset. Hopefully this is the right one.

/workspace/scratch/obrett/UNITE

wget https://files.plutof.ut.ee/public/orig/E8/83/E883EB19E3EA7B64C1F652521301239831FAFE0BFF015C9E2B4786DC0976C0FC.gz

gunzip E883EB19E3EA7B64C1F652521301239831FAFE0BFF015C9E2B4786DC0976C0FC.gz

mv E883EB19E3EA7B64C1F652521301239831FAFE0BFF015C9E2B4786DC0976C0FC utax_reference_dataset_04.02.2020.fasta

usearch10.0.274M_i86linux64 -makeudb_sintax utax_reference_dataset_04.02.2020.fasta -output ~/UNITE2/unite_2020_02_04.udb

---Fatal error---
Missing x: in name >MF167586|SH1151072.08FU;tax=d:Fungi,p:Ascomycota,c:Sordariomycetes,o:Xylariales,f:,g:Delonicicola,s:Delonicicola_siamense_SH1151072.08FU;

## Not sure if this is the only header in the entire file that is missing info, but the family ("f:") field is empty in this one. 
	## This is the perfect time to use a python script to read the file, find the line that is messed up and replace it with the appropriate line

udb_cleanup.py

# Had to run the above twice to get the taxonomy file cleaned up, but did get this warning:

WARNING: 7 taxonomy nodes have >1 parent

# Will have to wait and see if this is a problem downstream. Still, the taxonomy file is here:

/home/obrett/UNITE2/unite_2020_02_04.udb

## Back to the QC stuff...

	# Making a subsample of the most and least abundant otus

head -21 merged_otus.fasta > QC/alignment_subsample.fasta

tail -19 merged_otus.fasta > QC/alignment_subsample_V2.fasta

cat alignment_subsample.fasta alignment_subsample_V2.fasta > alignment_subsample_V3.fasta

usearch10.0.274M_i86linux64 -usearch_global alignment_subsample_V3.fasta -db ~/UNITE2/unite_2020_02_04.udb -id 0.9 -strand both -alnout out.aln -uc otu.uc

## I think the alignments look really good. Any differences are indels in the middle of the reads, not towards the ends. 
	# Safe to proceed
	
cut -f1 ../merged_otutab.txt | grep -v "^#" > table_labels.txt

grep "^>" ../merged_otus.fasta | sed "-es/>//" > seq_labels.txt

sort seq_labels.txt table_labels.txt table_labels.txt | uniq -u > missing_labels.txt

usearch10.0.274M_i86linux64 -fastx_getseqs ../merged_otus.fasta -labels missing_labels.txt -fastaout missing.fasta

00:00 3.6Mb  Reading missing_labels.txt...done.
00:00 37Mb    100.0% Searching, 0 found

## Safe to proceed

## Looking at the orientation of the reads: 

usearch10.0.274M_i86linux64 -orient ../merged_otus.fasta -db ~/UNITE2/unite_2020_02_04.udb -tabbedout orient.txt

cut -f2 orient.txt | sort | uniq -c
     16 -
    203 ?
   1082 +

# Wonder why 16 and 203 are minus and undetermined...

# Okay, need to orient them before the command below:

usearch10.0.274M_i86linux64 -fastx_uniques Merged_reads/temporal_merged_filtered.fasta -fastaout OTUs/merged_uniques.fasta -sizeout -relabel Uniq

# Will do the following, then feed it back into the pipeline:

usearch10.0.274M_i86linux64 -orient ../../Merged_reads/temporal_merged_filtered.fasta -db ~/UNITE2/unite_2020_02_04.udb -fastaout ../../Merged_reads/merged_oriented.fasta -tabbedout orient_V2.txt

01:53 1.0Gb   100.0% 14687508 plus (95.9%), 12353 minus (0.1%), 617253 undet. (4.0%)
	# I wonder what this does with the undetermined reads???

### Pipeline after this step:

usearch10.0.274M_i86linux64 -fastx_uniques Merged_reads/temporal_merged_filtered.fasta -fastaout OTUs/merged_uniques.fasta -sizeout -relabel Uniq
usearch10.0.274M_i86linux64 -cluster_otus merged_uniques.fasta -otus merged_otus.fasta -relabel Otu
usearch10.0.274M_i86linux64 -otutab ../Merged_reads/temporal_merged.fastq -otus merged_otus.fasta -otutabout merged_otutab.txt -mapout merged_map.txt

###

# Will continue with the QC stuff to see if there are other corrections needed before doing the fastx_uniques command again

# Examining offset sequences:

usearch10.0.274M_i86linux64 -cluster_fast ../merged_otus.fasta -id 0.97 -strand both -alnout otus2.aln -show_termgaps -userout user.txt -userfields query+target+qstrand+qlo+qhi+tlo+thi

      Seqs  1301
  Clusters  1183
  Max size  9
  Avg size  1.1
  Min size  1
Singletons  1149, 88.3% of seqs, 97.1% of clusters
   Max mem  707Mb
      Time  1.00s
Throughput  1301.0 seqs/sec.

# Well, all of the qlo values are 1. Not sure what this means...

# Looking further into it to compare unique sequences to the OTUs:

usearch10.0.274M_i86linux64 -usearch_global ../merged_uniques.fasta -db ../merged_otus.fasta -strand both -id 1.0 -maxaccepts 4 -maxrejects 64 -userout uniques_vs_otus.txt -userfields query+target

02:05 735Mb   100.0% Searching merged_uniques.fasta, 44.2% matched

cut -f1 uniques_vs_otus.txt | sort | uniq -d > uniq_test_f1.txt
cut -f2 uniques_vs_otus.txt | sort | uniq -d > uniq_test_f2.txt 

# Examine the output files above tomorrow in reference to the following webpage: https://drive5.com/usearch/manual/otu_qc_offset.html

## 2020-02-14

# Confused because the user.txt output file only shows qlo values of one throughout which means that I don't have 
	# staggering or strand sense issues, but then the uniq_test_f1.txt shows that many of the uniques have a size of 
	# 2. This is conflicting info. Still, think I should go through the pipeline again with the oriented reads for both
	# the merged and R1, unoise and cluster steps...

# Instead of dealing with cross talk informatically (i.e. barcode switching), I'll just subtract reads found in the 
	# control samples from the rest of the samples, but will need to do it from each pair of months corresponding 
	# to the sequencing run. https://drive5.com/usearch/manual/crosstalk.html

# Will search for low complexity reads on the final OTUs...

https://drive5.com/usearch/manual/cmd_filter_lowc.html

# Don't have this command in V10, so will skip it...

# Will do a search for phix now:

usearch10.0.274M_i86linux64 -search_phix temporal_merged.fastq -notmatched ../OTUs/QC/phix_filtered.fastq -alnout ../OTUs/QC/phix_hits.txt

00:00 277Mb   100.0% Word stats
00:00 277Mb   100.0% Alloc rows
00:00 277Mb   100.0% Build index
00:00 244Mb  CPU has 28 cores, defaulting to 10 threads
02:32 936Mb   100.0% Searching, 5.1% matched

# There are actually some reads that are coming up as phix in this step, so I'll have to go back before making the fasta file
	# (fastq_filter, I believe) and then do the orientation step, unless these reads don't make it into the final table
	# How to tell if they are actually in the post-processing steps? 
		# I guess it'll be hard to tell; should go ahead with the removal of the phix reads as planned before the fastq_filter and orient command
		
## Right now, need to do this:
- Incorporate the phix_filtered.fastq file (do it for the R1 reads, too!)
- Orient the reads

## Actually, maybe I need to use the -filter_phix command??? Yep! 

usearch10.0.274M_i86linux64 -filter_phix temporal_merged.fastq -output phix_filtered.fastq -alnout phix_hits.txt

00:00 277Mb   100.0% Word stats
00:00 277Mb   100.0% Alloc rows
00:00 277Mb   100.0% Build index
00:00 244Mb  CPU has 28 cores, defaulting to 10 threads
01:08 933Mb   100.0% Filtering for phix, 838106 hits (5.1%)

cp phix_filtered.fastq ../Redux_processing/

mv phix_filtered.fastq merged_phix_filtered.fastq
	# Stats copied from the -filter_phix run on temporal_merged.fastq above:
	
	00:00 277Mb   100.0% Word stats
	00:00 277Mb   100.0% Alloc rows
	00:00 277Mb   100.0% Build index
	00:00 244Mb  CPU has 28 cores, defaulting to 10 threads
	01:08 933Mb   100.0% Filtering for phix, 838106 hits (5.1%)


## Remember that there wasn't a merged_cutadapt.fastq file because the merging stripped the primers.

usearch10.0.274M_i86linux64 -filter_phix ../Forward_reads/temporal_R1_cutadapt.fastq -output R1_phix_filtered.fastq -alnout ../Forward_reads/phix_hits.txt

00:00 277Mb   100.0% Word stats
00:00 277Mb   100.0% Alloc rows
00:00 277Mb   100.0% Build index
00:00 244Mb  CPU has 28 cores, defaulting to 10 threads
01:33 930Mb   100.0% Filtering for phix, 1035882 hits (5.5%)

usearch10.0.274M_i86linux64 -fastq_filter merged_phix_filtered.fastq -fastq_maxee 1.0 -fastaout merged_filtered.fasta

00:01 38Mb   CPU has 28 cores, defaulting to 10 threads
01:48 683Mb   100.0% Filtering, 95.6% passed
  15486682  Reads (15.5M)                   
    677084  Discarded reads with expected errs > 1.00
  14809598  Filtered reads (14.8M, 95.6%)

usearch10.0.274M_i86linux64 -fastq_filter R1_phix_filtered.fastq -fastq_maxee 1.0 -fastaout R1_filtered.fasta

00:00 38Mb   CPU has 28 cores, defaulting to 10 threads
01:33 683Mb   100.0% Filtering, 85.9% passed
  17743132  Reads (17.7M)                   
   2502269  Discarded reads with expected errs > 1.00
  15240863  Filtered reads (15.2M, 85.9%)
  
usearch10.0.274M_i86linux64 -orient merged_filtered.fasta -db ~/UNITE2/unite_2020_02_04.udb -fastaout merged_oriented.fasta -tabbedout merged_orient.txt

01:41 1.0Gb   100.0% 14678970 plus (99.1%), 985 minus (0.0%), 129643 undet. (0.9%)

usearch10.0.274M_i86linux64 -orient R1_filtered.fasta -db ~/UNITE2/unite_2020_02_04.udb -fastaout R1_oriented.fasta -tabbedout R1_orient.txt

01:41 1.0Gb   100.0% 15129120 plus (99.3%), 218 minus (0.0%), 111525 undet. (0.7%)

usearch10.0.274M_i86linux64 -fastx_uniques merged_oriented.fasta -fastaout merged_uniques.fasta -sizeout -relabel Uniq

00:45 4.4Gb   100.0% Reading merged_oriented.fasta
00:45 4.4Gb  CPU has 28 cores, defaulting to 10 threads
00:53 7.8Gb   100.0% DF
01:01 7.9Gb  14679955 seqs, 306010 uniques, 199184 singletons (65.1%)
01:01 7.9Gb  Min size 1, median 1, max 9957620, avg 47.97
01:04 5.6Gb   100.0% Writing merged_uniques.fasta

usearch10.0.274M_i86linux64 -fastx_uniques R1_oriented.fasta -fastaout R1_uniques.fasta -sizeout -relabel Uniq

00:49 4.6Gb   100.0% Reading R1_oriented.fasta
00:49 4.5Gb  CPU has 28 cores, defaulting to 10 threads
00:56 8.0Gb   100.0% DF
01:04 8.1Gb  15129338 seqs, 683798 uniques, 518290 singletons (75.8%)
01:04 8.1Gb  Min size 1, median 1, max 9174463, avg 22.13
01:08 5.7Gb   100.0% Writing R1_uniques.fasta

usearch10.0.274M_i86linux64 -cluster_otus merged_uniques.fasta -otus merged_otus.fasta -relabel Otu

00:13 60Mb    100.0% 1102 OTUs, 65 chimeras

usearch10.0.274M_i86linux64 -cluster_otus R1_uniques.fasta -otus R1_otus.fasta -relabel Otu

00:19 60Mb    100.0% 1217 OTUs, 104 chimeras

usearch10.0.274M_i86linux64 -unoise3 merged_uniques.fasta -zotus merged_zotus.fasta

00:01 130Mb   100.0% Reading merged_uniques.fasta
00:01 120Mb   100.0% 994 amplicons, 2592535 bad (size >= 8)
00:15 131Mb   100.0% 987 good, 7 chimeras                  
00:15 131Mb   100.0% Writing zotus

usearch10.0.274M_i86linux64 -unoise3 R1_uniques.fasta -zotus R1_zotus.fasta

00:02 229Mb   100.0% Reading R1_uniques.fasta
00:03 223Mb   100.0% 987 amplicons, 3535377 bad (size >= 8) 
00:15 234Mb   100.0% 980 good, 7 chimeras                  
00:15 234Mb   100.0% Writing zotus

usearch10.0.274M_i86linux64 -otutab merged_phix_filtered.fastq -otus merged_otus.fasta -otutabout merged_otutab.txt -mapout merged_map.txt

42:25 736Mb   100.0% Searching, 99.7% matched
15444917 / 15486682 mapped to OTUs (99.7%)

usearch10.0.274M_i86linux64 -otutab R1_phix_filtered.fastq -otus R1_otus.fasta -otutabout R1_otutab.txt -mapout R1_map.txt

43:18 736Mb   100.0% Searching, 96.2% matched
17071726 / 17743132 mapped to OTUs (96.2%)

usearch10.0.274M_i86linux64 -otutab merged_phix_filtered.fastq -otus merged_zotus.fasta -otutabout merged_zotutab.txt -mapout merged_zmap.txt

43:58 726Mb   100.0% Searching, 99.6% matched
15423973 / 15486682 mapped to OTUs (99.6%)

usearch10.0.274M_i86linux64 -otutab R1_phix_filtered.fastq -otus R1_zotus.fasta -otutabout R1_zotutab.txt -mapout R1_zmap.txt

49:52 726Mb   100.0% Searching, 95.9% matched
17018397 / 17743132 mapped to OTUs (95.9%)

## 2020-02-19

# Okay, now to complete the taxonomy step with the updated udb
	# Should consider using a cutoff of ~80% for the taxonomic assignments, though many of the fungi are pretty unique
		# Maybe do a non-cutoff and a cutoff version and see how the results compare?
			# This would mean that I'd have 8 different analyses to run through. Maybe do a comparison of the 
			# cutoffs from the merged otus for starters?

/workspace/scratch/obrett/Temporal/Taxonomy

usearch10.0.274M_i86linux64 -sintax merged_otus.fasta -db ~/UNITE2/unite_2020_02_04.udb -tabbedout ../Taxonomy/merged_otus.sintax -strand both

# Note, no real output from the above command, so nothing printed here

# Concerned that applying the 80% cutoff is going to kick out most of the Catenasporaceae OTUs since I see only 2% seq identity for 
	# OTU 2 at genus level
		# Will try it anyways...

usearch10.0.274M_i86linux64 -sintax merged_otus.fasta -db ~/UNITE2/unite_2020_02_04.udb -tabbedout ../Taxonomy/merged_otus_80_percent.sintax -sintax_cutoff 0.8 -strand both

# Okay, it looks like it kept OTU2 despite the low similarity. Maybe it only applies the threshold at higher taxonomic ranks? 
	# Should consider comparing the two, side-by-side in R, but will proceed with the standard assignment for the remaining
	# otus/zotus for now.
	
usearch10.0.274M_i86linux64 -sintax R1_otus.fasta -db ~/UNITE2/unite_2020_02_04.udb -tabbedout ../Taxonomy/R1_otus.sintax -strand both

usearch10.0.274M_i86linux64 -sintax merged_zotus.fasta -db ~/UNITE2/unite_2020_02_04.udb -tabbedout ../Taxonomy/merged_zotus.sintax -strand both

usearch10.0.274M_i86linux64 -sintax R1_zotus.fasta -db ~/UNITE2/unite_2020_02_04.udb -tabbedout ../Taxonomy/R1_zotus.sintax -strand both

# Ready to move on to R? 
	# Will quickly try an octave plot of the merged_otus and the merged_zotus to see how things look. Will then move on to R.

usearch10.0.274M_i86linux64 -calc_distmx ../Redux_processing/merged_otus.fasta -tabbedout mx.txt

usearch10.0.274M_i86linux64 -calc_distmx ../Redux_processing/merged_zotus.fasta -tabbedout mx_zotus.txt

# Never mind, the otutab_octave command isn't in V10

# Okay, will move on to R now...

# Well, that's disappointing: only have the apr_may samples in the otu table. Will check to see if this is the case in the zotu tables

# The reads are in the merged files, but I suspect the difference in the header lines is throwing off the commands. 
	# Recall needing to reformat the header lines for the apr_may samples, but will have to look through the old code to 
	# figure out what I did
		# I did use a script fastq_header_changer3.py to change the apr_may files previously, but maybe I can get around it with
		# the -sample_delim . option for the -otutab command, if all the reads are present up until that point...

fastq_header_changer3.py apr_may.fastq > apr_may_V2.fastq

# A quick check of the headers for comparison:

less apr_may_V2.fastq: @BY-12-1-5.1101138891730
less aug_sep.fastq: @BY-9-4-9.1337858

# Looks good
	# Unfortunately, I will need to re-orient and phix-filter these reads again. Should I just use the fastq_header_changer3.py script on the
	# phix-filtered file? Definitely...if it works. Will try tomorrow. 
	
# 2020-02-20
	
fastq_header_changer3.py merged_phix_filtered.fastq > merged_phix_filtered_V2.fastq

grep -c "^@" merged_phix_filtered.fastq
	15487871

grep -c "^@" merged_phix_filtered_V2.fastq 
	15487871


usearch10.0.274M_i86linux64 -fastq_filter merged_phix_filtered_V2.fastq -fastq_maxee 1.0 -fastaout merged_filtered_V2.fasta

01:44 683Mb   100.0% Filtering, 95.6% passed
  15486682  Reads (15.5M)                   
    677084  Discarded reads with expected errs > 1.00
  14809598  Filtered reads (14.8M, 95.6%)
    
usearch10.0.274M_i86linux64 -orient merged_filtered_V2.fasta -db ~/UNITE2/unite_2020_02_04.udb -fastaout merged_oriented_V2.fasta -tabbedout merge_orient_V2.txt

01:40 1.0Gb   100.0% 14678970 plus (99.1%), 985 minus (0.0%), 129643 undet. (0.9%)

usearch10.0.274M_i86linux64 -fastx_uniques merged_oriented_V2.fasta -fastaout merged_uniques_V2.fasta -sizeout -relabel Uniq

01:02 7.6Gb  14679955 seqs, 306010 uniques, 199184 singletons (65.1%)
01:02 7.6Gb  Min size 1, median 1, max 9957620, avg 47.97
01:04 5.3Gb   100.0% Writing merged_uniques_V2.fasta

usearch10.0.274M_i86linux64 -unoise3 merged_uniques_V2.fasta -zotus merged_zotus_V2.fasta

00:01 130Mb   100.0% Reading merged_uniques_V2.fasta
00:02 120Mb   100.0% 994 amplicons, 2592535 bad (size >= 8)  
00:16 131Mb   100.0% 987 good, 7 chimeras                  
00:16 131Mb   100.0% Writing zotus


usearch10.0.274M_i86linux64 -otutab merged_phix_filtered_V2.fastq -otus merged_zotus_V2.fasta -otutabout merged_zotutab_V3.txt -mapout merged_zmap_V3.txt -sample_delim . 

42:18 726Mb   100.0% Searching, 99.6% matched
15423973 / 15486682 mapped to OTUs (99.6%)
42:18 726Mb  Writing merged_zotutab_V3.txt
42:18 726Mb  Writing merged_zotutab_V3.txt ...done.


/workspace/scratch/obrett/RDP

wget https://drive5.com/sintax/rdp_its_v2.fa.gz

gunzip rdp_its_v2.fa.gz

usearch10.0.274M_i86linux64 -makeudb_sintax rdp_its_v2.fa -output rdp_its_v2.udb

## This output has a taxonomic assignment for almost every rank which is helpful. Should compare between the two further. 

	# Think I'm going to work with the RDP...

usearch10.0.274M_i86linux64 -sintax merged_zotus_V2.fasta -db ../../../RDP/rdp_its_v2.udb -strand both -tabbedout merged_zotus_V2_rdp.sintax


In [None]:
### Processing for within plant

## First need to rename the fastq files (and potentially the headers) to make sure they process neatly downstream

## Will modify the shell script jun_jul_renamer.sh for the within-plant

within_plant_renamer.sh

usearch10.0.274M_i86linux64 -fastq_mergepairs *R1*.fastq -fastqout ../Merged_reads/within_p_merged2.fastq -relabel @ -fastq_maxdiffs 300

Totals:
    675853  Pairs (675.9k)
    583714  Merged (583.7k, 86.37%)
    229030  Alignments with zero diffs (33.89%)
     74970  Too many diffs (> 300) (11.09%)
     17169  No alignment found (2.54%)
         0  Alignment too short (< 16) (0.00%)
    581026  Staggered pairs (85.97%) merged & trimmed
    203.81  Mean alignment length
    219.84  Mean merged length
      0.37  Mean fwd expected errors
      1.73  Mean rev expected errors
      0.17  Mean merged expected errors

usearch10.0.274M_i86linux64 -filter_phix within_p_merged2.fastq -output within_p_phix_f.fastq -alnout phix_hits.txt

00:02 935Mb   100.0% Filtering for phix, 12380 hits (2.1%)

usearch10.0.274M_i86linux64 -fastq_filter within_p_phix_f.fastq -fastq_maxee 1.0 -fastaout within_p_filtered.fasta

00:04 683Mb   100.0% Filtering, 96.6% passed
    571334  Reads (571.3k)                  
     19582  Discarded reads with expected errs > 1.00
    551752  Filtered reads (551.8k, 96.6%)

usearch10.0.274M_i86linux64 -orient within_p_filtered.fasta -db ~/UNITE2/unite_2020_02_04.udb -fastaout within_p_oriented.fasta -tabbedout within_p_oriented.txt

00:04 1.0Gb   100.0% 548048 plus (99.3%), 2 minus (0.0%), 3702 undet. (0.7%)

usearch10.0.274M_i86linux64 -fastx_uniques within_p_oriented.fasta -fastaout within_p_uniques.fasta -sizeout -relabel Uniq

00:02 898Mb  548050 seqs, 20995 uniques, 15276 singletons (72.8%)
00:02 898Mb  Min size 1, median 1, max 180839, avg 26.10
00:03 898Mb   100.0% Writing within_p_uniques.fasta

usearch10.0.274M_i86linux64 -unoise3 within_p_uniques.fasta -zotus within_p_zotus.fasta

00:00 49Mb    100.0% Reading within_p_uniques.fasta
00:00 30Mb    100.0% 620 amplicons, 38994 bad (size >= 8)
00:05 41Mb    100.0% 615 good, 5 chimeras                
00:05 41Mb    100.0% Writing zotus

usearch10.0.274M_i86linux64 -otutab within_p_phix_f.fastq -otus within_p_zotus.fasta -otutabout within_p_zotutab.txt -mapout within_p_zmap.txt

03:32 726Mb   100.0% Searching, 99.1% matched
566207 / 571334 mapped to OTUs (99.1%)
03:32 726Mb  Writing within_p_zotutab.txt
03:32 726Mb  Writing within_p_zotutab.txt ...done.

usearch10.0.274M_i86linux64 -sintax within_p_zotus.fasta -db ../../../RDP/rdp_its_v2.udb -tabbedout within_p_zotus.sintax -strand both

## Done. Will incorporate into R for future analyses

# Just copied over the following files to the following directory for the within-plant analysis:

/Users/brett/Documents/Temporal_turnover/Informatics
within_p_zotus.sintax
within_p_zotutab.txt

# Need to make an updated RDP sintax file for the zotu table:

usearch10.0.274M_i86linux64 -sintax merged_zotus_V2.fasta -db ../../../RDP/rdp_its_v2.udb -strand both -tabbedout merged_zotus_V2_rdp.sintax

# Need the following files for zotu analysis:
- merged_zotus_V2.fasta
- merged_zotus_V2_rdp.sintax
- merged_zotutab_V3.txt
- map_file16.txt

# Need to reformat merged_zotus_V2_rdp.sintax so it matches the formatting of merged_otus_V3.sintax
	# Saved as merged_zotus_V3_rdp.sintax

# Consider removing conf estimates and incertae sedis from tax table

# Need to re-run the within-plant -otutab command from above (line 1629 of my original notes) with the -sample_delim . argument

/workspace/scratch/obrett/Temporal/Within_plant/Merged_reads

usearch10.0.274M_i86linux64 -otutab within_p_phix_f.fastq -otus within_p_zotus.fasta -otutabout within_p_zotutab2.txt -mapout within_p_zmap2.txt -sample_delim .

within_p_zotus.sintax
within_p_zotutab2.txt
within_p_map_file.txt

# Plotted an ordination and taxonomy plot for the within-p
	# See that there are multiples of the same genus/family on the 
	# taxonomy plot. Need to delete these conf. estimates since they make 
	# each genus seem distinct. Will also delete the f: and g: designations
	
# Will update this on both taxonomy files for the big dataset and the within-p

Originals:
- 'within_p_zotus2.sintax.txt'
- 'merged_zotus_V3_rdp.sintax'

Edited:
- 'within_p_zotus3.sintax'
- 'merged_zotus_V4_rdp.sintax'