/
checkwiki.pl
executable file
·5428 lines (4246 loc) · 167 KB
/
checkwiki.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#! /usr/bin/env perl
###########################################################################
##
## FILE: checkwiki.pl
##
## USAGE: ./checkwiki.pl -c checkwiki.cfg --project=<enwiki>
## --load <live, dump, delay> --dumpfile --tt-file
##
## DESCRIPTION: Scan Wikipedia articles for errors.
##
## AUTHOR: Stefan Kühn, Bryan White
## LICENCE: GPLv3
## VERSION: 2016/11/30
##
###########################################################################
use strict;
use warnings;
#use lib '/data/project/checkwiki/perl5/perlbrew/perls/perl-5.24.0/lib/site_perl/5.24.0/';
use lib '/data/project/checkwiki/perl/lib/perl5';
use feature 'unicode_strings';
use DBI;
use Carp;
use DBD::mysql;
use Getopt::Long
qw(GetOptionsFromString :config bundling no_auto_abbrev no_ignore_case);
use POSIX qw(strftime);
use Business::ISBN qw( valid_isbn_checksum );
use MediaWiki::API;
# use MediaWiki::Bot; does not support rvslots API parameter
#THESE TWO ARE LOADED AT RUNTIME BY IF STATEMENT JUST AFTER COMMAND-LINE ARE FOUND
#SPEEDS UP STARTUP
#use MediaWiki::DumpFile::Pages;
#use File::Temp;
binmode( STDOUT, ':encoding(UTF-8)' );
##############################
## Program wide variables
##############################
# File-level state shared by all subs below.
my $Dump_or_Live = q{}; # Scan mode: 'dump', 'live', 'delay', 'list' or 'article'
my $CheckOnlyOne = 0; # Check only one error or all errors
my $ServerName = q{}; # Host name of the wiki; derived from $project in readMetadata()
my $Language = q{}; # Leading lower-case letters of $ServerName, e.g. 'de' or 'en'
my $project = q{}; # Name of the project, e.g. 'dewiki'
my $end_of_dump = q{}; # 'no' while a dump is being scanned; any other value ends the dump loop
my $artcount = 0; # Number of articles processed
my $file_size = 0; # Size of the dump file in bytes; stays 0 for .xml.bz2 dumps
my $mediawiki_api;
# Database configuration
my $DbName;
my $DbServer;
my $DbUsername;
my $DbPassword;
my $dbh;
# MediaWiki::DumpFile variables
my $pages = q{}; # Replaced by a MediaWiki::DumpFile::Pages object in scan_pages()
# Time program starts
my $time_start = time(); # Start time in seconds since the epoch
my $time_end = time(); # End time in seconds since the epoch
my $time_found = time(); # For column "Found" in cw_error
# Template list retrieved from Translation file
my @Template_list;
# Article name for article mode
my $ArticleName;
# Filename that contains a list of article titles for list mode
my $ListFilename;
# Filename that contains the dump file for dump mode
my $DumpFilename;
# Should Template Tiger output be generated?
my $Template_Tiger = 0;
my $TTFile;
my $TTFilename;
my $TTDIRECTORY = '/data/project/templatetiger/public_html/dumps/';
my $TTnumber = 0;
# Total number of errors
my $Number_of_error_description = 0;
##############################
## Wiki-special variables
##############################
#my @Namespace; # Namespace values
# 0 number
# 1 namespace in project language
# 2 namespace in English language
my @Namespace_cat; # All namespaces for categories
my @Namespace_templates; # All namespaces for templates (stored lower-cased)
my @Template_regex; # Per error number: '\{\{name|' alternatives built in readTemplates()
my $IMAGE_REGEX; # Regex used in get_images()
my $Cat_regex = q{}; # Regex used in get_categories()
my $REGEX_095; # Regex used in error_095_user_signature();
my $rtl_text_dir = 0; # Set to 1 if rtl_text_dir metadata record is present
my @Magicword_defaultsort;
my $Error_counter = -1; # Number of errors found in all articles
my @ErrorPriority; # Priority each error has, indexed by error number
my @Error_number_counter = (0) x 150; # Error counter for individual errors
my @FOUNDATION_PROJECTS; # Names and shortcuts of Wikimedia foundation
my @INTER_LIST; # Shortcuts to other language wikis
my @HTML_NAMED_ENTITIES; # HTML names for symbols
my @HTML_NAMED_ENTITIES_011; # HTML names for symbols minus some Greek letters
my @REGEX_003;
my @REGEX_002;
my @REGEX_BR_002; # Regex used in #002
my $REGEX_SHORT_016;
my $REGEX_LONG_016;
my @REGEX_034; # Contains the #034 regexes without the '{{{' pattern
my @REGEX_034_BRACKET; # Copy of @REGEX_034 plus the '{{{' pattern
my @REGEX_061;
my @REGEX_078;
my $CHARACTERS_064;
my @REGEX_085;
my @REGEX_112;
my $REGEX_REFERENCESTUB;
##############################
## Wiki-special variables
##############################
my @ack; # HTML tag names used to build @REGEX_002
# Names and interwiki shortcuts of Wikimedia Foundation projects.
@FOUNDATION_PROJECTS = qw / b c d n m q s v w
meta mw nost wikt wmf voy
commons foundation incubator phabricator
quality species testwiki wikibooks
wikidata wikimedia wikinews wikiquote
wikisource wikispecies wiktionary wikiversity
wikivoyage /;
# Language prefixes of other wikis.
@INTER_LIST = qw / af als an ar az bg bs bn
ca cs cy da de el en eo es et eu fa fi
fr fy gv he hi hr hu hy id is it ja
jv ka kk ko la lb lt ms nds nl nn no pl
pt ro ru sh sk sl sr sv sw ta th tr uk
ur uz vi zh simple nds_nl /;
# See http://turner.faculty.swau.edu/webstuff/htmlsymbols.html
# Entity names are listed without the leading '&' and trailing ';'.
# NOTE(review): 'raquo' appears twice in this list - confirm whether the second
# occurrence was meant to be a different entity.
@HTML_NAMED_ENTITIES = qw / aacute Aacute acute acirc Acirc aelig AElig
agrave Agrave alpha Alpha aring Aring asymp atilde Atilde auml Auml beta Beta
bdquo brvbar bull ccedil Ccedil cent chi Chi clubs copy crarr darr dArr deg
delta Delta diams divide eacute Eacute ecirc Ecirc egrave Egrave
epsilon Epsilon equiv eta Eta eth ETH euml Euml euro fnof frac12 frac14
frac34 frasl gamma Gamma ge harr hArr hearts hellip iacute Iacute icirc Icirc
iexcl igrave Igrave infin int iota Iota iquest iuml Iuml kappa Kappa
lambda Lambda laquo larr lArr ldquo le loz lsaquo lsquo micro middot minus
mu Mu ne not ntilde Ntilde nu Nu oacute Oacute ocirc Ocirc oelig OElig
ograve Ograve oline omega Omega omicron Omicron ordf ordm oslash Oslash
otilde Otilde ouml Ouml para part permil phi Phi pi Pi piv plusmn
pound Prime prime prod psi Psi quot radic raquo rarr rArr rdquo reg rho Rho
raquo rsaquo rsquo sbquo scaron Scaron sect sigma Sigma sigmaf spades
sum sup1 sup2 sup3 szlig tau Tau theta Theta thetasym thorn THORN
tilde times trade uacute Uacute uarr uArr ucirc Ucirc ugrave Ugrave upsih
upsilon Upsilon uuml Uuml xi Xi yacute Yacute yen yuml Yuml zeta Zeta /;
# FOR #011. DO NOT CONVERT GREEK LETTERS THAT LOOK LIKE LATIN LETTERS.
# Alpha (A), Beta (B), Epsilon (E), Zeta (Z), Eta (H), Kappa (K), kappa (k), Mu (M), Nu (N), nu (v), Omicron (O), omicron (o), Rho (P), Tau (T), Upsilon (Y), upsilon (u) and Chi (X).
# NOTE(review): the list below still contains 'Zeta' and 'upsilon' although the
# comment above names them as excluded - confirm which one is intended.
@HTML_NAMED_ENTITIES_011 = qw / aacute Aacute acute acirc Acirc aelig AElig
agrave Agrave alpha aring Aring asymp atilde Atilde auml Auml beta bdquo
brvbar bull ccedil Ccedil cent chi clubs copy crarr darr dArr deg
delta Delta diams divide eacute Eacute ecirc Ecirc egrave Egrave
epsilon equiv eta eth ETH euml Euml euro fnof frac12 frac14
frac34 frasl gamma Gamma ge harr hArr hearts hellip iacute Iacute icirc
Icirc iexcl igrave Igrave infin int iota Iota iquest iuml Iuml
lambda Lambda laquo larr lArr ldquo le loz lsaquo lsquo micro middot minus
mu ne not ntilde Ntilde oacute Oacute ocirc Ocirc oelig OElig
ograve Ograve oline omega Omega ordf ordm oslash Oslash otilde Otilde ouml
Ouml para part permil phi Phi pi Pi piv plusmn pound Prime prime prod
psi Psi quot radic raquo rarr rArr rdquo reg rho raquo rsaquo rsquo sbquo
scaron Scaron sect sigma Sigma sigmaf spades sum sup1 sup2 sup3 szlig
tau theta Theta thetasym thorn THORN tilde times trade uacute Uacute uarr uArr
ucirc Ucirc ugrave Ugrave upsih upsilon uuml Uuml xi Xi yacute Yacute
yen yuml Yuml zeta Zeta /;
# HTML tag names; each one is turned into a pattern in @REGEX_002.
@ack = qw / abbr b big blockquote center cite
del div em font i p s small span strike sub sup td th tr tt u /;
# One pattern per tag name in @ack, matching that tag written with a slash
# before the closing '>' (e.g. <b/> or </span />). The whole tag is captured.
push @REGEX_002, qr/(<\s*\/?\s*$_\s*\/\s*>)/ for @ack;

# Malformed <br> variants (plus </hr>), appended in this fixed order.
push @REGEX_BR_002, (
    qr/<br\s*\/\s*+[^ >]>/,     # <br\/t>
    qr/<br[^\s]\s*\/>/,         # <brt \/>
    qr/<br[^\s\/>]>/,           # <brt>
    qr/<br\s*\/\s*+[^ >]/,      # <br
    qr/<br\s*+[^ >\/]/,         # <br Note: \s*+ is possessive, no backtracking. Fixes ie. <br\n/>
    qr/<br\h*+[^ \v>\/]/,       # <br t> \v is newline
    qr/<[^ w]br[^\/>]*\s*>/,    # <tbr> or < br>
    qr/<\/hr>/,
);
# Short character class: left-to-right mark (U+200E) and U+FEFF only.
$REGEX_SHORT_016 = qr/[\x{200E}\x{FEFF}]/;
# Reference table for the characters in $REGEX_LONG_016 below
# (<...> are links to enwiki pages). Kept as a comment; the active
# definition is the single-line qr// that follows.
#Readonly::Scalar $REGEX_LONG_016 =>
# qr/[ \x{200E} # <Left to right mark>
# \x{200F} # <Right to left mark>
# \x{2004} # <Whitespace character> (Three per em space)
# \x{2005} # <Whitespace character> (Four per em space or mid-space)
# \x{2006} # <Whitespace character> (Six per em space)
# \x{2007} # <Whitespace character> (Figure space)
# \x{2008} # <Whitespace character> (Punctuation space)
# \x{FEFF} # <Specials (Unicode block)>
# \x{007F} # <Delete character>
# \x{200B} # <Zero width space>
# \x{2028} # <Newline> (Line Separator)
# \x{202A} # <Bi directional text> (Left to Right Embedding)
# \x{202B} # <Bi directional text> (Right to Left Embedding)
# \x{202C} # <Bi directional text> (Pop Directional Format)
# \x{202D} # <Bi directional text> (Left to Right Override)
# \x{202E} # <Bi directional text> (Right to Left Override)
# ]/x;
# Long character class: every code point listed in the table above.
$REGEX_LONG_016 =
qr/[\x{200E}\x{FEFF}\x{007F}\x{200B}\x{2028}\x{202A}\x{202C}\x{202D}\x{202E}\x{202B}\x{200F}\x{2004}\x{2005}\x{2006}\x{2007}\x{2008}]/;
# Patterns for #034: parser functions, page-name magic words, subst: and
# behaviour switches. All of them are written in lower case.
push @REGEX_034, (
    qr/#if:/,
    qr/#ifeq:/,
    qr/#switch:/,
    qr/#ifexist:/,
    qr/\{\{fullpagename}}/,
    qr/\{\{sitename}}/,
    qr/\{\{namespace}}/,
    qr/\{\{basepagename}}/,
    qr/\{\{pagename}}/,
    qr/\{\{subpagename}}/,
    qr/\{\{namespacenumber}}/,
    qr/\{\{fullpagenamee}}/,
    qr/\{\{subst:/,
    qr/__noindex__/,
    qr/__index__/,
    qr/__nonewsectionlink__/,
);

# Same patterns followed by the triple-brace '{{{' pattern.
@REGEX_034_BRACKET = ( @REGEX_034, qr/\{\{\{/ );

# Single-quoted on purpose: the \x{...} escapes are stored literally here.
$CHARACTERS_064 = q{"'`\x{2018}\x{AB}\x{BB}\x{201E}\x{201C}\x{201D}().,\x{2013}\x{5BE}\x{2014}}; # unicode of "'`‘«»„“”().,–\x{5BE}—
# Patterns for #085: tag pairs with nothing but whitespace between them.
push @REGEX_085, qr/<noinclude>\s*<\/noinclude>/;

# The closing '>' was missing here, so the pattern also matched any longer
# tag name starting with "</onlyinclude"; every sibling pattern requires it.
push @REGEX_085, qr/<onlyinclude>\s*<\/onlyinclude>/;
push @REGEX_085, qr/<includeonly>\s*<\/includeonly>/;
push @REGEX_085, qr/<center>\s*<\/center>/;
push @REGEX_085, qr/(<gallery[^>]*(?:\/>|>(?:\s| )*<\/gallery>))/;
push @REGEX_085, qr/<ref>\s*<\/ref>/;
push @REGEX_085, qr/<span(?!\s*id=)[^>]*>\s*<\/span>/;

# NOTE(review): the leading '#' is part of this pattern (there is no /x flag),
# so it only matches text that literally contains "#<div". It looks like an
# attempt to disable the older div pattern; it is kept so the positions of the
# entries in @REGEX_085 do not change.
push @REGEX_085, qr/#<div(?!\s*id=)[^>]*>\s*<\/div>/;
push @REGEX_085, qr/<div(?!(\s*id=|\s*style="clear))[^>]*>\s*<\/div>/;
push @REGEX_085, qr/<pre>\s*<\/pre>/;
push @REGEX_085, qr/<code>\s*<\/code>/;
# Patterns for #112: vendor-prefixed CSS properties and editor attributes,
# each preceded by ';' or a space.
push @REGEX_112, (
    qr/[; ]-moz-/,
    qr/[; ]-webkit-/,
    qr/[; ]-ms-/,
    qr/[; ]data-cx-weight/,
    qr/[; ]contenteditable/,
);

# Self-closed <ref ... /> tag with optional attributes.
# The possessive quantifier *+ never backtracks.
$REGEX_REFERENCESTUB = qr/<ref(?:(?:\s+\w+(?:\s*=\s*(?:"[^"]*+"|'[^']*+'|[^'">\s]+))?)+\s*|\s*)\/>/;
###############################
## Variables for one article
###############################
# Most of these are reset by set_variables_for_article() before each article.
my $title = q{}; # Title of current article
my $text = q{}; # Text of current article (working copy, stripped by the get_* subs)
my $lc_text = q{}; # Text of current article in lower case
my $text_original = q{}; # Copy of $text taken at the start of check_article()
my $page_namespace; # Namespace of page
my $page_is_redirect = 'no';
my $page_is_disambiguation = 'no';
my $Category_counter = -1;
my @Category; # 0 pos_start
# 1 pos_end
# 2 category Test
# 3 linkname Linkname
# 4 original [[Category:Test|Linkname]]
my @Interwiki; # 0 pos_start
# 1 pos_end
# 2 interwiki Test
# 3 linkname Linkname
# 4 original [[de:Test|Linkname]]
# 5 language
my $Interwiki_counter = -1;
my @Templates_all; # All templates
my @Template; # Templates with values
# 0 number of template
# 1 templatename
# 2 template_row
# 3 attribute
# 4 value
my $Number_of_template_parts = -1; # Number of all template parts
my @Links_all; # All links
my @Images_all; # All images
my @Ref; # All ref
my @Headlines; # All headlines
my @Lines; # Text separated into lines
###########################################################################
###########################################################################
###########################################################################
## OPEN DATABASE
###########################################################################
# Connect to the MySQL database named by $DbName (on $DbServer when that is
# defined) as $DbUsername / $DbPassword and store the handle in the
# file-level $dbh. The connection is opened with utf8mb4 enabled and
# DBD::mysql auto-reconnect switched on, then "SET NAMES utf8mb4" is issued.
#
# Dies when the connection cannot be established or SET NAMES fails.
# Returns an empty list.
sub open_db {
    my $dsn = 'DBI:mysql:' . $DbName;
    if ( defined $DbServer ) {
        $dsn .= ':host=' . $DbServer;
    }

    $dbh = DBI->connect(
        $dsn,
        $DbUsername,
        $DbPassword,
        {
            mysql_enable_utf8mb4 => 1,
            mysql_auto_reconnect => 1
        }
      )
      or die( 'Could not connect to database: '
          . ( $DBI::errstr // 'unknown error' )
          . "\n" );    # $DBI::errstr is the documented way to read a connect error

    $dbh->do('SET NAMES utf8mb4')
      or die( $dbh->errstr );

    return ();
}
###########################################################################
## CLOSE DATABASE
###########################################################################
# Disconnect the file-level database handle $dbh.
# Does nothing when no handle has been opened, so calling it before
# open_db() does not die with a method call on an undefined value.
# Returns an empty list.
sub close_db {
    if ( defined $dbh ) {
        $dbh->disconnect();
    }
    return ();
}
###########################################################################
## DELETE OLD LIST OF ARTICLES FROM LAST DUMP SCAN IN TABLE cw_dumpscan
###########################################################################
# Remove all cw_dumpscan rows that belong to the current $project.
# Returns an empty list.
sub clearDumpscanTable {
    my $delete_sql = 'DELETE FROM cw_dumpscan WHERE Project = ?;';
    $dbh->prepare($delete_sql)->execute($project);
    return ();
}
###########################################################################
## UPDATE DATE OF LAST DUMP IN DATABASE FOR PROJECT GIVEN
###########################################################################
# Store $date as Last_Dump in cw_overview for the current $project.
#   $date - value written to the Last_Dump column
# Returns an empty list.
sub updateDumpDate {
    my $date = shift;

    my $update_sql = 'UPDATE cw_overview SET Last_Dump = ? WHERE Project = ?;';
    my $update_sth = $dbh->prepare($update_sql);
    $update_sth->execute( $date, $project );

    return ();
}
###########################################################################
##
###########################################################################
# Print one progress line for a dump scan: article count, amount of the dump
# read so far (from $pages->current_byte) and, when $file_size is known
# (greater than 0), the percentage completed.
# Returns an empty list.
sub update_ui {
    my $bytes_read = $pages->current_byte;
    my $processed  = pretty_bytes($bytes_read);

    # No usable file size: print the short form without a percentage.
    unless ( $file_size > 0 ) {
        printf( " %7d articles;%10s processed\n", $artcount, $processed );
        return ();
    }

    my $percent_done = int( $bytes_read / $file_size * 100 );
    printf( " %7d articles;%10s processed;%3d%% completed\n",
        $artcount, $processed, $percent_done );

    return ();
}
###########################################################################
###
###########################################################################
# Format a number with commas as thousands separators.
#   argument - the number to format (an undefined value is treated as '')
# Returns the formatted string, e.g. 1234567 -> '1,234,567'.
#
# Only the integer part is grouped and a leading sign is kept in front of the
# digits, so -123 stays '-123' (the former reverse-and-group approach
# produced '-,123') and 1234.5678 becomes '1,234.5678'.
sub pretty_number {
    my $number = shift // q{};

    # Repeatedly split the last three digits off the leading digit run
    # until fewer than four digits remain in front of the first comma.
    1 while $number =~ s/^([-+]?\d+)(\d{3})/$1,$2/;

    return $number;
}
###########################################################################
###
##########################################################################
# Convert a byte count into a short human-readable string.
#   $bytes - number of bytes
# Returns 'N bytes', 'N KB' (truncated), 'N.NN MB' (right-aligned to 7
# characters) or 'N.NNN GB', using the largest unit whose value is at least 1.
#
# The comparisons are ">= 1" so that exact unit boundaries move up to the next
# unit: 1024 bytes is '1 KB' and 1048576 bytes is '   1.00 MB'.
sub pretty_bytes {
    my ($bytes) = @_;

    my $kilobytes = $bytes / 1024;
    my $megabytes = $kilobytes / 1024;
    my $gigabytes = $megabytes / 1024;

    return sprintf( '%0.3f', $gigabytes ) . ' GB' if $gigabytes >= 1;
    return sprintf( '%7.2f', $megabytes ) . ' MB' if $megabytes >= 1;
    return int($kilobytes) . ' KB'                if $kilobytes >= 1;
    return int($bytes) . ' bytes';
}
###########################################################################
## RESET VARIABLES BEFORE SCANNING A NEW ARTICLE
###########################################################################
# Reset the per-article state before a new article is scanned: title and
# working text become empty, the redirect/disambiguation flags go back to
# 'no', the three counters go back to -1 and all per-article arrays are
# emptied.
# Returns an empty list.
sub set_variables_for_article {

    # Scalars describing the current article
    $title                  = q{};     # title of the current article
    $text                   = q{};     # text of the current article (for work)
    $page_is_redirect       = 'no';
    $page_is_disambiguation = 'no';

    # Counters; -1 means "nothing collected yet"
    $Category_counter         = -1;
    $Interwiki_counter        = -1;
    $Number_of_template_parts = -1;    # Number of all template parts

    # Collected structures
    #   @Category : 0 pos_start, 1 pos_end, 2 category, 3 linkname,
    #               4 original ([[Category:Test|Linkname]])
    #   @Interwiki: 0 pos_start, 1 pos_end, 2 interwiki, 3 linkname,
    #               4 original ([[de:Test|Linkname]]), 5 language
    #   @Template : 0 number of template, 1 templatename, 2 template_row,
    #               3 attribute, 4 value
    @Category      = ();
    @Interwiki     = ();
    @Lines         = ();               # Text separated into lines
    @Headlines     = ();               # Headlines
    @Templates_all = ();               # All templates
    @Template      = ();               # Templates with values
    @Links_all     = ();               # All links
    @Images_all    = ();               # All images
    @Ref           = ();               # All ref

    return ();
}
###########################################################################
## MOVE ARTICLES FROM cw_dumpscan INTO cw_error
###########################################################################
# In dump mode only: replace the current project's rows in cw_error with the
# rows collected in cw_dumpscan (delete first, then copy).
# Does nothing in any other scan mode. Returns an empty list.
sub update_table_cw_error_from_dump {
    return () if $Dump_or_Live ne 'dump';

    my $purge_sth = $dbh->prepare('DELETE FROM cw_error WHERE Project = ?;');
    $purge_sth->execute($project);

    my $copy_sth = $dbh->prepare(
        'INSERT INTO cw_error (SELECT * FROM cw_dumpscan WHERE Project = ?);'
    );
    $copy_sth->execute($project);

    return ();
}
###########################################################################
## DELETE "DONE" ARTICLES FROM DB
###########################################################################
# Delete the current project's cw_error rows that are flagged ok = 1.
# Returns an empty list.
sub delete_done_article_from_db {
    my $delete_sql = 'DELETE FROM cw_error WHERE ok = 1 and project = ?;';
    $dbh->prepare($delete_sql)->execute($project);
    return ();
}
###########################################################################
## DELETE ARTICLE IN DATABASE
###########################################################################
# Delete every cw_error row stored for the current $title in $project.
# Only acts in 'live' or 'delay' mode and only when $title is not empty.
# Returns an empty list.
sub delete_old_errors_in_db {
    my $mode_applies = $Dump_or_Live eq 'live' || $Dump_or_Live eq 'delay';
    return () if !$mode_applies;
    return () if $title eq q{};

    my $delete_sth =
      $dbh->prepare('DELETE FROM cw_error WHERE Title = ? AND Project = ?;');
    $delete_sth->execute( $title, $project );

    return ();
}
###########################################################################
## GET @ErrorPriority
###########################################################################
# Load the priority of every error for the current $project into
# @ErrorPriority (indexed by error number, starting at 1) and set
# $Number_of_error_description.
#
# A priority that is missing (fewer rows than errors) or NULL is stored as 0,
# so later numeric comparisons such as "$ErrorPriority[81] > 0" never see an
# undefined value.
#
# Unless running in 'article' mode, prints the total number of errors and the
# number of errors with a priority above 0. Returns an empty list.
sub getErrors {
    my $error_count = 0;

    my $sth = $dbh->prepare(
        'SELECT COUNT(*) FROM cw_overview_errors WHERE project = ?;');
    $sth->execute($project);
    $Number_of_error_description = $sth->fetchrow();

    # NOTE(review): the count read from the database is immediately replaced
    # by this fixed value - confirm whether the override is still wanted.
    $Number_of_error_description = 112;

    # NOTE(review): rows are fetched without an ORDER BY and assigned to
    # error numbers by position - confirm the table returns them in order.
    $sth =
      $dbh->prepare('SELECT prio FROM cw_overview_errors WHERE project = ?;');
    $sth->execute($project);

    foreach my $i ( 1 .. $Number_of_error_description ) {
        $ErrorPriority[$i] = $sth->fetchrow() // 0;
        if ( $ErrorPriority[$i] > 0 ) {
            $error_count++;
        }
    }

    if ( $Dump_or_Live ne 'article' ) {
        two_column_display( 'Total # of errors possible:',
            $Number_of_error_description );
        two_column_display( 'Number of errors to process:', $error_count );
    }

    return ();
}
###########################################################################
## Read Metadata from API
###########################################################################
# Derive the wiki host name and language from $project and load the
# per-project metadata stored in the cw_meta table.
#
# Sets these file-level variables:
#   $ServerName, $Language   - computed from $project
#   @Magicword_defaultsort, @Namespace_templates (lower-cased),
#   @Namespace_cat, $Cat_regex, $rtl_text_dir - taken from cw_meta rows
#   $IMAGE_REGEX, $REGEX_095 - compiled from the fetched pattern strings
#
# Dies when $project matches none of the supported project name patterns.
# Returns an empty list.
sub readMetadata {
    my $image_regex_temp;
    my $user_regex  = q{};
    my $draft_regex = q{};

    # Rewrite the project name in place into a host name; the first
    # substitution that matches wins.
    $ServerName = $project;
    if (
        !(
               $ServerName =~ s/^metawiki$/meta.wikimedia.org/
            || $ServerName =~ s/^nds_nlwiki$/nds-nl.wikipedia.org/
            || $ServerName =~ s/^([[:lower:]]+)wiki$/$1.wikipedia.org/
            || $ServerName =~ s/^([[:lower:]]+)wikisource$/$1.wikisource.org/
            || $ServerName =~ s/^([[:lower:]]+)wikiversity$/$1.wikiversity.org/
            || $ServerName =~ s/^([[:lower:]]+)wiktionary$/$1.wiktionary.org/
            || $ServerName =~ s/^([[:lower:]]+)wikivoyage$/$1.wikivoyage.org/
            || $ServerName =~ s/^([[:lower:]]+)wikiquote$/$1.wikiquote.org/
        )
      )
    {
        die( 'Could not calculate server name for project '
              . $project
              . "\n" );
    }

    # Language is the run of lower-case letters at the start of the host name.
    ($Language) = $ServerName =~ /^([[:lower:]]*)/;

    my $sth = $dbh->prepare(
        'SELECT Metaparam, Templates FROM cw_meta WHERE Project = ?');
    $sth->execute($project);

    while ( my ( $param, $value ) = $sth->fetchrow_array ) {
        if ( $param eq 'magicword_defaultsort' ) {
            push( @Magicword_defaultsort, $value );
        }
        elsif ( $param eq 'namespace_templates' ) {
            push( @Namespace_templates, lc($value) );
        }
        elsif ( $param eq 'namespace_cat' ) {
            push( @Namespace_cat, $value );
        }
        elsif ( $param eq 'image_regex' ) {
            $image_regex_temp = $value;
        }
        elsif ( $param eq 'cat_regex' ) {
            $Cat_regex = $value;
        }
        elsif ( $param eq 'user_regex' ) {
            $user_regex = $value;
        }
        elsif ( $param eq 'draft_regex' ) {
            $draft_regex = $value;
        }
        elsif ( $param eq 'rtl_text_dir' ) {
            $rtl_text_dir = 1;
        }
    }

    # API goofs on cswiki
    if ( $project eq 'cswiki' ) {
        $user_regex =
'user:|\[\[diskuse s wikipedistou:|\[\[wikipedista:|\[\[redaktor:|\[\[uživatel:|\[\[wikipedistka:|\[\[diskuse s uživatelem:|\[\[diskuse s wikipedistkou:|\[\[diskusia s redaktorom:|\[\[komentár k redaktorovi:|\[\[uživatel diskuse:|\[\[uživatelka diskuse:|\[\[wikipedista diskuse:|\[\[wikipedistka diskuse';
    }

    # A project without an image_regex row would otherwise pass undef to lc()
    # and into the pattern below.
    $image_regex_temp //= q{};
    my $image_lc = lc($image_regex_temp);

    # NOTE(review): '|' has the lowest precedence, so the '^\[\[\s*' anchor
    # applies only to the first alternative and the ':' only to the second -
    # confirm this is intended before grouping the alternatives.
    $IMAGE_REGEX = qr/^\[\[\s*$image_regex_temp|$image_lc:/;
    $REGEX_095   = qr/$user_regex$draft_regex/;

    return ();
}
###########################################################################
## READ TEMPLATES GIVEN IN TRANSLATION FILE
###########################################################################
# Load the template names configured for every error number of the current
# $project from the cw_template table.
#
# For each error number $i (1 .. $Number_of_error_description):
#   $Template_list[$i]  - array of lower-cased template names; holds the
#                         single placeholder '-9999' when none are configured
#   $Template_regex[$i] - the names joined as '\{\{name|' pieces (each piece,
#                         including the last, ends with '|'); empty when none
# Afterwards the compiled pattern lists @REGEX_003, @REGEX_061 and @REGEX_078
# are filled from the names of errors 3, 61 and 78.
# Returns an empty list.
sub readTemplates {
    my $template_sql;

    # The statement is the same for every error number, so prepare it once
    # and only re-execute it inside the loop.
    my $sth = $dbh->prepare(
        'SELECT templates FROM cw_template WHERE error=? AND project=?');

    foreach my $i ( 1 .. $Number_of_error_description ) {
        $Template_list[$i][0] = '-9999';
        $Template_regex[$i]   = q{};

        $sth->execute( $i, $project );
        $sth->bind_col( 1, \$template_sql );

        while ( $sth->fetchrow_arrayref ) {
            next if !defined($template_sql);
            my $name = lc($template_sql);

            # First real name: drop the placeholder and start the regex anew.
            if ( $Template_list[$i][0] eq '-9999' ) {
                shift( @{ $Template_list[$i] } );
                $Template_regex[$i] = q{};
            }

            $Template_regex[$i] .= '\{\{' . $name . q{|};
            push( @{ $Template_list[$i] }, $name );
        }
    }

    # Names are already lower case here, so they are interpolated as stored.
    foreach my $item ( @{ $Template_list[3] } ) {
        push @REGEX_003, qr/\{\{[ ]?$item/;
    }

    foreach my $item ( @{ $Template_list[61] } ) {
        push @REGEX_061, qr/\{\{[ ]?$item\s*\|[^}{]*\}\}[ ]{0,2}[.,?:;!]\s/; # Handling nested templates is too complicated.
    }

    foreach my $item ( @{ $Template_list[78] } ) {
        push @REGEX_078, qr/\{\{$item/;
    }

    return ();
}
###########################################################################
##
###########################################################################
# Entry point of a scan: dispatch on $Dump_or_Live.
#   'delay', 'list', 'article' - hand over to the matching *_scan sub
#   'dump'                     - iterate over the dump file and check every
#                                page in namespace 0 that has a title
#   anything else              - die
# Returns an empty list.
sub scan_pages {
    $end_of_dump = 'no';
    my $page = q{};

    # Modes that fetch their text through the API.
    if ( $Dump_or_Live eq 'delay' )   { delay_scan();   return (); }
    if ( $Dump_or_Live eq 'list' )    { list_scan();    return (); }
    if ( $Dump_or_Live eq 'article' ) { article_scan(); return (); }
    if ( $Dump_or_Live ne 'dump' ) { die("Wrong Load_mode entered \n"); }

    # Dump mode from here on.
    $pages = MediaWiki::DumpFile::Pages->new($DumpFilename);

    # CHECK FILE_SIZE IF ONLY UNCOMPRESSED
    if ( $DumpFilename !~ /(?:.*?)\.xml\.bz2$/ ) {
        $file_size = ( stat($DumpFilename) )[7];
    }

    while ( defined( $page = $pages->next ) && $end_of_dump eq 'no' ) {
        next if ( $page->namespace ne '0' );    #NS=0 IS ARTICLE NAMESPACE

        set_variables_for_article();
        $title = $page->title;
        next if $title eq q{};

        # Progress line every 500 articles.
        $artcount++;
        update_ui() if $artcount % 500 == 0;

        $page_namespace = 0;
        $text           = $page->revision->text;
        check_article();

        # Debugging aids kept from the original:
        #$end_of_dump = 'yes' if ( $artcount > 1000 );
        #$end_of_dump = 'yes' if ( $Error_counter > 40000 )
    }

    return ();
}
###########################################################################
## CHECK ONE ARTICLE VIA A ARTICLE SCAN
###########################################################################
# Check the single article named by $ArticleName: create the API object,
# reset the per-article state, fetch the text through the API and run
# check_article() when a text came back.
# Returns an empty list.
sub article_scan {
    $page_namespace = 0;
    new_api();
    set_variables_for_article();

    # $ArticleName is decoded in place from UTF-8 bytes to characters.
    utf8::decode($ArticleName);

    # NOTE(review): $title is not assigned in this sub (list_scan and
    # delay_scan both set it) - confirm api_get_text() takes care of it.
    $text = api_get_text($ArticleName);
    check_article() if defined($text);

    return ();
}
###########################################################################
## CHECK ARTICLES VIA A LIST SCAN
###########################################################################
# Check every article whose title is listed, one per line, in the UTF-8 file
# $ListFilename. Blank lines are skipped; titles for which the API returns no
# text are ignored.
#
# Dies when $ListFilename is undefined or the file cannot be opened or closed.
# Returns an empty list.
sub list_scan {
    $page_namespace = 0;
    new_api();

    if ( !defined($ListFilename) ) {
        die "The filename of the list was not defined\n";
    }

    open( my $list_of_titles, '<:encoding(UTF-8)', $ListFilename )
      or die 'Could not open file ' . $ListFilename . ": $!\n";
    my @articles = <$list_of_titles>;
    chomp @articles;

    # Report the file name (not the filehandle glob) together with the reason.
    close($list_of_titles)
      or die 'Could not close file ' . $ListFilename . ": $!\n";

    foreach my $row (@articles) {

        # An empty title cannot name an article; delay_scan skips them too.
        next if $row eq q{};

        set_variables_for_article();
        $title = $row;
        $text  = api_get_text($title);
        if ( defined($text) ) {
            check_article();
        }
    }

    return ();
}
###########################################################################
##
###########################################################################
# Process the titles queued in cw_new for the current $project.
#
# Steps:
#   1. queue up to 2500 cw_error entries found more than 31 days ago
#   2. read all queued titles, then empty the queue for this project
#   3. fetch each non-empty title through the API and either check it or,
#      when no text comes back, delete its stored errors
# Returns an empty list.
sub delay_scan {
    my @queued_titles;

    $page_namespace = 0;
    new_api();

    # Recheck 2500 articles that are over 1 month old, no DISTINCT because it changes the sort order
    my $sth = $dbh->prepare('INSERT IGNORE INTO cw_new SELECT Project, Title FROM cw_error WHERE Found < DATE_SUB(NOW(), INTERVAL 31 DAY) AND Project = ? ORDER BY Found LIMIT 2500;');
    $sth->execute($project);

    # Get titles gathered from live_scan.pl
    $sth = $dbh->prepare('SELECT Title FROM cw_new WHERE Project = ?;');
    $sth->execute($project);
    while ( my $row = $sth->fetchrow_arrayref ) {
        push @queued_titles, $row->[0];
    }

    # Remove the articles. live_scan.pl is continuously adding new articles,
    # so the queue has to be emptied before doing anything else.
    $sth = $dbh->prepare('DELETE FROM cw_new WHERE Project = ?;');
    $sth->execute($project);

    foreach my $queued (@queued_titles) {
        set_variables_for_article();
        $title = $queued;
        next if $title eq q{};

        $text = api_get_text($title);

        # Progress line every 500 articles.
        $artcount++;
        printf( " %7d articles done\n", $artcount )
          if $artcount % 500 == 0;

        # Article may have been deleted or an empty title
        if ( !defined($text) ) {
            delete_old_errors_in_db();    # delete errors for deleted article
            next;
        }
        check_article();
    }

    return ();
}
###########################################################################
##
###########################################################################
# Run the complete check pipeline on the article currently held in $title and
# $text. The order of the calls matters: first the old database entries for
# the article are removed, then the get_* subs that strip content from $text
# run, then the subs that build the per-article arrays, and finally
# error_check(). Returns an empty list.
sub check_article {
delete_old_errors_in_db();
# Keep a copy of the text as it was before anything is stripped from it.
$text_original = $text;
#------------------------------------------------------
# Following alters text and must be run first
#------------------------------------------------------
# REMOVES FROM $text ANY CONTENT BETWEEN <!-- --> TAGS.
# CALLS #05
get_comments();
# REMOVES FROM $text ANY CONTENT BETWEEN <nowiki> </nowiki> TAGS.
# CALLS #23
get_nowiki();
# STRIPS TEMPLATES SPECIFIED IN CONFIG FOR #43
get_templates_all( 'strip' );
# REMOVES FROM $text ANY CONTENT BETWEEN <pre> </pre> TAGS.
# CALLS #24
get_pre();
# REMOVES FROM $text ANY CONTENT BETWEEN <code> </code> TAGS.
# CALLS #15
get_code();
# REMOVES FROM $text ANY CONTENT BETWEEN <syntaxhighlight> TAGS.
get_syntaxhighlight();
# REMOVES FROM $text ANY CONTENT BETWEEN <source> </source> TAGS.
# CALLS #014
get_source();
# REMOVES FROM $text ANY CONTENT BETWEEN <math> </math> TAGS.
# Goes after code and syntaxhighlight so it doesn't catch <math.h>
# CALLS #013
get_math();
# REMOVES FROM $text ANY CONTENT BETWEEN <ce> </ce> and <chem> </chem> TAGS.
get_ce();
# REMOVES FROM $text ANY CONTENT BETWEEN <hiero> TAGS.
get_hiero();
# REMOVES FROM $text ANY CONTENT BETWEEN <score> TAGS.
get_score();
# REMOVES FROM $text ANY CONTENT BETWEEN <graph> TAGS.
get_graph();
# REMOVES FROM $text ANY CONTENT BETWEEN <mapframe> TAGS.
get_mapframe();
# Lower-case copy of the stripped text, taken once all removals are done.
$lc_text = lc($text);
#------------------------------------------------------
# Following interacts with other get_* or error #'s
#------------------------------------------------------
# CREATES @Ref - USED IN #81
# Only collected when error #81 has a priority above 0.
if ( $ErrorPriority[81] > 0 ) {
get_ref();
}
# CREATES @Templates_all - USED IN #12, #31
# CALLS #43
get_templates_all( 'report' );
# DOES TEMPLATETIGER
# USES @Templates_all
# CREATES @Template - USED IN #59, #60
get_template();
# CREATES @Links_all & @Images_all - USED IN #65, #66, #67, #68, #74, #76, #82
# CALLS #10
get_links();
# SETS $page_is_redirect
check_for_redirect();
# CREATES @Category - USED IN #17, #18, #21, #22, #37, #53, #91
get_categories();
# CREATES @Interwiki - USED IN #45, #51, #53
get_interwikis();
# CREATES @Lines
# USED IN #02, #09, #26, #32, #34, #38, #39, #40-#42, #54, #75
create_line_array();
# CREATES @Headlines
# USES @Lines
# USED IN #07, #08, #25, #44, #51, #52, #57, #58, #62, #83, #84, #92
get_headlines();
# EXCEPT FOR get_* THAT REMOVES TAGS FROM $text, FOLLOWING DON'T NEED
# TO BE PROCESSED BY ANY get_* ROUTINES: 3-6, 11, 13-16, 19, 20, 23, 24,
# 27, 35, 36, 43, 46-50, 54-56, 59-61, 63-74, 76-80, 82, 84-90
error_check();
return ();
}
###########################################################################
## FIND MISSING COMMENTS TAGS AND REMOVE EVERYTHING BETWEEN THE TAGS
###########################################################################
# Strip HTML comments from $text.
# When the text contains more '<!--' than '-->' markers, the broken tag is
# located with get_broken_tag() and reported as error #005 before the
# properly closed comments are removed.
# Returns an empty list.
sub get_comments {

    # Nothing to do for text without any comment opener.
    return () if $text !~ /<!--/;

    my $opener_count = () = $text =~ /<!--/g;
    my $closer_count = () = $text =~ /-->/g;

    if ( $opener_count > $closer_count ) {
        my $broken_part = get_broken_tag( '<!--', '-->' );
        error_005_Comment_no_correct_end($broken_part);
    }

    # Remove every closed comment; /s lets a comment span several lines.
    $text =~ s/<!--(.*?)-->//sg;

    return ();
}
###########################################################################
## FIND MISSING NOWIKI TAGS AND REMOVE EVERYTHING BETWEEN THE TAGS
###########################################################################
sub get_nowiki {
# Convert to lower case is alot faster then the regex /i option
my $test_text = lc($text);
my $nowiki_begin = () = $test_text =~ /<nowiki>/g;
my $nowiki_end = () = $test_text =~ /<\/nowiki>/g;
if ( $nowiki_begin != $nowiki_end ) {
if ( $nowiki_begin > $nowiki_end ) {
my $snippet = get_broken_tag( '<nowiki>', '</nowiki>' );
error_023_nowiki_no_correct_end($snippet);
}
else {
my $snippet = get_broken_tag_closing( '<nowiki>', '</nowiki>' );
error_023_nowiki_no_correct_end($snippet);