Skip to content
Browse files

name change, MySQL db, and various script changes

  • Loading branch information...
1 parent 677697c commit 60648587d64364cdc7f831d6666ed5e13fe0ee8c barbie committed Dec 18, 2008
Showing with 1,438 additions and 466 deletions.
  1. +17 −5 CHANGES
  2. +6 −4 MANIFEST
  3. +27 −28 META.yml
  4. +23 −19 Makefile.PL
  5. +24 −21 bin/cpanstats
  6. +58 −48 bin/cpanstats-delete
  7. +26 −32 bin/cpanstats-reparse
  8. +41 −28 bin/cpanstats-select
  9. +63 −41 bin/cpanstats-update
  10. +83 −70 bin/cpanstats-verify
  11. +559 −0 lib/CPAN/Testers/Data/Generator.pm
  12. +375 −0 lib/CPAN/Testers/Data/Generator/Article.pm
  13. +3 −4 t/01base.t
  14. +6 −10 t/10functions.t
  15. +12 −15 t/11article.t
  16. +115 −67 t/12generate.t
  17. +0 −74 t/13database.t
View
22 CHANGES
@@ -1,11 +1,23 @@
-Revision history for Perl module CPAN::WWW::Testers::Generator.
+Revision history for Perl module CPAN::Testers::Data::Generator.
-0.31 current
+0.31 18/12/2008
+ - name change to fit below the CPAN::Testers namespace.
- added epoch to date extraction from article.
- added filename reference from CPAN-DistnameInfo.
- - added ability to provide SQL string in bin/cpanstats-select (v0.09).
- - added better command line option parsing to bin/cpanstats-reparse
- (v0.08).
+ - added ability to provide SQL string in bin/cpanstats-select.
+ - added better command line option parsing to bin/cpanstats-reparse.
+ - convert to use CPAN::Testers::Common::DBUtils.
+ - added MySQL cpanstats database to codebase.
+ - new scripts added to examples to handle database creation:
+ examples/cpanstats-createdb (v0.01)
+ examples/cpanstats-convert (v0.01)
+ - update to bin scripts with latest code changes:
+ bin/cpanstats (v0.05)
+ bin/cpanstats-delete (v0.12)
+ bin/cpanstats-reparse (v0.08)
+ bin/cpanstats-select (v0.09)
+ bin/cpanstats-update (v0.07)
+ bin/cpanstats-verify (v0.21)
0.30 01/10/2008
- ensure upload pattern matching is for the end of the string.
View
10 MANIFEST
@@ -9,19 +9,21 @@ bin/cpanstats-reparse
bin/cpanstats-select
bin/cpanstats-update
bin/cpanstats-verify
+examples/cpanstats-createdb
+examples/cpanstats-convert
examples/generate.sh
-lib/CPAN/WWW/Testers/Generator.pm
-lib/CPAN/WWW/Testers/Generator/Article.pm
-lib/CPAN/WWW/Testers/Generator/Database.pm
+examples/settings-example.ini
+lib/CPAN/Testers/Data/Generator.pm
+lib/CPAN/Testers/Data/Generator/Article.pm
t/01base.t
t/10functions.t
t/11article.t
t/12generate.t
-t/13database.t
t/90podtest.t
t/91podcover.t
t/94metatest.t
t/nntp/126015.txt
t/nntp/125106.txt
t/nntp/1804993.txt
t/nntp/1805500.txt
+t/test-config.ini
View
55 META.yml
@@ -1,5 +1,5 @@
--- #YAML:1.0
-name: CPAN-WWW-Testers-Generator
+name: CPAN-Testers-Data-Generator
version: 0.31
abstract: Download and summarize CPAN Testers data
author:
@@ -11,37 +11,36 @@ distribution_type: module
installdirs: site
requires:
- Cwd: 0
- CPAN::DistnameInfo: 0
- Class::Accessor::Fast: 0
- DBD::SQLite: 1.07
- DBI: 0
- Email::Simple: 0
- File::Basename: 0
- File::Path: 0
- Getopt::Long: 0
- MIME::Base64: 0
- MIME::QuotedPrint: 0
- Net::NNTP: 0
- version: 0
+ Cwd: 0
+ Config::IniFiles: 0
+ CPAN::DistnameInfo: 0
+ CPAN::Testers::Common::DBUtils: 0.03
+ Class::Accessor::Fast: 0
+ DBD::mysql: 0
+ DBD::SQLite: 1.07
+ Email::Simple: 0
+ File::Basename: 0
+ File::Path: 0
+ Getopt::Long: 0
+ MIME::Base64: 0
+ MIME::QuotedPrint: 0
+ Net::NNTP: 0
+ version: 0
recommends:
- Test::More: 0.70
- Test::MockObject: 0
- Test::Pod: 1.00
- Test::Pod::Coverage: 0.08
- Test::CPAN::Meta: 0.12
+ Test::More: 0.70
+ Test::MockObject: 0
+ Test::Pod: 1.00
+ Test::Pod::Coverage: 0.08
+ Test::CPAN::Meta: 0.12
build_requires:
- Test::More: 0.01
+ Test::More: 0.01
provides:
- CPAN::WWW::Testers::Generator:
- file: lib/CPAN/WWW/Testers/Generator.pm
+ CPAN::Testers::Data::Generator:
+ file: lib/CPAN/Testers/Data/Generator.pm
version: 0.31
- CPAN::WWW::Testers::Generator::Article:
- file: lib/CPAN/WWW/Testers/Generator/Article.pm
- version: 0.31
- CPAN::WWW::Testers::Generator::Database:
- file: lib/CPAN/WWW/Testers/Generator/Database.pm
+ CPAN::Testers::Data::Generator::Article:
+ file: lib/CPAN/Testers/Data/Generator/Article.pm
version: 0.31
no_index:
@@ -51,7 +50,7 @@ no_index:
resources:
license: http://dev.perl.org/licenses/
- bugtracker: http://rt.cpan.org/Public/Dist/Display.html?Name=-CPAN-WWW-Testers-Generator
+ bugtracker: http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-Testers-Data-Generator
meta-spec:
version: 1.4
View
42 Makefile.PL
@@ -1,30 +1,34 @@
use ExtUtils::MakeMaker;
WriteMakefile(
- 'NAME' => 'CPAN::WWW::Testers::Generator',
- 'VERSION_FROM' => 'lib/CPAN/WWW/Testers/Generator.pm',
+ 'NAME' => 'CPAN::Testers::Data::Generator',
+ 'VERSION_FROM' => 'lib/CPAN/Testers/Data/Generator.pm',
'PREREQ_PM' => {
- 'Cwd' => 0, # only for cpanstats.pl
- 'Getopt::Long' => 0, # only for cpanstats.pl
+ 'Cwd' => 0, # only for cpanstats.pl
+ 'Getopt::Long' => 0, # only for cpanstats.pl
- 'Class::Accessor::Fast' => 0,
- 'CPAN::DistnameInfo' => 0,
- 'DBD::SQLite' => '1.07',
- 'DBI' => 0,
- 'Email::Simple' => 0,
- 'File::Basename' => 0,
- 'File::Path' => 0,
- 'MIME::Base64' => 0,
- 'MIME::QuotedPrint' => 0,
- 'Net::NNTP' => 0,
- 'Time::Local' => 0,
- 'version' => 0,
+ 'Class::Accessor::Fast' => 0,
+ 'Config::IniFiles' => 0,
+ 'CPAN::DistnameInfo' => 0,
+ 'CPAN::Testers::Common::DBUtils' => 0.03,
+ 'Email::Simple' => 0,
+ 'File::Basename' => 0,
+ 'File::Path' => 0,
+ 'MIME::Base64' => 0,
+ 'MIME::QuotedPrint' => 0,
+ 'Net::NNTP' => 0,
+ 'Time::Local' => 0,
+ 'version' => 0,
+
+ 'DBD::mysql' => 0,
+ 'DBD::SQLite' => 1.07,
+
+ 'Test::More' => 0, # only for testing
+ },
- 'Test::More' => 0, # only for testing
- },
'PL_FILES' => {},
'INSTALLDIRS' => 'site',
'EXE_FILES' => [ 'bin/cpanstats' ],
- NO_META => 1,
+ NO_META => 1,
($] >= 5.005 ? ## Add these new keywords supported since 5.005
(ABSTRACT => 'CPAN Testers Database Generator',
View
45 bin/cpanstats
@@ -1,7 +1,7 @@
#!/usr/bin/perl -w
use strict;
-my $VERSION = '0.04';
+my $VERSION = '0.05';
#----------------------------------------------------------------------------
# Library Modules
@@ -12,7 +12,7 @@ use Cwd;
use Getopt::ArgvFile default=>1;
use Getopt::Long;
-use CPAN::WWW::Testers::Generator '0.26';
+use CPAN::Testers::Data::Generator '0.31';
#----------------------------------------------------------------------------
# Variables
@@ -24,31 +24,31 @@ my %options;
init_options();
-my $directory = $options{directory} || cwd();
-my $t = CPAN::WWW::Testers::Generator->new(
+my $t = CPAN::Testers::Data::Generator->new(
ignore => $options{ignore},
nostore => $options{nostore},
- directory => $directory,
- logfile => $directory . '/logs/cpanstats.log'
+ config => $options{config},
);
-if($options{rebuild}) { $t->rebuild($options{start},$options{end}); }
-else { $t->generate; }
+if($options{rebuild}) { $t->rebuild($options{start},$options{end}); }
+else { $t->generate; }
# -------------------------------------
# Subroutines
sub init_options {
GetOptions( \%options,
- 'directory|d=s',
+ 'config|c=s',
'ignore|i',
- 'nostore|n',
- 'rebuild|r',
- 'start=i',
- 'end=i',
+ 'nostore|n',
+
+ 'rebuild|r',
+ 'start=i',
+ 'end=i',
+
'help|h',
- 'version|V'
+ 'version|v'
);
_help(1) if($options{help});
@@ -61,16 +61,16 @@ sub _help {
if($full) {
print <<HERE;
-Usage: $0 [-d directory] [-r [--start=n] [--end=n]] [-i] [-n] [-h] [-V]
+Usage: $0 [-c file] [-r [--start=n] [--end=n]] [-i] [-n] [-h] [-v]
- -d directory use named directory
+ -c file configuration file
-r rebuild cpanstats from articles
--start start id to rebuild
--end end id to rebuild
-i ignore no article errors
-n do not store old articles
-h this help screen
- -V program version
+ -v program version
HERE
@@ -89,18 +89,18 @@ cpanstats - script to access the NNTP server and update the database.
=head1 SYNOPSIS
# use defaults
- cpanstats
+ cpanstats -c=data/settings.ini
# rebuild default database
- cpanstats --rebuild --start=100 --end=200
+ cpanstats -c=data/settings.ini --rebuild --start=100 --end=200
# update database in named directory, without storing articles,
# and ignoring any missing article responses from the NNTP server
- cpanstats -d=../db/ --nostore --ignore
+ cpanstats -c=data/settings.ini --nostore --ignore
=head1 DESCRIPTION
-Acts as a wrapper script to the underlying CPAN::WWW::Testers::Generator code
+Acts as a wrapper script to the underlying CPAN::Testers::Data::Generator code
that downloads articles from the NNTP server, extracts all components of each,
then stores the data into a local SQLite database.
@@ -114,6 +114,9 @@ greatly if you are able to pinpoint problems or even supply a patch.
Fixes are dependant upon their severity and my availablity. Should a fix not
be forthcoming, please feel free to (politely) remind me.
+RT Queue -
+http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-Testers-Data-Generator
+
=head1 SEE ALSO
L<CPAN::WWW::Testers>,
View
106 bin/cpanstats-delete
@@ -2,7 +2,7 @@
use strict;
$|++;
-my $VERSION = '0.11';
+my $VERSION = '0.12';
#----------------------------------------------------------------------------
@@ -12,7 +12,7 @@ cpanstats-delete - script to delete entries from the cpanstats database.
=head1 SYNOPSIS
- perl cpanstats-delete [-a=0] [-d=0] [--database=$db] [--file=$file]
+ perl cpanstats-delete --config=<file> [-a=0] [-d=0] [--file=<file>]
=head1 DESCRIPTION
@@ -24,6 +24,10 @@ creates each HTML page of the site.
=over 4
+=item --config
+
+Configuration file contain database access details.
+
=item -a | --all
Delete all the NNTP ids greater than the specified number.
@@ -32,10 +36,6 @@ Delete all the NNTP ids greater than the specified number.
Delete a single NNTP id.
-=item --database
-
-Specify the exact path to the cpanstats database if not ./cpanstats.db.
-
=item --file
Named file used when deleting a list of NNTP ids.
@@ -47,19 +47,14 @@ Named file used when deleting a list of NNTP ids.
# -------------------------------------
# Library Modules
-use lib qw(./lib ../lib);
-
-use DBI;
+use Config::IniFiles;
+use CPAN::Testers::Common::DBUtils;
use Getopt::Long;
use IO::File;
-use CPAN::WWW::Testers::Generator::Database;
-
# -------------------------------------
# Variables
-use constant DATABASE => 'cpanstats.db';
-
my (%options);
# -------------------------------------
@@ -69,22 +64,23 @@ my (%options);
init_options();
-$options{database} ||= DATABASE;
-
-my $dbi = CPAN::WWW::Testers::Generator::Database->new(database => $options{database}, AutoCommit => 1);
-print STDERR "Cannot connect to database [$options{database}]\n" unless($dbi);
-
-
##### MAIN #####
my @list = get_list();
for my $id (@list) {
- print "Deleting ... $id\n";
- $dbi->do_query("DELETE from cpanstats WHERE id=$id");
+ print "Deleting ... $id\n";
+ $options{CPANSTATS}->do_query("DELETE from cpanstats WHERE id=$id");
+ $options{LITESTATS}->do_query("DELETE from cpanstats WHERE id=$id");
}
-do_query("DELETE from cpanstats WHERE id>$options{all}") if($options{all});
-do_query("DELETE from cpanstats WHERE id=$options{delete}") if($options{delete});
+if($options{all}) {
+ $options{CPANSTATS}->do_query("DELETE from cpanstats WHERE id>$options{all}");
+ $options{LITESTATS}->do_query("DELETE from cpanstats WHERE id>$options{all}");
+}
+if($options{delete}) {
+ $options{CPANSTATS}->do_query("DELETE from cpanstats WHERE id=$options{delete}");
+ $options{LITESTATS}->do_query("DELETE from cpanstats WHERE id=$options{delete}");
+}
# -------------------------------------
# Subroutines
@@ -96,18 +92,18 @@ Returns the list of NNTP ids from the named file.
=cut
sub get_list {
- my @list;
- my $file = $options{file} || return ();
- die "file [$file] not found" unless(-f $file);
-
- my $fh = IO::File->new($file) or die "Cannot open file [$file]: $!";
- while(<$fh>) {
- chomp;
- my ($num) = (m/^(\d+)/);
- push @list, $num if($num);
- }
- $fh->close;
- return @list;
+ my @list;
+ my $file = $options{file} || return ();
+ die "file [$file] not found" unless(-f $file);
+
+ my $fh = IO::File->new($file) or die "Cannot open file [$file]: $!";
+ while(<$fh>) {
+ chomp;
+ my ($num) = (m/^(\d+)/);
+ push @list, $num if($num);
+ }
+ $fh->close;
+ return @list;
}
=item init_options
@@ -118,35 +114,49 @@ Determine command line options and initialise any defaults.
sub init_options {
GetOptions( \%options,
+ 'config|c=s',
'all|a=i',
'delete|d=i',
- 'database=s',
'file=s',
'help|h',
- 'version|V'
+ 'version|v'
);
- _help(1) if($options{help});
- _help(0) if($options{version});
+ help(1) if($options{help});
+ help(0) if($options{version});
+
+ help(1,"Must specify the configuration file") unless($options{config});
+ help(1,"Configuration file [$options{config}] not found") unless(-f $options{config});
+
+ # load configuration
+ my $cfg = Config::IniFiles->new( -file => $options{config} );
+
+ # configure databases
+ for my $db (qw(CPANSTATS LITESTATS)) {
+ die "No configuration for $db database\n" unless($cfg->SectionExists($db));
+ my %opts = map {$_ => $cfg->val($db,$_);} qw(driver database dbfile dbhost dbport dbuser dbpass);
+ $options{$db} = CPAN::Testers::Common::DBUtils->new(%opts);
+ die "Cannot configure $db database\n" unless($options{$db});
+ }
}
-sub _help {
- my $full = shift;
+sub help {
+ my ($full,$mess) = @_;
+
+ print "\n$mess\n\n" if($mess);
if($full) {
print <<HERE;
-Usage: $0 \\
- [--database=<db>] \\
- ( --all=<num> | --delete=<num> | --file=<file> ) \\
- [-h] [-V]
+Usage: $0 --config=<file> \\
+ ( [--all=<num>] [--delete=<num>] [--file=<file>] | -h | -v )
- --database=<db> use named database
+ --config=<file> configuration file
--all=<num> delete all entried greater than given id
--delete=<num> delete given id
--file=<file> delete multiple ids (1 per line)
-h this help screen
- -V program version
+ -v program version
HERE
@@ -170,7 +180,7 @@ Fixes are dependant upon their severity and my availablity. Should a fix not
be forthcoming, please feel free to (politely) remind me.
RT Queue -
-http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-WWW-Testers-Generator
+http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-Testers-Data-Generator
=head1 SEE ALSO
View
58 bin/cpanstats-reparse
@@ -12,16 +12,15 @@ cpanstats-reparse - script to reparse an NNTP article.
=head1 SYNOPSIS
- perl cpanstats-reparse \
- [--directory=<dir>] \
+ perl cpanstats-reparse \
+ --config=<file> \
[--check|c] [--localonly|l] \
( [--id=<nntpid>] | [--file=<filename>] ) \
- [--exclude|x=<fields>] \
- [--help|h]
+ [--exclude|x=<fields>]
=head1 DESCRIPTION
-This script is used to reparse an NNTP article, which may have been
+This program is used to reparse an NNTP article, which may have been
incorrectly parsed by the cpanstats, and should feature in the stats
for the CPAN Testers Statistics database.
@@ -48,7 +47,7 @@ use Getopt::ArgvFile default=>1;
use Getopt::Long;
use IO::File;
-use CPAN::WWW::Testers::Generator;
+use CPAN::Testers::Data::Generator '0.31';
# -------------------------------------
# Variables
@@ -62,10 +61,8 @@ my (%options,@exclude);
init_options();
-my $directory = $options{directory} || cwd();
-my $t = CPAN::WWW::Testers::Generator->new(
- directory => $directory,
- logfile => $directory . '/logs/cpanstats.log'
+my $t = CPAN::Testers::Data::Generator->new(
+ config => $options{config}
);
# GetOptions allows several different ways of passing multiple values, this
@@ -115,49 +112,46 @@ Determine command line options and initialise any defaults.
sub init_options {
GetOptions( \%options,
- 'directory|d=s',
+ 'config=s',
'check|c',
'localonly|l',
'id|i=i',
'file=s',
'exclude|x=s' => \@exclude,
'help|h',
- 'version|V'
+ 'version|v'
);
- _help(1) if($options{help});
- _help(0) if($options{version});
+ help(1) if($options{help});
+ help(0) if($options{version});
- unless($options{directory} && -d $options{directory}) {
- print "Source directory not found [$options{directory}]\n\n";
- _help(1);
- }
+ help(1,"Must specify the configuration file") unless($options{config});
+ help(1,"Configuration file [$options{config}] not found") unless(-f $options{config});
- unless($options{id} || -f $options{file}) {
- print "Must specify an ID or FILE to reparse\n\n";
- _help(1);
- }
+ help(1,"Must specify an ID or FILE to reparse") unless($options{id} || -f $options{file});
}
-sub _help {
- my $full = shift;
+sub help {
+ my ($full,$mess) = @_;
+
+ print "\n$mess\n\n" if($mess);
if($full) {
print <<HERE;
-Usage: $0 \\
- [-d=<directory>] [--check|c] [--localonly|l] \\
- ( --id|i=<num> | --file=<file> ) \\
- [--exclude=<list>] [-h] [-V]
+Usage: $0 --config=<file> \\
+ [--check|c] [--localonly|l] \\
+ ( --id|i=<num> | --file=<file> ) \\
+ [--exclude=<list>] [-h] [-v]
- -d=<directory> use named directory
+ --config=<file> configuration file
-c check only do not update
- -l local only lookup
+ -l local only lookup
-i=<num> named id to reparse
--file=<file> file containing ids to reparse
--exclude=<list> exclude fields from parsing
-h this help screen
- -V program version
+ -v program version
HERE
@@ -181,7 +175,7 @@ Fixes are dependant upon their severity and my availablity. Should a fix not
be forthcoming, please feel free to (politely) remind me.
RT Queue -
-http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-WWW-Testers-Generator
+http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-Testers-Data-Generator
=head1 SEE ALSO
View
69 bin/cpanstats-select
@@ -13,7 +13,11 @@ cpanstats-select - select stats from the CPAN Testers Statistics database.
=head1 SYNOPSIS
perl cpanstats-select \
- [--database=<db>] \
+ --config=<file> \
+ --database=<dbcode> \
+ \
+ [--sql=<sql>] \
+ \
[--nntp|-n=<nntpid>] \
[--grade|-g=<grade>] \
[--disto|-m=<distname>] \
@@ -22,8 +26,7 @@ cpanstats-select - select stats from the CPAN Testers Statistics database.
[--date|-y=<YYYYMM>] \
[--tester|-t=<email>] \
[--platform|-o=<platform>] \
- [--perl|-p=<perlversion>] \
- [--help|-h]
+ [--perl|-p=<perlversion>]
=head1 DESCRIPTION
@@ -34,9 +37,13 @@ all the required data.
=over 4
+=item --config
+
+Configuration file contain database access details.
+
=item --database
-Specify the exact path to the cpanstats database if not ./cpanstats.db.
+Specify the database to use, CPANSTATS or LITESTATS
=back
@@ -45,20 +52,15 @@ Specify the exact path to the cpanstats database if not ./cpanstats.db.
# -------------------------------------
# Library Modules
-use lib qw(./lib ../lib);
-
-use DBI;
+use Config::IniFiles;
+use CPAN::Testers::Common::DBUtils;
use Getopt::ArgvFile default=>1;
use Getopt::Long;
-use CPAN::WWW::Testers::Generator::Database;
-
# -------------------------------------
# Variables
-use constant DATABASE => 'cpanstats.db';
-
-my (%options);
+my (%options,$dbi);
# -------------------------------------
# Program
@@ -67,18 +69,12 @@ my (%options);
init_options();
-$options{database} ||= DATABASE;
-
-my $dbi = CPAN::WWW::Testers::Generator::Database->new(database => $options{database}, AutoCommit => 1);
-print STDERR "Cannot connect to database [$options{database}]\n" unless($dbi);
-
-
##### MAIN #####
# '(id,state,postdate,tester,dist,version,platform,perl) '.
if($options{sql}) {
- my @rows = $dbi->get_query($options{sql});
+ my @rows = $dbi->get_query('array',$options{sql});
if(@rows) {
for my $row (@rows) {
print join(",",@$row) . "\n";
@@ -107,7 +103,7 @@ if($options{sql}) {
push @where, "perl=''" if(defined $options{perl} && $options{perl} eq '-');
if(@where) {
- my @rows = $dbi->get_query($sql . join(' AND ',@where));
+ my @rows = $dbi->get_query('array',$sql . join(' AND ',@where));
if(@rows) {
for my $row (@rows) {
print join(",",@$row) . "\n";
@@ -125,6 +121,7 @@ if($options{sql}) {
sub init_options {
GetOptions( \%options,
+ 'config=s',
'database=s',
'sql=s',
'nntp|n=s',
@@ -138,23 +135,40 @@ sub init_options {
'version|v=s',
'date|y=s',
'help|h',
- 'version|V'
);
- _help(1) if($options{help});
- _help(0) if($options{Version});
+ help(1) if($options{help});
$options{grade} = lc $options{grade} if($options{grade});
+
+ help(1,"Must specify the configuration file") unless($options{config});
+ help(1,"Configuration file [$options{config}] not found") unless(-f $options{config});
+
+ help(1,"Must specify the database code (CPANSTATS or LITESTATS)")
+ unless($options{database} && $options{database} =~ /^CPANSTATS|LITESTATS$/);
+
+ # load configuration
+ my $cfg = Config::IniFiles->new( -file => $options{config} );
+
+ # configure databases
+ my $db = $options{database};
+ die "No configuration for $db database\n" unless($cfg->SectionExists($db));
+ my %opts = map {$_ => $cfg->val($db,$_);} qw(driver database dbfile dbhost dbport dbuser dbpass);
+ $dbi = CPAN::Testers::Common::DBUtils->new(%opts);
+ die "Cannot configure $db database\n" unless($dbi);
}
-sub _help {
- my $full = shift;
+sub help {
+ my ($full,$mess) = @_;
+
+ print "\n$mess\n\n" if($mess);
if($full) {
print <<HERE;
Usage: $0
- [--database=<db>] - path to cpanstats database
+ --config=<file> - configuration file
+ --database=<dbcode> - CPANSTATS or LITESTATS
[--nntp|-n=<nntpid>] - NNTP article id
[--grade|-g=<grade>] - report grade
@@ -167,7 +181,6 @@ Usage: $0
[--perl|-p=<perlversion>] - perl version
[--help|-h] - this screen
- [--Version|-V] - program version
Notes:
- combine options (except help) to refine your search
@@ -198,7 +211,7 @@ Fixes are dependant upon their severity and my availablity. Should a fix not
be forthcoming, please feel free to (politely) remind me.
RT Queue -
-http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-WWW-Testers-Generator
+http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-Testers-Data-Generator
=head1 SEE ALSO
View
104 bin/cpanstats-update
@@ -2,7 +2,7 @@
use strict;
$|++;
-my $VERSION = '0.06';
+my $VERSION = '0.07';
#----------------------------------------------------------------------------
@@ -12,26 +12,33 @@ cpanstats-update - script to update entries in the cpanstats database.
=head1 SYNOPSIS
- perl cpanstats-update [-i=0] [--database=$db] [--file=$file]
+ perl cpanstats-update \
+ --config=<file> \
+ --database=<dbcode> \
+ [-i=0] [--file=<file>]
=head1 DESCRIPTION
-Using the cpanstats database, which should in the local directory, will search
-for records matching the given id on the command line, or the list provided in
-a named file. If the named file is correctly formatted, the nominated columns
-will also be updated.
+Using a nominated cpanstats database, this program will search for records
+matching the given id on the command line, or the list provided in a named file.
+If the named file is correctly formatted, the nominated columns will also be
+updated.
=head1 OPTIONS
=over 4
-=item -i | --id
+=item --config
-Display the record matching the given NNTP id.
+Configuration file contain database access details.
=item --database
-Specify the exact path to the cpanstats database if not ./cpanstats.db.
+Specify the database to use, CPANSTATS or LITESTATS
+
+=item -i | --id
+
+Display the record matching the given NNTP id.
=item --file
@@ -59,19 +66,14 @@ Note that the second entry will update multiple columns
# -------------------------------------
# Library Modules
-use lib qw(./lib ../lib);
-
-use DBI;
+use Config::IniFiles;
+use CPAN::Testers::Common::DBUtils;
use Getopt::Long;
use IO::File;
-use CPAN::WWW::Testers::Generator::Database;
-
# -------------------------------------
# Variables
-use constant DATABASE => 'cpanstats.db';
-
my (%options,@rows);
# -------------------------------------
@@ -81,28 +83,26 @@ my (%options,@rows);
init_options();
-$options{database} ||= DATABASE;
-
-my $dbi = CPAN::WWW::Testers::Generator::Database->new(database => $options{database}, AutoCommit => 1);
-print STDERR "Cannot connect to database [$options{database}]\n" unless($dbi);
-
-
##### MAIN #####
print "#id,state,postdate,tester,dist,version,platform,perl\n";
my @list = get_list();
push @list, {id=>$options{id}} if($options{id});
for my $item (@list) {
+ @rows = $options{DB}->get_query('array',"SELECT * FROM cpanstats WHERE id=$item->{id}");
+ unless(@rows) {
+ $options{DB}->do_query("INSERT INTO cpanstats (id) VALUES ($item->{id})");
+ }
if($item->{set}) {
- $dbi->do_query("UPDATE cpanstats SET $item->{set} WHERE id=$item->{id}");
+ $options{DB}->do_query("UPDATE cpanstats SET $item->{set} WHERE id=$item->{id}");
}
- @rows = $dbi->get_query("SELECT * FROM cpanstats WHERE id=$item->{id}");
+ @rows = $options{DB}->get_query('array',"SELECT * FROM cpanstats WHERE id=$item->{id}");
print join(",",@$_)."\n" for(@rows);
}
-@rows = $dbi->get_query("SELECT max(id) FROM cpanstats");
+@rows = $options{DB}->get_query('array',"SELECT max(id) FROM cpanstats");
print "\n#MAX ID=$rows[0][0]\n" if(@rows);
# -------------------------------------
@@ -117,13 +117,14 @@ Returns the list of NNTP ids from the named file.
sub get_list {
my @list;
my $file = $options{file} || return ();
- die "file [$file] not found" unless(-f $file);
+ die "file [$file] not found" unless(-f $file);
- my $fh = IO::File->new($file) or die "Cannot open file [$file]: $!";
+ my $fh = IO::File->new($file) or die "Cannot open file [$file]: $!";
while(<$fh>) {
next if(/^\s*$/); # ignore empty lines
chomp;
my ($id,$str) = (m/^(\d+)(?:,(.*))?/);
+ $str =~ s/fulldate/date/ if($options{driver} =~ /sqlite/i);
push @list, {id=>$id,set=>$str} if($id);
}
$fh->close;
@@ -139,32 +140,53 @@ Determine command line options and initialise any defaults.
sub init_options {
GetOptions( \%options,
+ 'config=s',
'database=s',
'id|i=i',
'file=s',
'help|h',
- 'version|V'
+ 'version|v'
);
- _help(1) if($options{help});
- _help(0) if($options{version});
+ help(1) if($options{help});
+ help(0) if($options{version});
+
+ help(1,"Must specify the configuration file") unless($options{config});
+ help(1,"Configuration file [$options{config}] not found") unless(-f $options{config});
+
+ help(1,"Must specify the database code (CPANSTATS or LITESTATS)")
+ unless($options{database} && $options{database} =~ /^CPANSTATS|LITESTATS$/);
+
+ # load configuration
+ my $cfg = Config::IniFiles->new( -file => $options{config} );
+
+ # configure databases
+ my $db = $options{database};
+ die "No configuration for $db database\n" unless($cfg->SectionExists($db));
+ my %opts = map {$_ => $cfg->val($db,$_);} qw(driver database dbfile dbhost dbport dbuser dbpass);
+ $options{DB} = CPAN::Testers::Common::DBUtils->new(%opts);
+ die "Cannot configure $db database\n" unless($options{DB});
+
+ $options{driver} = $opts{driver};
}
-sub _help {
- my $full = shift;
+sub help {
+ my ($full,$mess) = @_;
+
+ print "\n$mess\n\n" if($mess);
if($full) {
print <<HERE;
-Usage: $0 \\
- [--database=<db>] [--id=<num>] [--file=<file>] \\
- [-h] [-V]
+Usage: $0
+ --config=<file> - configuration file
+ --database=<dbcode> - CPANSTATS or LITESTATS
+
+ [--id=<num>] - refresh this id
+ [--file=<file>] - refresh these ids (1 per line)
- --database=<db> use named database
- --id=<num> refresh this id
- --file=<file> refresh these ids (1 per line)
- -h this help screen
- -V program version
+ [--help|-h] - this screen
+ [--Version|-v] - program version
HERE
@@ -188,7 +210,7 @@ Fixes are dependant upon their severity and my availablity. Should a fix not
be forthcoming, please feel free to (politely) remind me.
RT Queue -
-http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-WWW-Testers-Generator
+http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-Testers-Data-Generator
=head1 SEE ALSO
View
153 bin/cpanstats-verify
@@ -2,7 +2,7 @@
use strict;
$|++;
-my $VERSION = '0.20';
+my $VERSION = '0.21';
#----------------------------------------------------------------------------
@@ -12,7 +12,11 @@ cpanstats-verify - script to verify the contents of the cpanstats database.
=head1 SYNOPSIS
- perl cpanstats-verify [-c|-m] [-v|-s --file=$file] [--start=0] [--end=0]
+ perl cpanstats-verify \
+ --config=<file> \
+ --database=<dbcode> \
+ --log=<file> --out=<file> \
+ [-c|-m] [-v|-s --file=$file] [--start=0] [--end=0]
=head1 DESCRIPTION
@@ -25,6 +29,22 @@ values within the database and the script will be assumed.
=over 4
+=item --config
+
+Configuration file contain database access details.
+
+=item --database
+
+Specify the database to use, CPANSTATS or LITESTATS
+
+=item --log
+
+Specify the logfile generated by the 'cpanstats' program.
+
+=item --out
+
+Specify the output for this run.
+
=item -c | --check
Looks up the entries stored in the database between the START and END NNTP ids,
@@ -45,10 +65,6 @@ either prints the entry or highlights that it is missing.
Looks up the NNTP entries directly as listed in the named file.
-=item --database
-
-Specify the exact path to the cpanstats database if not ./cpanstats.db.
-
=item --file
Named file used when verifying a list of NNTP ids.
@@ -57,35 +73,27 @@ Named file used when verifying a list of NNTP ids.
Start and end NNTP ids.
-=item -h | --help
-
-Print the help screen
-
=back
=cut
# -------------------------------------
# Library Modules
-use lib qw(./lib ../lib);
-
-use DBI;
+use Config::IniFiles;
+use CPAN::Testers::Common::DBUtils;
use Email::Simple;
use Net::NNTP;
use Getopt::ArgvFile default=>1;
use Getopt::Long;
use IO::File;
-use CPAN::WWW::Testers::Generator::Database;
-
# -------------------------------------
# Variables
-my (%log,%options);
+my (%log,%options,$dbi);
my $PROGRESS = 0;
-use constant DATABASE => 'cpanstats.db';
use constant NNTPSTART => 872391;
# -------------------------------------
@@ -97,11 +105,6 @@ progress("init");
init_options();
-$options{database} ||= DATABASE;
-
-my $dbi = CPAN::WWW::Testers::Generator::Database->new(database => $options{database}, AutoCommit => 1);
-print STDERR "Cannot connect to database [$options{database}]\n" unless($dbi);
-
$options{start} ||= NNTPSTART;
$options{end} ||= get_lastid();
@@ -138,7 +141,7 @@ Returns the last NNTP id recorded in the database.
=cut
sub get_lastid {
- my @rows = $dbi->get_query("SELECT MAX(id) FROM cpanstats");
+ my @rows = $dbi->get_query('array',"SELECT MAX(id) FROM cpanstats");
return $rows[0]->[0];
}
@@ -155,15 +158,15 @@ sub missing_stats {
$" = ",";
my $count = 0;
- my $iterator = $dbi->get_query_iterator("SELECT * FROM cpanstats WHERE id >= $options{start} AND id <= $options{end}");
+ my $iterator = $dbi->iterator('array',"SELECT * FROM cpanstats WHERE id >= $options{start} AND id <= $options{end}");
while(my $row = $iterator->()) {
next if($row->[1] =~ /^na|pass|fail|unknown|cpan$/);
_log("BADPARSE:") unless($count++);
_log("@$row");
}
$count = 0;
- $iterator = $dbi->get_query_iterator("SELECT * FROM cpanstats WHERE state in ('na','pass','fail','unknown') AND id >= $options{start} AND id <= $options{end}");
+ $iterator = $dbi->iterator('array',"SELECT * FROM cpanstats WHERE state in ('na','pass','fail','unknown') AND id >= $options{start} AND id <= $options{end}");
while(my $row = $iterator->()) {
next if( defined $row->[2] &&
defined $row->[3] &&
@@ -175,7 +178,7 @@ sub missing_stats {
}
$count = 0;
- $iterator = $dbi->get_query_iterator("SELECT * FROM cpanstats WHERE state in ('cpan') AND id >= $options{start} AND id <= $options{end}");
+ $iterator = $dbi->iterator('array',"SELECT * FROM cpanstats WHERE state in ('cpan') AND id >= $options{start} AND id <= $options{end}");
while(my $row = $iterator->()) {
next if( defined $row->[2] &&
defined $row->[4] &&
@@ -196,22 +199,22 @@ sub check_stats {
progress("start - check stats");
my $count = $options{start};
- my $iterator = $dbi->get_query_iterator("SELECT * FROM cpanstats WHERE id >= $options{start} AND id <= $options{end} ORDER BY id");
+ my $iterator = $dbi->iterator('array',"SELECT * FROM cpanstats WHERE id >= $options{start} AND id <= $options{end} ORDER BY id");
while(my $row = $iterator->()) {
#progress("$count - $row->[0]");
# missing entries (ignore human replies)
while($count < $row->[0]) {
my ($subj,$from) = get_subject($count);
_log("$count,$from,missing - $subj") unless($subj =~ /(?:Re:|Fw:|Ab:|mirror update)/i ||
- $subj =~ /\.(?:readme|pl|cgi|pdf|html?|doc|txt|ppd|asc|yml|jpg|png|gif|rtf|css|pod|sig|diff)$/i);
+ $subj =~ /\.(?:readme|pl|cgi|pdf|html?|doc|txt|ppd|asc|yml|jpg|png|gif|rtf|css|pod|sig|diff)$/i);
- # note that in the above two regexes we ignore conversation
- # threads, mirror updates and a whole host of uploads that
- # are nothing to do with a distribution upload. The caveats
- # to this are the entries that are potentially bad uploads
- # (bad archive naming or uploading a .pm file) and any
- # reports that are in error.
+ # note that in the above two regexes we ignore conversation
+ # threads, mirror updates and a whole host of uploads that
+ # are nothing to do with a distribution upload. The caveats
+ # to this are the entries that are potentially bad uploads
+ # (bad archive naming or uploading a .pm file) and any
+ # reports that are in error.
$count++;
}
@@ -250,7 +253,7 @@ sub verify_stats {
$" = ",";
my @list = get_list();
for my $id (@list) {
- my @rows = $dbi->get_query("SELECT * FROM cpanstats WHERE id=$id");
+ my @rows = $dbi->get_query('array',"SELECT * FROM cpanstats WHERE id=$id");
if(@rows) {
my $row = $rows[0];
_log("@$row");
@@ -387,6 +390,7 @@ sub progress {
sub init_options {
GetOptions( \%options,
+ 'config=s',
'database=s',
'localonly|l',
'missing|m',
@@ -402,49 +406,59 @@ sub init_options {
'version|V'
);
- _help(1) if($options{help});
- _help(0) if($options{version});
+ help(1) if($options{help});
+ help(0) if($options{version});
- unless($options{database} && -f $options{database}) {
- print "No database specified\n\n";
- _help(1);
- }
+ help(1,"No cpanstats.log file specified") unless($options{log} && -f $options{log});
+ help(1,"No results output file specified") unless($options{out});
- unless($options{log} && -f $options{log}) {
- print "No cpanstats.log file specified\n\n";
- _help(1);
- }
+ help(1,"Must specify the configuration file") unless($options{config});
+ help(1,"Configuration file [$options{config}] not found") unless(-f $options{config});
- unless($options{out}) {
- print "No results output file specified\n\n";
- _help(1);
- }
+ help(1,"Must specify the database code (CPANSTATS or LITESTATS)")
+ unless($options{database} && $options{database} =~ /^CPANSTATS|LITESTATS$/);
+
+ # load configuration
+ my $cfg = Config::IniFiles->new( -file => $options{config} );
+
+ # configure databases
+ my $db = $options{database};
+ die "No configuration for $db database\n" unless($cfg->SectionExists($db));
+ my %opts = map {$_ => $cfg->val($db,$_);} qw(driver database dbfile dbhost dbport dbuser dbpass);
+ $dbi = CPAN::Testers::Common::DBUtils->new(%opts);
+ die "Cannot configure $db database\n" unless($dbi);
}
-sub _help {
- my $full = shift;
+sub help {
+ my ($full,$mess) = @_;
+
+ print "\n$mess\n\n" if($mess);
if($full) {
print <<HERE;
Usage: $0 \\
- [-database=<db>] [-c] [-m] [-v] [-s] \\
- [--file=<file>] [--start=n] [--end=n] \\
- [--log=<file>] [-out=<file>] [-h] [-V] \\
- [--localonly]
-
- --database=<db> use a named database
- -m check for missing entries
- -c check existing entries for bad parsing
- -v provide a verification report
- -s search stats providing id + subject
- --file file of IDs to reference
- --start start id for -m or -c
- --end end id for -m or -c
- --log log file for shortcut reference for -m and -c
- --out results output file
- -h this help screen
- -V program version
+ --config=<file> --database=<dbcode> \\
+ [-c] [-m] [-v] [-s] \\
+ [--file=<file>] [--start=n] [--end=n] \\
+ [--log=<file>] [-out=<file>] [-h] [-V] \\
+ [--localonly]
+
+ --config=<file> - configuration file
+ --database=<dbcode> - CPANSTATS or LITESTATS
+
+ -m - check for missing entries
+ -c - check existing entries for bad parsing
+ -v - provide a verification report
+ -s - search stats providing id + subject
+ --file - file of IDs to reference
+ --start - start id for -m or -c
+ --end - end id for -m or -c
+ --log - log file for shortcut reference for -m and -c
+ --out - results output file
+
+ -h - this help screen
+ -V - program version
HERE
@@ -468,7 +482,7 @@ Fixes are dependant upon their severity and my availablity. Should a fix not
be forthcoming, please feel free to (politely) remind me.
RT Queue -
-http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-WWW-Testers-Generator
+http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-Testers-Data-Generator
=head1 SEE ALSO
@@ -492,4 +506,3 @@ F<http://wiki.cpantesters.org/>
modify it under the same terms as Perl itself.
=cut
-
View
559 lib/CPAN/Testers/Data/Generator.pm
@@ -0,0 +1,559 @@
+package CPAN::Testers::Data::Generator;
+
+use warnings;
+use strict;
+use vars qw($VERSION);
+
+$VERSION = '0.31';
+
+#----------------------------------------------------------------------------
+# Library Modules
+
+use Config::IniFiles;
+use CPAN::Testers::Common::DBUtils;
+use File::Basename;
+use File::Path;
+use IO::File;
+use Net::NNTP;
+
+use CPAN::Testers::Data::Generator::Article;
+
+#----------------------------------------------------------------------------
+# The Application Programming Interface
+
+sub new {
+ my $class = shift;
+ my %hash = @_;
+
+ my $self = {};
+ bless $self, $class;
+
+ # load configuration
+ my $cfg = Config::IniFiles->new( -file => $hash{config} );
+
+ # configure databases
+ for my $db (qw(CPANSTATS LITESTATS LITEARTS)) {
+ die "No configuration for $db database\n" unless($cfg->SectionExists($db));
+ my %opts = map {$_ => $cfg->val($db,$_);} qw(driver database dbfile dbhost dbport dbuser dbpass);
+ $opts{AutoCommit} = 0;
+ $self->{$db} = CPAN::Testers::Common::DBUtils->new(%opts);
+ die "Cannot configure $db database\n" unless($self->{$db});
+ }
+
+ # command line swtiches override configuration settings
+ for my $key (qw(ignore nostore logfile)) {
+ $self->{$key} = $hash{$key} || $cfg->val('MAIN',$key);
+ }
+
+ return $self;
+}
+
+sub DESTROY {
+ my $self = shift;
+}
+
+#----------------------------------------------------------------------------
+# Public Methods
+
+sub generate {
+ my $self = shift;
+
+ $self->{nntp} ||= $self->nntp_connect();
+
+ my $start = $self->_get_lastid() +1;
+ my $end = $self->{nntp_last};
+ die "Cannot access NNTP server\n" unless($end); # better to bail out than fade away!
+
+ # starting from last retrieved article
+ for(my $id = $start; $id <= $end; $id++) {
+
+ $self->_log("ID [$id]");
+ my $article = join "", @{$self->{nntp}->article($id) || []};
+
+ # no article for that id!
+ unless($article) {
+ $self->_log(" ... no article\n");
+ if($self->{ignore}) {
+ next;
+ } else {
+ die "No article returned [$id]\n";
+ }
+ }
+
+ $self->insert_article($id,$article);
+ $self->parse_article($id,$article);
+ }
+
+ $self->cleanup if($self->{nostore});
+ $self->commit();
+}
+
+
+sub rebuild {
+ my ($self,$start,$end) = @_;
+
+ $start ||= 1;
+ $end ||= $self->_get_lastid();
+
+ $self->{CPANSTATS}->do_query("DELETE FROM cpanstats WHERE id >= $start AND id <= $end");
+ $self->{LITESTATS}->do_query("DELETE FROM cpanstats WHERE id >= $start AND id <= $end");
+
+ my $iterator = $self->{LITEARTS}->iterator('array',"SELECT * FROM articles WHERE id >= $start AND id <= $end ORDER BY id asc");
+ while(my $row = $iterator->()) {
+ my $id = $row->[0];
+ my $article = $row->[1];
+
+ $self->_log("ID [$id]");
+
+ # no article for that id!
+ unless($article) {
+ $self->_log(" ... no article\n");
+ if($self->{ignore}) {
+ next;
+ } else {
+ die "No article returned [$id]\n";
+ }
+ }
+
+ $self->parse_article($id,$article);
+ }
+
+ $self->commit();
+}
+
+sub reparse {
+ my ($self,$options,@ids) = @_;
+ return unless(@ids);
+
+ $self->{nntp} ||= $self->nntp_connect()
+ unless($options && $options->{localonly});
+
+ my $last = $self->_get_lastid();
+
+ for my $id (@ids) {
+ #print STDERR "id=[$id], last=[$last]\n";
+ next if($id < 1 || $id > $last);
+
+ my $article;
+ my @rows = $self->{LITEARTS}->get_query('array','SELECT * FROM articles WHERE id = ?',$id);
+ if(@rows) {
+ $article = $rows[0]->[1];
+ #print STDERR "got article\n";
+
+ } elsif($options && $options->{localonly}) {
+ #print STDERR "no article locally\n";
+ next;
+
+ } else {
+ $article = join "", @{$self->{nntp}->article($id) || []};
+ $self->insert_article($id,$article) if($article);
+ #print STDERR "got NNTP article\n";
+ }
+
+
+ next unless($article);
+ $self->_log("ID [$id]");
+
+ unless($options && $options->{check}) {
+ $self->{CPANSTATS}->do_query('DELETE FROM cpanstats WHERE id = ?',$id);
+ $self->{LITESTATS}->do_query('DELETE FROM cpanstats WHERE id = ?',$id);
+ }
+ $self->parse_article($id,$article,$options);
+ }
+
+ $self->commit();
+}
+
+#----------------------------------------------------------------------------
+# Private Methods
+
+sub cleanup {
+ my $self = shift;
+ my $id = $self->_get_lastid();
+ return unless($id);
+
+ $self->{LITEARTS}->do_query('DELETE FROM articles WHERE id < ?',$id);
+}
+
+sub commit {
+ my $self = shift;
+ for(qw(CPANSTATS LITESTATS LITEARTS)) {
+ next unless($self->{$_});
+ $self->{$_}->do_commit;
+ }
+}
+
+sub nntp_connect {
+ my $self = shift;
+
+ # connect to NNTP server
+ my $nntp = Net::NNTP->new("nntp.perl.org") or die "Cannot connect to nntp.perl.org";
+ ($self->{nntp_num}, $self->{nntp_first}, $self->{nntp_last}) = $nntp->group("perl.cpan.testers");
+
+ return $nntp;
+}
+
+sub parse_article {
+ my ($self,$id,$article,$options) = @_;
+ my $object = CPAN::Testers::Data::Generator::Article->new($article);
+
+ unless($object) {
+ $self->_log(" ... bad parse\n");
+ return;
+ }
+
+ my $subject = $object->subject;
+ my $from = $object->from;
+ $self->_log(" [$from] $subject\n");
+ return if($subject =~ /Re:/i);
+
+ unless($subject =~ /(CPAN|FAIL|PASS|NA|UNKNOWN)\s+/i) {
+ $self->_log(" . [$id] ... bad subject\n");
+ return;
+ }
+
+ my $state = lc $1;
+ my ($post,$date,$dist,$version,$platform,$perl,$osname,$osvers) = ();
+
+ if($state eq 'cpan') {
+ if($object->parse_upload()) {
+ $dist = $object->distribution;
+ $version = $object->version;
+ $from = $object->author;
+ }
+
+ return unless($self->_valid_field($id, 'dist' => $dist) || ($options && $options->{exclude}{dist}));
+ return unless($self->_valid_field($id, 'version' => $version) || ($options && $options->{exclude}{version}));
+ return unless($self->_valid_field($id, 'author' => $from) || ($options && $options->{exclude}{from}));
+
+ } else {
+ if($object->parse_report()) {
+ $dist = $object->distribution;
+ $version = $object->version;
+ $from = $object->from;
+ $perl = $object->perl;
+ $platform = $object->archname;
+ $osname = $object->osname;
+ $osvers = $object->osvers;
+
+ $from =~ s/'/''/g; #'
+ }
+
+ return unless($self->_valid_field($id, 'dist' => $dist) || ($options && $options->{exclude}{dist}));
+ return unless($self->_valid_field($id, 'version' => $version) || ($options && $options->{exclude}{version}));
+ return unless($self->_valid_field($id, 'from' => $from) || ($options && $options->{exclude}{from}));
+ return unless($self->_valid_field($id, 'perl' => $perl) || ($options && $options->{exclude}{perl}));
+ return unless($self->_valid_field($id, 'platform' => $platform) || ($options && $options->{exclude}{platform}));
+ return unless($self->_valid_field($id, 'osname' => $osname) || ($options && $options->{exclude}{osname}));
+ return unless($self->_valid_field($id, 'osvers' => $osvers) || ($options && $options->{exclude}{osname}));
+ }
+
+ $post = $object->postdate;
+ $date = $object->date;
+ $self->insert_stats($id,$state,$post,$from,$dist,$version,$platform,$perl,$osname,$osvers,$date)
+ unless($options && $options->{check});
+}
+
+sub insert_stats {
+ my $self = shift;
+
+ my @fields = @_;
+ $fields[$_] ||= 0 for(0);
+ $fields[$_] ||= '' for(1,2,3,4,5,6,8,9,10);
+ $fields[$_] ||= '0' for(7);
+
+ my $INSERT = 'INSERT INTO cpanstats VALUES (?,?,?,?,?,?,?,?,?,?,?)';
+
+ $self->{CPANSTATS}->do_query($INSERT,@fields);
+ $self->{LITESTATS}->do_query($INSERT,@fields);
+ if((++$self->{stat_count} % 50) == 0) {
+ $self->{CPANSTATS}->do_commit;
+ $self->{LITESTATS}->do_commit;
+ }
+}
+
+sub insert_article {
+ my $self = shift;
+
+ my @fields = @_;
+ $fields[$_] ||= 0 for(0);
+ $fields[$_] ||= '' for(1);
+
+ my $INSERT = 'INSERT INTO articles VALUES (?,?)';
+
+ $self->{LITEARTS}->do_query($INSERT,@fields);
+ if((++$self->{arts_count} % 50) == 0) {
+ $self->{LITEARTS}->do_commit;
+ }
+}
+
+#----------------------------------------------------------------------------
+# Private Functions
+
+sub _valid_field {
+ my ($self,$id,$name,$value) = @_;
+ return 1 if(defined $value);
+ $self->_log(" . [$id] ... missing field: $name\n");
+ return 0;
+}
+
+sub _get_lastid {
+ my $self = shift;
+
+ my @rows = $self->{LITEARTS}->get_query('array',"SELECT max(id) FROM articles");
+ return 0 unless(@rows);
+ return $rows[0]->[0] || 0;
+}
+
+sub _log {
+ my $self = shift;
+ my $log = $self->{logfile} or return;
+ mkpath(dirname($log)) unless(-f $log);
+ my $fh = IO::File->new($log,'a+') or die "Cannot append to log file [$log]: $!\n";
+ print $fh @_;
+ $fh->close;
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+CPAN::Testers::Data::Generator - Download and summarize CPAN Testers data
+
+=head1 SYNOPSIS
+
+ % cpanstats
+ # ... wait patiently, very patiently
+ # ... then use cpanstats.db, an SQLite database
+ # ... or the MySQL database
+
+=head1 DESCRIPTION
+
+This distribution was originally written by Leon Brocard to download and
+summarize CPAN Testers data. However, all of the original code has been
+rewritten to use the CPAN Testers Statistics database generation code. This
+now means that all the CPAN Testers sites including the Reports site, the
+Statistics site and the CPAN Dependencies site, can use the same database.
+
+This module downloads articles from the cpan-testers newsgroup, generating or
+updating an SQLite database containing all the most important information. You
+can then query this database, or use CPAN::WWW::Testers to present it over the
+web.
+
+A good example query for Acme-Colour would be:
+
+ SELECT version, status, count(*) FROM cpanstats WHERE
+ distribution = "Acme-Colour" group by version, state;
+
+To create a database from scratch can take several days, as there are now over
+2 million articles in the newgroup. As such updating from a known copy of the
+database is much more advisable. If you don't want to generate the database
+yourself, you can obtain the latest official copy (compressed with gzip) at
+http://devel.cpantesters.org/cpanstats.db.gz
+
+With over 2 million articles in the archive, if you do plan to run this
+software to generate the databases it is recommended you utilise a high-end
+processor machine. Even with a reasonable processor it can takes days!
+
+=head1 SQLite DATABASE SCHEMA
+
+The cpanstats database schema is very straightforward, one main table with
+several index tables to speed up searches. The main table is as below:
+
+ +--------------------------------+
+ | cpanstats |
+ +----------+---------------------+
+ | id | INTEGER PRIMARY KEY |
+ | state | TEXT |
+ | postdate | TEXT |
+ | tester | TEXT |
+ | dist | TEXT |
+ | version | TEXT |
+ | platform | TEXT |
+ | perl | TEXT |
+ | osname | TEXT |
+ | osvers | TEXT |
+ | date | TEXT |
+ +----------+---------------------+
+
+It should be noted that 'postdate' refers to the YYYYMM formatted date, whereas
+the 'date' field refers to the YYYYMMDDhhmm formatted date and time.
+
+The articles database schema is again very straightforward, and consists of one
+table, as below:
+
+ +--------------------------------+
+ | articles |
+ +----------+---------------------+
+ | id | INTEGER PRIMARY KEY |
+ | article | TEXT |
+ +----------+---------------------+
+
+=head1 v0.31 CHANGES
+
+With the release of v0.31, a number of changes to the codebase were made as
+a further move towards CPAN Testers 2.0. The first change is the name for this
+distribution. Now titled 'CPAN-Testers-Data-Generator', this now fits more
+appropriately within the CPAN-Testers namespace on CPAN.
+
+The second significant change is to now reference a MySQL cpanstats database.
+The SQLite version is still updated as before, as a number of other websites
+and toolsets still rely on that database file format. However, in order to make
+the CPAN Testers Reports website more dynamic, an SQLite database is not really
+appropriate for a high demand website.
+
+The database creation code is now available as a standalone program, in the
+examples directory, and all the database communication is now handled by the
+new distribution CPAN-Testers-Common-DBUtils.
+
+=head1 INTERFACE
+
+=head2 The Constructor
+
+=over
+
+=item * new
+
+Instatiates the object CPAN::Testers::Data::Generator. Accepts a hash containing
+values to prepare the object. These are described as:
+
+ my $obj = CPAN::Testers::Data::Generator->new(
+ logfile => './here/logfile',
+ config => './here/config.ini'
+ );
+
+Where 'logfile' is the location to write log messages. Log messages are only
+written if a logfile entry is specified, and will always append to any existing
+file. The 'config' should contain the path to the configuration file, used
+to define the database access and general operation settings.
+
+In addition the binary keys of 'ignore' and 'nostore' are available. 'ignore'
+is used to ignore NNTP entries which return no article and continue processing
+articles, while 'nostore' will delete all articles, except the last one
+received, thus reducing space in the SQL database.
+
+=back
+
+=head2 Public Methods
+
+=over
+
+=item * generate
+
+Starting from the last recorded article, retrieves all the more recent articles
+from the NNTP server, parsing each and recording the articles that either
+upload announcements or reports.
+
+=item * rebuild
+
+In the event that the cpanstats database needs regenerating, either in part or
+for the whole database, this method allow you to do so. You may supply
+parameters as to the 'start' and 'end' values (inclusive), where all records
+are assumed by default. Note that the 'nostore' option is ignored and no
+records are deleted from the articles database.
+
+=item * reparse
+
+Rather than a complete rebuild the option to selective reparse selected entries
+is useful if there are posts which have since been identified as valid and now
+have supporting parsing code within the codebase.
+
+In addition there is the option to exclude fields from parsing checks, where
+they may be corrupted, and can be later amended using the 'cpanstats-update'
+tool.
+
+=back
+
+=head2 Private Methods
+
+=over
+
+=item * cleanup
+
+In the event that you do not wish to store all the articles permanently in the
+articles database, this method removes all but the most recent entry, which is
+kept to ensure that subsequent runs will start from the correct article. To
+enable this feature, specify 'nostore' within the has passed to new().
+
+=item * commit
+
+To speed up the transaction process, a commit is performed every 50 inserts.
+This method is used as part of the clean up process to ensure all transactions
+are completed.
+
+=item * nntp_connect
+
+Sets up the connection to the NNTP server.
+
+=item * parse_article
+
+Parses an article extracting the metadata required for the stats database.
+
+=item * insert_article
+
+Inserts an article into the articles database.
+
+=item * insert_stats
+
+Inserts the components of a parsed article into the statistics database.
+
+=back
+
+=head1 HISTORY
+
+The CPAN testers was conceived back in May 1998 by Graham Barr and Chris
+Nandor as a way to provide multi-platform testing for modules. Today there
+are over 2 million tester reports and more than 100 testers each month
+giving valuable feedback for users and authors alike.
+
+=head1 BECOME A TESTER
+
+Whether you have a common platform or a very unusual one, you can help by
+testing modules you install and submitting reports. There are plenty of
+module authors who could use test reports and helpful feedback on their
+modules and distributions.
+
+If you'd like to get involved, please take a look at the CPAN Testers Wiki,
+where you can learn how to install and configure one of the recommended
+smoke tools.
+
+For further help and advice, please subscribe to the the CPAN Testers
+discussion mailing list.
+
+ CPAN Testers Wiki
+ - http://wiki.cpantesters.org
+ CPAN Testers Discuss mailing list
+ - http://lists.cpan.org/showlist.cgi?name=cpan-testers-discuss
+
+=head1 BUGS, PATCHES & FIXES
+
+There are no known bugs at the time of this release. However, if you spot a
+bug or are experiencing difficulties, that is not explained within the POD
+documentation, please send bug reports and patches to the RT Queue (see below).
+
+Fixes are dependant upon their severity and my availablity. Should a fix not
+be forthcoming, please feel free to (politely) remind me.
+
+RT Queue -
+http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-Testers-Data-Generator
+
+=head1 SEE ALSO
+
+L<CPAN::WWW::Testers>,
+L<CPAN::Testers::WWW::Statistics>
+
+F<http://www.cpantesters.org/>,
+F<http://stats.cpantesters.org/>,
+F<http://wiki.cpantesters.org/>
+
+=head1 AUTHOR
+
+ Original author: Leon Brocard <acme@astray.com> 200?-2008
+ Current maintainer: Barbie <barbie@cpan.org> 2008-present
+
+=head1 LICENSE
+
+This code is distributed under the same license as Perl.
View
375 lib/CPAN/Testers/Data/Generator/Article.pm
@@ -0,0 +1,375 @@
+package CPAN::Testers::Data::Generator::Article;
+
+use warnings;
+use strict;
+use vars qw($VERSION);
+
+$VERSION = '0.31';
+
+#----------------------------------------------------------------------------
+# Library Modules
+
+use CPAN::DistnameInfo;
+use Email::Simple;
+use MIME::Base64;
+use MIME::QuotedPrint;
+use Time::Local;
+
+use base qw( Class::Accessor::Fast );
+
+#----------------------------------------------------------------------------
+# Variables
+
+my %month = (
+ Jan => 1, Feb => 2, Mar => 3, Apr => 4, May => 5, Jun => 6,
+ Jul => 7, Aug => 8, Sep => 9, Oct => 10, Nov => 11, Dec => 12,
+);
+
+my %regexes = (
+ # with time
+ 1 => { re => qr/(?:\w+,)?\s+(\d+)\s+(\w+)\s+(\d+)\s+(\d+):(\d+)/, f => [qw(day month year hour min)] }, # Wed, 13 September 2004 06:29
+ 2 => { re => qr/(\d+)\s+(\w+)\s+(\d+)\s+(\d+):(\d+)/, f => [qw(day month year hour min)] }, # 13 September 2004 06:29
+ 3 => { re => qr/(\w+)?\s+(\d+),?\s+(\d+)\s+(\d+):(\d+)/, f => [qw(month day year hour min)] }, # September 22, 1999 06:29
+
+ # just the date
+ 4 => { re => qr/(?:\w+,)?\s+(\d+)\s+(\w+)\s+(\d+)/, f => [qw(day month year)] }, # Wed, 13 September 2004
+ 5 => { re => qr/(\d+)\s+(\w+)\s+(\d+)/, f => [qw(day month year)] }, # 13 September 2004
+ 6 => { re => qr/(\w+)?\s+(\d+),?\s+(\d+)/, f => [qw(month day year)] }, # September 22, 1999 06:29
+);
+
+
+#----------------------------------------------------------------------------
+# The Application Programming Interface
+
+__PACKAGE__->mk_accessors(qw(
+ postdate date epoch status from distribution version
+ perl osname osvers archname subject author filename));
+
+sub new {
+ my($class, $article) = @_;
+ my $self = {};
+ bless $self, $class;
+
+ $article = decode_qp($article) if($article =~ /=3D/);
+
+ my $mail = Email::Simple->new($article);
+ return unless $mail;
+ return if $mail->header("In-Reply-To");
+
+ my $from = $mail->header("From");
+ my $subject = $mail->header("Subject");
+ return unless $subject;
+ return if $subject =~ /::/; # it's supposed to be a distribution
+
+ $self->{mail} = $mail;
+ $self->{from} = $from;
+ $self->{subject} = $subject;
+
+ ($self->{postdate},$self->{date},$self->{epoch}) = _parse_date($mail);
+
+ return $self;
+}
+
+sub _parse_date {
+ my $mail = shift;
+ my ($date1,$date2,$date3) = _extract_date($mail->header("Date"));
+ my @received = $mail->header("Received");
+
+ for my $hdr (@received) {
+ next unless($hdr =~ /.*;\s+(.*)\s*$/);
+ my ($dt1,$dt2,$dt3) = _extract_date($1);
+ if($dt2 > $date2 + 1200) {
+ $date1 = $dt1;
+ $date2 = $dt2;
+ $date3 = $dt3;
+ }
+ }
+
+#print STDERR " ... X.[Date: ".($date||'')."]\n";
+ return($date1,$date2,$date3);
+}
+
+sub _extract_date {
+ my $date = shift;
+ my (%fields,@fields,$index);
+
+#print STDERR "# ... 0.[Date: ".($date||'')."]\n";
+
+ for my $inx (sort {$a <=> $b} keys %regexes) {
+ (@fields) = ($date =~ $regexes{$inx}->{re});
+ if(@fields) {
+ $index = $inx;
+ last;
+ }
+ }
+
+ return('000000','000000000000',0) unless(@fields && $index);
+
+ @fields{@{$regexes{$index}->{f}}} = @fields;
+
+ $fields{month} = substr($fields{month},0,3);
+ $fields{mon} = $month{$fields{month}};
+ return('000000','000000000000',0) unless($fields{mon} && $fields{year} > 1998);
+
+ $fields{$_} ||= 0 for(qw(sec min hour day mon year));
+ my @date = map { $fields{$_} } qw(sec min hour day mon year);
+
+#print STDERR "# ... 1.[$_][$fields{$_}]\n" for(qw(year month day hour min));
+ my $short = sprintf "%04d%02d", $fields{year}, $fields{mon};
+ my $long = sprintf "%04d%02d%02d%02d%02d", $fields{year}, $fields{mon}, $fields{day}, $fields{hour}, $fields{min};
+ $date[4]--;
+ my $epoch = timelocal(@date);
+
+ return($short,$long,$epoch);
+}
+
+sub parse_upload {
+ my $self = shift;
+ my $mail = $self->{mail};
+ my $subject = $self->{subject};
+
+ return 0 unless($subject =~ /CPAN Upload:\s+([-\w\/\.\+]+)/i);
+ my $distvers = $1;
+
+ # only record supported archives
+ return 0 if($distvers !~ /\.(?:(?:tar\.|t)(?:gz|bz2)|zip)$/);
+
+ # CPAN::DistnameInfo doesn't support .tar.bz2 files ... yet
+ $distvers =~ s/\.(?:tar\.|t)bz2$//i;
+ $distvers .= '.tar.gz' unless $distvers =~ /\.(?:(?:tar\.|t)gz|zip)$/i;
+
+ # CPAN::DistnameInfo doesn't support old form of uploads
+ my @parts = split("/",$distvers);
+ if(@parts == 2) {
+ my ($first,$second,$rest) = split(//,$distvers,3);
+ $distvers = "$first/$first$second/$first$second$rest";
+ }
+
+ my $d = CPAN::DistnameInfo->new($distvers);
+ $self->distribution($d->dist);
+ $self->version($d->version);
+ $self->author($d->cpanid);
+ $self->filename($d->filename);
+
+ return 1;
+}
+
+sub parse_report {
+ my $self = shift;
+ my $mail = $self->{mail};
+ my $from = $self->{from};
+ my $subject = $self->{subject};
+
+ my ($status, $distversion, $platform) = split /\s+/, $subject;
+ return 0 unless $status =~ /^(PASS|FAIL|UNKNOWN|NA)$/i;
+
+ $platform ||= "";
+ $platform =~ s/[\s&,<].*//;
+
+ $distversion =~ s!/$!!;
+ $distversion =~ s/\.tar.*/.tar.gz/;
+ $distversion .= '.tar.gz' unless $distversion =~ /\.(tar|tgz|zip)/;
+
+ my $d = CPAN::DistnameInfo->new($distversion);
+ my ($dist, $version) = ($d->dist, $d->version);
+ return 0 unless defined $dist;
+ return 0 unless defined $version;
+
+ my $encoding = $mail->header('Content-Transfer-Encoding');
+
+ my $body = $mail->body;
+ $body = decode_base64($body) if($encoding && $encoding eq 'base64');
+
+ my $perl = $self->_extract_perl_version(\$body);
+
+ my ($osname) = $body =~ /Summary of my perl5.*osname=([^\s\n,<\']+)/s;
+ my ($osvers) = $body =~ /Summary of my perl5.*osvers=([^\s\n,<\']+)/s;
+ my ($archname) = $body =~ /Summary of my perl5.*archname=([^\s\n&,<\']+)/s;
+ $archname =~ s/\n.*// if($archname);
+
+ $self->status($status);
+ $self->distribution($dist);
+ $self->version($version);
+ $self->from($from || "");
+ $self->perl($perl);
+ $self->osname($osname || "");
+ $self->osvers($osvers || "");
+ $self->filename($d->filename);
+
+ unless($archname || $platform) {
+ if($osname && $osvers) { $platform = "$osname-$osvers" }
+ elsif($osname) { $platform = $osname }
+ }
+
+ $self->archname($archname || $platform);
+
+ return 1;
+}
+
+sub passed {
+ my $self = shift;
+ return $self->status eq 'PASS';
+}
+
+sub failed {
+ my $self = shift;
+ return $self->status eq 'FAIL';
+}
+
+# there are a few old test reports that omitted the perl version number.
+# In these instances 0 is assumed. These reports are now so old, that
+# worrying about them is not worth the effort.
+
+sub _extract_perl_version {
+ my ($self, $body) = @_;
+
+ # Summary of my perl5 (revision 5.0 version 6 subversion 1) configuration:
+ # Summary of my perl5 (revision 5 version 10 subversion 0) configuration:
+ my ($rev, $ver, $sub, $extra) =
+ $$body =~ /Summary of my (?:perl\d+)? \((?:revision )?(\d+(?:\.\d+)?) (?:version|patchlevel) (\d+) subversion\s+(\d+) ?(.*?)\) configuration/s;
+
+ unless(defined $rev) {
+# warn "Cannot parse perl version for article:\n$body";
+ return 0;
+ }
+
+ my $perl = $rev + ($ver / 1000) + ($sub / 1000000);
+ $rev = int($perl);
+ $ver = int(($perl*1000)%1000);
+ $sub = int(($perl*1000000)%1000);
+
+ my $version = sprintf "%d.%d.%d", $rev, $ver, $sub;
+ $version .= " $extra" if $extra;
+ return $version;
+# return sprintf "%0.6f", $perl; # an alternate format
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+CPAN::Testers::Data::Generator::Article - Parse a CPAN Testers article
+
+=head1 DESCRIPTION
+
+This is used by CPAN::Testers::Data::Generator.
+
+=head1 INTERFACE
+
+=head2 The Constructor
+
+=over 4
+
+=item * new
+
+The constructor. Pass in a reference to the article.
+
+=back
+
+=head2 Methods
+
+=over 4
+
+=item * parse_upload
+
+Parses an upload article.
+
+=item * parse_report
+
+Parses a report article.
+
+=item * passed
+
+Whether the report was a PASS
+
+=item * failed
+
+Whether the report was a FAIL
+