Permalink
Browse files

First commit

  • Loading branch information...
0 parents commit 392fb2839d5113e516e0d21a3e9d6ec2ffe14cbf @batterseapower committed Oct 31, 2010
Showing with 13,308 additions and 0 deletions.
  1. +10 −0 .gitignore
  2. +49 −0 README.md
  3. +70 −0 charsetdetect.cpp
  4. +30 −0 charsetdetect.h
  5. +19 −0 charsetdetectPriv.h
  6. +465 −0 libcharsetdetect.xcodeproj/project.pbxproj
  7. +943 −0 mozilla/extensions/universalchardet/src/base/Big5Freq.tab
  8. +111 −0 mozilla/extensions/universalchardet/src/base/CharDistribution.cpp
  9. +236 −0 mozilla/extensions/universalchardet/src/base/CharDistribution.h
  10. +614 −0 mozilla/extensions/universalchardet/src/base/EUCKRFreq.tab
  11. +447 −0 mozilla/extensions/universalchardet/src/base/EUCTWFreq.tab
  12. +491 −0 mozilla/extensions/universalchardet/src/base/GB2312Freq.tab
  13. +589 −0 mozilla/extensions/universalchardet/src/base/JISFreq.tab
  14. +231 −0 mozilla/extensions/universalchardet/src/base/JpCntx.cpp
  15. +137 −0 mozilla/extensions/universalchardet/src/base/JpCntx.h
  16. +245 −0 mozilla/extensions/universalchardet/src/base/LangBulgarianModel.cpp
  17. +355 −0 mozilla/extensions/universalchardet/src/base/LangCyrillicModel.cpp
  18. +244 −0 mozilla/extensions/universalchardet/src/base/LangGreekModel.cpp
  19. +219 −0 mozilla/extensions/universalchardet/src/base/LangHebrewModel.cpp
  20. +242 −0 mozilla/extensions/universalchardet/src/base/LangHungarianModel.cpp
  21. +221 −0 mozilla/extensions/universalchardet/src/base/LangThaiModel.cpp
  22. +87 −0 mozilla/extensions/universalchardet/src/base/Makefile.in
  23. +88 −0 mozilla/extensions/universalchardet/src/base/nsBig5Prober.cpp
  24. +71 −0 mozilla/extensions/universalchardet/src/base/nsBig5Prober.h
  25. +125 −0 mozilla/extensions/universalchardet/src/base/nsCharSetProber.cpp
  26. +76 −0 mozilla/extensions/universalchardet/src/base/nsCharSetProber.h
  27. +108 −0 mozilla/extensions/universalchardet/src/base/nsCodingStateMachine.h
  28. +99 −0 mozilla/extensions/universalchardet/src/base/nsEUCJPProber.cpp
  29. +75 −0 mozilla/extensions/universalchardet/src/base/nsEUCJPProber.h
  30. +91 −0 mozilla/extensions/universalchardet/src/base/nsEUCKRProber.cpp
  31. +71 −0 mozilla/extensions/universalchardet/src/base/nsEUCKRProber.h
  32. +91 −0 mozilla/extensions/universalchardet/src/base/nsEUCTWProber.cpp
  33. +71 −0 mozilla/extensions/universalchardet/src/base/nsEUCTWProber.h
  34. +101 −0 mozilla/extensions/universalchardet/src/base/nsEscCharsetProber.cpp
  35. +67 −0 mozilla/extensions/universalchardet/src/base/nsEscCharsetProber.h
  36. +263 −0 mozilla/extensions/universalchardet/src/base/nsEscSM.cpp
  37. +96 −0 mozilla/extensions/universalchardet/src/base/nsGB2312Prober.cpp
  38. +73 −0 mozilla/extensions/universalchardet/src/base/nsGB2312Prober.h
  39. +194 −0 mozilla/extensions/universalchardet/src/base/nsHebrewProber.cpp
  40. +176 −0 mozilla/extensions/universalchardet/src/base/nsHebrewProber.h
  41. +182 −0 mozilla/extensions/universalchardet/src/base/nsLatin1Prober.cpp
  42. +70 −0 mozilla/extensions/universalchardet/src/base/nsLatin1Prober.h
  43. +230 −0 mozilla/extensions/universalchardet/src/base/nsMBCSGroupProber.cpp
  44. +80 −0 mozilla/extensions/universalchardet/src/base/nsMBCSGroupProber.h
  45. +626 −0 mozilla/extensions/universalchardet/src/base/nsMBCSSM.cpp
  46. +89 −0 mozilla/extensions/universalchardet/src/base/nsPkgInt.h
  47. +223 −0 mozilla/extensions/universalchardet/src/base/nsSBCSGroupProber.cpp
  48. +70 −0 mozilla/extensions/universalchardet/src/base/nsSBCSGroupProber.h
  49. +126 −0 mozilla/extensions/universalchardet/src/base/nsSBCharSetProber.cpp
  50. +124 −0 mozilla/extensions/universalchardet/src/base/nsSBCharSetProber.h
  51. +98 −0 mozilla/extensions/universalchardet/src/base/nsSJISProber.cpp
  52. +77 −0 mozilla/extensions/universalchardet/src/base/nsSJISProber.h
  53. +87 −0 mozilla/extensions/universalchardet/src/base/nsUTF8Prober.cpp
  54. +64 −0 mozilla/extensions/universalchardet/src/base/nsUTF8Prober.h
  55. +295 −0 mozilla/extensions/universalchardet/src/base/nsUniversalDetector.cpp
  56. +89 −0 mozilla/extensions/universalchardet/src/base/nsUniversalDetector.h
  57. +5 −0 nscore.h
  58. +30 −0 nspr-emu/README.md
  59. +231 −0 nspr-emu/obsolete/protypes.h
  60. +49 −0 nspr-emu/prcpucfg.h
  61. +337 −0 nspr-emu/prcpucfg_freebsd.h
  62. +707 −0 nspr-emu/prcpucfg_linux.h
  63. +145 −0 nspr-emu/prcpucfg_mac.h
  64. +337 −0 nspr-emu/prcpucfg_openbsd.h
  65. +256 −0 nspr-emu/prcpucfg_win.h
  66. +155 −0 nspr-emu/prmem.h
  67. +533 −0 nspr-emu/prtypes.h
  68. +2 −0 update-mozilla
@@ -0,0 +1,10 @@
+# OS junk
+.DS_Store
+Thumbs.db
+
+# XCode build artifacts
+build/
+
+# XCode user configuration
+*.mode1v3
+*.pbxuser
@@ -0,0 +1,49 @@
+# Universal Character Set Detector (UCSD)
+
+A library exposing a dependency-free C interface to the Mozilla C++ UCSD library.
+
+Pulls together:
+
+ * A NSPR emulation library (see nspr-emu/README.md)
+ * Code written by Colin Snover to provide a command line interface to the library
+ * The UCSD library itself from the Mozilla seamonkey source tree
+
+## API documentation
+
+The library provides an opaque type of character set detectors:
+
+ typedef void* csd_t;
+
+The first thing a client should do is create one of these:
+
+ csd_t csd_open(void);
+
+A `csd_t` created in this fashion must be freed by `csd_close`. If creation fails, `csd_open` returns `(csd_t)-1`.
+
+Now you need to feed some data to the detector:
+
+ int csd_consider(csd_t csd, const char *data, int length);
+
+The meaning of the return code is as follows:
+
+ * Returns 0 if more data is needed to come to a conclusion
+ * Returns a positive number if enough data has been received to detect the character set
+ * Returns a negative number if there is an error
+
+Finally, close the detector to find out what the character set is:
+
+ const char *csd_close(csd_t csd);
+
+The detected character set name is returned as an ASCII string. This function returns `NULL` if detection failed because there was not
+enough data. It is safe to call `csd_close` at any point from creation by `csd_open` to the first call of `csd_close` on that character
+set detector.
+
+## Licensing
+
+The files charsetdetect.{cpp,h} are (c) 2010 Colin Snover and released under an MIT license.
+
+The UCSD is (c) mozilla.org and tri-licensed under MPL 1.1/GPL 2.0/LGPL 2.1.
+
+We incorporate header files from the NSPR emulation library, which is LGPL licensed.
+
+Thus the resulting artifact is LGPL licensed (I think).
@@ -0,0 +1,70 @@
+#include <iostream>
+#include "charsetdetect.h"
+#include "charsetdetectPriv.h"
+
+
+//
+// C++ API to the character set detector (not exported)
+//
+
+// Callback used to announce a detected charset: remembers the reported
+// name and flags the detector as finished.
+void Detector::Report(const char* aCharset) {
+    // Keep the reported name so Close() can return it later.
+    mDetectedCharset = aCharset;
+
+    // The original detector code does not always set mDone itself, so it
+    // is forced to true here rather than by patching that code.
+    mDone = PR_TRUE;
+}
+
+// Feed `length` bytes starting at `data` to the underlying detector.
+//
+// Returns:
+//   * a negative number if HandleData reports NS_ERROR_OUT_OF_MEMORY,
+//   * a positive number once mDone is set (enough data has been seen),
+//   * 0 while more data is still needed.
+//
+// This is the contract documented for csd_consider; the previous
+// implementation had the 0 / positive cases swapped.
+int Detector::Consider(const char *data, int length) {
+    if (HandleData(data, length) == NS_ERROR_OUT_OF_MEMORY) {
+        // Error, signal with a negative number
+        return -1;
+    }
+
+    if (mDone) {
+        // Detected early: no more data is required
+        return 1;
+    }
+
+    // Need more data!
+    return 0;
+}
+
+// Signals end of input via DataEnd() and returns the resulting charset name.
+// Returns NULL when nothing was reported and the input state gives no
+// fallback answer.
+const char *Detector::Close(void) {
+    DataEnd();
+
+    // A charset was reported: hand back the recorded name.
+    if (mDone) {
+        return mDetectedCharset;
+    }
+
+    // Nothing was reported; fall back on the kind of input that was seen.
+    if (mInputState == eEscAscii) {
+        return "ibm850";
+    }
+    if (mInputState == ePureAscii) {
+        return "ASCII";
+    }
+
+    return NULL;
+}
+
+
+//
+// C API to the character set detector (we actually export this)
+//
+
+// Creates a new character set detector using the NS_FILTER_ALL language
+// filter. The returned handle must be released with csd_close.
+//
+// Returns (csd_t)-1 if the detector could not be created. Any exception
+// raised during construction (e.g. by "new") is caught here so that it
+// never propagates across the C API boundary.
+csd_t csd_open(void) {
+    // TODO: provide C-land with access to the language filter constructor argument
+    try {
+        return new Detector(NS_FILTER_ALL);
+    } catch (...) {
+        // Report the failure in-band, as documented in charsetdetect.h
+        return (csd_t)-1;
+    }
+}
+
+// Feeds `length` bytes of `data` to the detector behind `csd`.
+//
+// The parameter list must match the extern "C" declaration in
+// charsetdetect.h exactly (const char *data); a non-const `char *`
+// parameter would define a separate C++ overload and leave the exported
+// C symbol undefined.
+//
+// Returns whatever Detector::Consider returns, or a negative number if
+// `csd` is NULL or the (csd_t)-1 failure value from csd_open.
+int csd_consider(csd_t csd, const char *data, int length) {
+    if (csd == NULL || csd == (csd_t)-1) {
+        // Not a live detector: report an error instead of dereferencing it
+        return -1;
+    }
+
+    return ((Detector*)csd)->Consider(data, length);
+}
+
+// Finishes detection, destroys the detector behind `csd` and returns the
+// charset name produced by Detector::Close (NULL if detection failed).
+//
+// `csd` must not be used again after this call. Passing NULL or the
+// (csd_t)-1 failure value from csd_open is tolerated and yields NULL
+// without touching memory.
+const char *csd_close(csd_t csd) {
+    if (csd == NULL || csd == (csd_t)-1) {
+        // Nothing was ever allocated for this handle
+        return NULL;
+    }
+
+    Detector *detector = (Detector*)csd;
+    // Fetch the result before the detector object is destroyed
+    const char *result = detector->Close();
+    delete detector;
+    return result;
+}
@@ -0,0 +1,30 @@
+#ifndef charsetdetect_
+#define charsetdetect_
+
+/* The functions and types below are exported */
+#pragma GCC visibility push(default)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque handle to a character set detector; obtain one from csd_open.
+typedef void* csd_t;
+
+// Creates a new character set detector. The handle must be released with csd_close.
+// If creation fails, returns (csd_t)-1 instead of a usable handle.
+csd_t csd_open(void);
+// Feeds `length` bytes starting at `data` to the detector `csd`. Returns 0 if the
+// detector needs more data to come to a conclusion, a positive number if it has seen
+// enough to say what the character set is, and a negative number if there is an error.
+int csd_consider(csd_t csd, const char *data, int length);
+// Closes the detector `csd` and returns the detected character set name as an ASCII string.
+// Returns NULL if detection failed. The handle must not be used after this call.
+const char *csd_close(csd_t csd);
+
+#ifdef __cplusplus
+}
+#endif
+
+#pragma GCC visibility pop
+#endif
@@ -0,0 +1,19 @@
+/* The classes below are not exported */
+#pragma GCC visibility push(hidden)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "nscore.h"
+#include "nsUniversalDetector.h"
+
+// Detector built on nsUniversalDetector that records the charset name
+// passed to Report() so that it can be returned from Close().
+class Detector : public nsUniversalDetector {
+public:
+    // aLanguageFilter is forwarded unchanged to nsUniversalDetector.
+    // mDetectedCharset starts out NULL so that Close() never returns an
+    // indeterminate pointer if mDone is set without Report() having run.
+    Detector(PRUint32 aLanguageFilter)
+        : nsUniversalDetector(aLanguageFilter), mDetectedCharset(NULL) {}
+    // Feeds `length` bytes of `data` to the detector.
+    int Consider(const char *data, int length);
+    // Ends detection and returns the charset name, or NULL if none.
+    const char *Close(void);
+protected:
+    // Stores aCharset in mDetectedCharset and marks detection as done.
+    void Report(const char* aCharset);
+    // Name most recently passed to Report(); NULL until then.
+    const char *mDetectedCharset;
+};
+
+#pragma GCC visibility pop
Oops, something went wrong.

0 comments on commit 392fb28

Please sign in to comment.