# Open Text File with encoding handling

In [1]:
# Create a sample file whose bytes are Latin-1 (not UTF-8) encoded, so the
# loader examples below have a non-UTF-8 document to work with.
s = "J'aime les frites bien grasse étalon châpeau!"
encoded_s = s.encode(encoding='latin-1')
# Binary mode: the already-encoded bytes are written to disk unchanged.
with open('somefile.txt', mode='wb') as latin1_file:
    latin1_file.write(encoded_s)

In [2]:
# Create a second sample file, this time encoded as UTF-8.
s = "Un deuxième exemple de texte en utf-8 cette fois!"
encoded_s = s.encode(encoding='utf-8')
# Binary mode: the already-encoded bytes are written to disk unchanged.
with open('someadditionalfile.txt', mode='wb') as utf8_file:
    utf8_file.write(encoded_s)

# Document loader

In [3]:
from nautilus_nlp.utils.file_loader import documents_loader

In [18]:
# Pass the encoding explicitly: somefile.txt was written as Latin-1 above.
documents_loader('somefile.txt', encoding='latin-1')

"J'aime les frites bien grasse étalon châpeau!"

In [19]:
# No encoding argument is given: the default encoding is UTF-8, which matches
# how someadditionalfile.txt was written.
documents_loader('someadditionalfile.txt')

'Un deuxième exemple de texte en utf-8 cette fois!'

## Detect encoding

If you don't specify an encoding, `documents_loader()` first tries to open the file as UTF-8; if that fails, it tries to detect the encoding automatically.

In [15]:
# somefile.txt holds Latin-1 bytes and no encoding is given here; the INFO log
# line in the output reports the encoding that was detected for it.
documents_loader('somefile.txt')

INFO:root:somefile.txt: detected encoding is ISO-8859-1, with a confidence rate of 0.73


"J'aime les frites bien grasse étalon châpeau!"

In [17]:
# You can prevent document loader from detecting the encoding if UTF-8 fails 
documents_loader('somefile.txt', detectencoding=False)



TypeError: function takes exactly 5 arguments (1 given)

## Open several files

In [12]:
# A wildcard pattern opens several documents at once; as the output shows, the
# result is a dict mapping each file path to its text.
documents_loader('*.txt')

INFO:root:somefile.txt: detected encoding is ISO-8859-1, with a confidence rate of 0.73


{'somefile.txt': "J'aime les frites bien grasse étalon châpeau!",
 'someadditionalfile.txt': 'Un deuxième exemple de texte en utf-8 cette fois!'}

In [13]:
# A list of file paths is accepted as well; the output has the same dict shape
# (file path -> text) as the wildcard example.
documents_loader(['somefile.txt','someadditionalfile.txt'])

INFO:root:somefile.txt: detected encoding is ISO-8859-1, with a confidence rate of 0.73


{'somefile.txt': "J'aime les frites bien grasse étalon châpeau!",
 'someadditionalfile.txt': 'Un deuxième exemple de texte en utf-8 cette fois!'}

In [14]:
# When several texts are loaded you can choose the output format:
# output_as='list' gives a plain list of texts instead of a dict keyed by path.
documents_loader('*.txt', output_as='list')

INFO:root:somefile.txt: detected encoding is ISO-8859-1, with a confidence rate of 0.73


["J'aime les frites bien grasse étalon châpeau!",
 'Un deuxième exemple de texte en utf-8 cette fois!']

## List files in a folder

In [7]:
from nautilus_nlp.utils.file_loader import list_files

In [8]:
list_files('.') # list the files in the current folder

['./somefile.txt',
 './Open a file, a list of files and detect encoding.ipynb',
 './Visualization tools.ipynb',
 './Language_identification.ipynb',
 './someadditionalfile.txt',
 './somefile',
 './Common Text Processing operations.ipynb',
 './Sentiment_analysis_FT.ipynb',
 './Spacy_model.ipynb',
 './Sentiment analysis using pre-trained models.ipynb']

In [9]:
list_files('./*.ipynb') # list only the files matching a specific pattern

['./Open a file, a list of files and detect encoding.ipynb',
 './Visualization tools.ipynb',
 './Language_identification.ipynb',
 './Common Text Processing operations.ipynb',
 './Sentiment_analysis_FT.ipynb',
 './Spacy_model.ipynb',
 './Sentiment analysis using pre-trained models.ipynb']

In [10]:
# Only files are listed, not folders (see the output below).
# NOTE(review): this absolute path is specific to the original author's
# machine; replace it with a directory that exists on your system before
# running this cell.
list_files('/Users/hugo/Documents/NAUTILUS/nautilus-nlp/')

['/Users/hugo/Documents/NAUTILUS/nautilus-nlp/LICENSE',
 '/Users/hugo/Documents/NAUTILUS/nautilus-nlp/requirements.txt',
 '/Users/hugo/Documents/NAUTILUS/nautilus-nlp/Makefile',
 '/Users/hugo/Documents/NAUTILUS/nautilus-nlp/README.md',
 '/Users/hugo/Documents/NAUTILUS/nautilus-nlp/setup.py',
 '/Users/hugo/Documents/NAUTILUS/nautilus-nlp/tox.ini',
 '/Users/hugo/Documents/NAUTILUS/nautilus-nlp/test_environment.py']

## Detect the encoding of a file

In [5]:
from nautilus_nlp.utils.file_loader import detect_encoding

In [6]:
# As the output shows, the result is a dict with 'encoding', 'confidence' and
# 'language' keys.
detect_encoding('somefile.txt')

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}