# Open Text File with encoding handling

In [3]:
# Create a small sample file whose bytes are Latin-1 (ISO-8859-1) encoded
s = "J'aime les frites bien grasse étalon châpeau!"
encoded_s = bytes(s, 'latin-1')
with open('somefile.txt', mode='wb') as sample_file:
    sample_file.write(encoded_s)

## text loader

If you don't know what the encoding is, you can use `text_loader()`. It will try to open the file as UTF-8 and, if that fails, fall back to `detect_encoding()`.

In [4]:
from nautilus_nlp.utils.file_loader import text_loader

In [8]:
# Bind the result to a descriptive name rather than `_`: assigning to `_`
# overrides IPython's automatic "last output" variable for the rest of the session.
# NOTE(review): detectencoding=False presumably disables the detect_encoding()
# fallback; confirm this is the intended demo for a latin-1 file.
loaded_text = text_loader('somefile.txt', detectencoding=False)



## open text file when you know the encoding

If you know the encoding, pass it explicitly with the `encoding` argument (you can also use `open_textfile()`).

In [10]:
# The encoding is known, so pass it explicitly; the decoded text is the cell's output.
text_loader(
    'somefile.txt',
    encoding='latin-1',
)

"J'aime les frites bien grasse étalon châpeau!"

## detect encoding 

In [11]:
from nautilus_nlp.utils.file_loader import detect_encoding

In [12]:
# Run encoding detection on the sample file; the returned value is displayed as the cell output.
detect_encoding('somefile.txt')

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}

## List files in a folder

In [13]:
from nautilus_nlp.utils.file_loader import list_files

In [14]:
list_files('.') # list files from the current folder

['./Sentiment_analysis_FT.ipynb',
 './Language_identification_with_FT.ipynb',
 './somefile.txt',
 './Sentiment analysis using pre-trained models.ipynb',
 './Language_identification.ipynb',
 './Spacy_model.ipynb',
 './Open a file, a list of files and detect encoding.ipynb',
 './Untitled1.ipynb',
 './cooking.ftz',
 './GrandDebat.ipynb',
 './Common Text Processing operations.ipynb',
 './Untitled.ipynb',
 './cooking.bin',
 './Visualization tools.ipynb']

In [15]:
list_files('./*.ipynb') # List files matching a specific pattern (here: every .ipynb in the current folder)

['./Sentiment_analysis_FT.ipynb',
 './Language_identification_with_FT.ipynb',
 './Sentiment analysis using pre-trained models.ipynb',
 './Language_identification.ipynb',
 './Spacy_model.ipynb',
 './Open a file, a list of files and detect encoding.ipynb',
 './Untitled1.ipynb',
 './GrandDebat.ipynb',
 './Common Text Processing operations.ipynb',
 './Untitled.ipynb',
 './Visualization tools.ipynb']

In [21]:
# only files will be listed, not folders
# NOTE(review): this is an absolute, user-specific path (/home/robin/...); it will not
# resolve on another machine. Consider a path relative to the project root.
list_files('/home/robin/nautilus_nlp/*')

['/home/robin/nautilus_nlp/requirements.txt',
 '/home/robin/nautilus_nlp/test_environment.py',
 '/home/robin/nautilus_nlp/Makefile',
 '/home/robin/nautilus_nlp/setup.py',
 '/home/robin/nautilus_nlp/download_spacy_models.sh',
 '/home/robin/nautilus_nlp/Dockerfile',
 '/home/robin/nautilus_nlp/LICENSE',
 '/home/robin/nautilus_nlp/README.md',
 '/home/robin/nautilus_nlp/tox.ini',
 '/home/robin/nautilus_nlp/Untitled.ipynb',
 '/home/robin/nautilus_nlp/VERSION']

## Open several text files

In [22]:
# Add a second sample document, this one stored as UTF-8 bytes
s = "Un deuxième exemple de texte en utf-8 cette fois!"
encoded_s = bytes(s, 'utf-8')
with open('someadditionalfile.txt', mode='wb') as sample_file:
    sample_file.write(encoded_s)

In [23]:
from nautilus_nlp.utils.file_loader import documents_loader

In [28]:
# Load every file matching the '*.txt' pattern.
# NOTE(review): `_` is also IPython's automatic "last output" variable; the following
# cell reads this value back through `_`, so the two cells must be renamed together.
_=documents_loader('*.txt')

INFO:root:detected encoding is ISO-8859-1, with a confidence rate of 0.73
INFO:root:detected encoding is ISO-8859-1, with a confidence rate of 0.73


In [29]:
# Display the value that the preceding cell assigned to `_`.
_

{'somefile.txt': "J'aime les frites bien grasse étalon châpeau!",
 'someadditionalfile.txt': "J'aime les frites bien grasse étalon châpeau!"}

In [30]:
# Call documents_loader with an explicit list of file paths instead of a pattern.
documents_loader([
    'somefile.txt',
    'someadditionalfile.txt',
])

INFO:root:detected encoding is ISO-8859-1, with a confidence rate of 0.73
INFO:root:detected encoding is ISO-8859-1, with a confidence rate of 0.73


{'somefile.txt': "J'aime les frites bien grasse étalon châpeau!",
 'someadditionalfile.txt': "J'aime les frites bien grasse étalon châpeau!"}

In [42]:
# `json` is only imported in a later cell, so import it here: without this the
# cell raises NameError when the notebook is run top to bottom on a fresh kernel.
import json

# Load test.json as text with an explicit encoding, then parse the JSON payload.
# NOTE(review): test.json is not created anywhere in the visible cells; confirm it exists.
json.loads(documents_loader('test.json', encoding='utf-8'))

{'somefile.txt': "J'aime les frites bien grasse étalon châpeau!",
 'someadditionalfile.txt': "J'aime les frites bien grasse étalon châpeau!"}

In [40]:
import json

# Parse test.json with the standard json module; the parsed object is kept in `__`.
with open('test.json', mode='r', encoding='utf-8') as json_file:
    __ = json.load(json_file)

In [41]:
# Display the object that the preceding cell stored in `__`.
__

{'somefile.txt': "J'aime les frites bien grasse étalon châpeau!",
 'someadditionalfile.txt': "J'aime les frites bien grasse étalon châpeau!"}