# Open Text File with encoding handling

In [1]:
# Create a sample file encoded in latin-1 (ISO-8859-1).
# The accented characters are stored as single bytes, so the file is not valid UTF-8.
s = "J'aime les frites bien grasse étalon châpeau!"
encoded_s = bytes(s, 'latin-1')
with open('somefile.txt', mode='wb') as f:
    f.write(encoded_s)

## text loader

If you don't know what the encoding is, you can use `text_loader()`. It will try to open the file as UTF-8, and if that fails it will apply `detect_encoding()` to guess the encoding.

In [2]:
from nautilus_nlp.utils.file_loader import text_loader

In [3]:
# No encoding is passed here, so the loader has to work it out itself
# (the INFO line in the output reports the encoding that was detected).
text_loader('somefile.txt')

INFO:root:detected encoding is ISO-8859-1, with a confidence rate of 0.73


"J'aime les frites bien grasse étalon châpeau!"

## open text file when you know the encoding

If you know the encoding, pass it through the `encoding` argument (you can also use `open_textfile()`).

In [4]:
# Encoding given explicitly; the saved output shows no detection log for this call.
text_loader('somefile.txt', encoding='latin-1')

"J'aime les frites bien grasse étalon châpeau!"

## detect encoding 

In [5]:
from nautilus_nlp.utils.file_loader import detect_encoding

In [6]:
# The result is a dict with 'encoding', 'confidence' and 'language' keys (see the output below).
detect_encoding('somefile.txt')

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}

## List files in a folder

In [7]:
from nautilus_nlp.utils.file_loader import list_files

In [8]:
list_files('.')  # list the files in the current folder

['./somefile.txt',
 './Open a file, a list of files and detect encoding.ipynb',
 './Visualization tools.ipynb',
 './Language_identification.ipynb',
 './someadditionalfile.txt',
 './somefile',
 './Common Text Processing operations.ipynb',
 './Sentiment_analysis_FT.ipynb',
 './Spacy_model.ipynb',
 './Sentiment analysis using pre-trained models.ipynb']

In [9]:
list_files('./*.ipynb')  # list only the files matching a wildcard pattern (here: the notebooks)

['./Open a file, a list of files and detect encoding.ipynb',
 './Visualization tools.ipynb',
 './Language_identification.ipynb',
 './Common Text Processing operations.ipynb',
 './Sentiment_analysis_FT.ipynb',
 './Spacy_model.ipynb',
 './Sentiment analysis using pre-trained models.ipynb']

In [10]:
# Only files are listed, not folders.
# NOTE(review): this absolute path is specific to the author's machine;
# point it at a folder that exists locally before re-running the cell.
list_files('/Users/hugo/Documents/NAUTILUS/nautilus-nlp/')

['/Users/hugo/Documents/NAUTILUS/nautilus-nlp/LICENSE',
 '/Users/hugo/Documents/NAUTILUS/nautilus-nlp/requirements.txt',
 '/Users/hugo/Documents/NAUTILUS/nautilus-nlp/Makefile',
 '/Users/hugo/Documents/NAUTILUS/nautilus-nlp/README.md',
 '/Users/hugo/Documents/NAUTILUS/nautilus-nlp/setup.py',
 '/Users/hugo/Documents/NAUTILUS/nautilus-nlp/tox.ini',
 '/Users/hugo/Documents/NAUTILUS/nautilus-nlp/test_environment.py']

## Open several text files

In [11]:
# Create a second sample document, this time encoded in UTF-8.
s = "Un deuxième exemple de texte en utf-8 cette fois!"
encoded_s = bytes(s, 'utf-8')
with open('someadditionalfile.txt', mode='wb') as f:
    f.write(encoded_s)

In [12]:
from nautilus_nlp.utils.file_loader import documents_loader

In [13]:
# Load every file matching the pattern; the result maps each file name to its text.
# NOTE(review): the saved output below shows the latin-1 sentence for both files,
# although the cell above writes a different UTF-8 sentence to someadditionalfile.txt.
# The output looks stale; re-run the notebook from a fresh kernel to confirm.
documents_loader('*.txt')

INFO:root:detected encoding is ISO-8859-1, with a confidence rate of 0.73
INFO:root:detected encoding is ISO-8859-1, with a confidence rate of 0.73


{'somefile.txt': "J'aime les frites bien grasse étalon châpeau!",
 'someadditionalfile.txt': "J'aime les frites bien grasse étalon châpeau!"}

In [14]:
# An explicit list of file names is accepted as well as a pattern.
documents_loader(['somefile.txt', 'someadditionalfile.txt'])

INFO:root:detected encoding is ISO-8859-1, with a confidence rate of 0.73
INFO:root:detected encoding is ISO-8859-1, with a confidence rate of 0.73


{'somefile.txt': "J'aime les frites bien grasse étalon châpeau!",
 'someadditionalfile.txt': "J'aime les frites bien grasse étalon châpeau!"}

In [15]:
# Single file with a known encoding, passed explicitly.
documents_loader('someadditionalfile.txt', encoding='utf-8')

'Un deuxième exemple de texte en utf-8 cette fois!'