# UTF Character Web Scrape

In my Natural Language Processing project, special characters in the data appear as UTF-8 byte codes (for example `c3 a9`) rather than as the characters themselves (`é`). The website https://www.utf8-chartable.de/ has a table of UTF-8 byte codes and the characters they encode. The end goal is to build a dictionary of these mappings so that I can easily convert the codes back into characters in my NLP project.

## Import Libraries we need

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


## Data Loading

- The data is obtained from the table on this [web page](https://www.utf8-chartable.de/)

- The data is a table of UTF-8 byte codes and the characters they encode

In [2]:
url = 'https://www.utf8-chartable.de/'

# Bound the wait (in seconds): without a timeout, requests.get can block
# indefinitely if the server accepts the connection but never answers.
response = requests.get(url, timeout=30)


## Web Scraping

In [19]:
# Fail loudly on a bad response. Guarding the whole cell with a bare
# `if status == 200` meant a failed request silently skipped the scrape and
# left `char_data` / `utf8_data` undefined, surfacing later as a NameError.
if response.status_code != 200:
    raise RuntimeError(
        f"Request for {response.url} failed with HTTP status {response.status_code}"
    )

soup = BeautifulSoup(response.text, 'html.parser')

# All table cells tagged with class "char" and class "utf8", in document order.
char_elements = soup.find_all('td', class_='char')
utf8_elements = soup.find_all('td', class_='utf8')

# Keep only the whitespace-stripped text of each cell.
# NOTE(review): the two lists are collected independently and, judging by the
# printed outputs, are not index-aligned (utf8_data starts at '00' while
# char_data starts at a later row) - do not zip them without filtering first.
char_data = [cell.text.strip() for cell in char_elements]
utf8_data = [cell.text.strip() for cell in utf8_elements]

# An empty result means the page layout changed; stop here rather than
# carrying empty lists into the cleaning steps.
if not char_data or not utf8_data:
    raise RuntimeError("No <td class='char'> / <td class='utf8'> cells found in the page")


In [6]:
# Eyeball the text scraped from the `char` cells before filtering it.
print(char_data)


['', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '\xad', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', 'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ']


In [20]:
# Eyeball the hex byte strings scraped from the `utf8` cells before filtering them.
print(utf8_data)


['00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '0a', '0b', '0c', '0d', '0e', '0f', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '1a', '1b', '1c', '1d', '1e', '1f', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '2a', '2b', '2c', '2d', '2e', '2f', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '3a', '3b', '3c', '3d', '3e', '3f', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '4a', '4b', '4c', '4d', '4e', '4f', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '5a', '5b', '5c', '5d', '5e', '5f', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '6a', '6b', '6c', '6d', '6e', '6f', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '7a', '7b', '7c', '7d', '7e', '7f', 'c2 80', 'c2 81', 'c2 82', 'c2 83', 'c2 84', 'c2 85', 'c2 86', 'c2 87', 'c2 88', 'c2 89', 'c2 8a', 'c2 8b', 'c2 8c', 'c2 8d', 'c2 8e', 'c2 8f', 'c2 90', 'c2 91', 'c2 92', 'c2 93', 'c2 94', 'c2 95', 'c2 96', 'c2 97', 'c2 98', 'c2 99

## Cleaning up the Output

### Limit characters to those from the first accented character onward

In [7]:
# Position of the first accented letter ('À') in the scraped characters;
# this is the index the character list is sliced from in the next step.
char_data.index('À')


127

In [10]:
# Slice from the first accented letter onward. The start index is looked up
# rather than hard-coded (it was 127 on the run recorded in this notebook), so
# the cell keeps working if the scraped table gains or loses leading entries.
first_accented_idx = char_data.index('À')
filtered_char_data = char_data[first_accented_idx:]
print(filtered_char_data)
len(filtered_char_data)


['À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ']


64

### Limit UTF codes to those beginning with c3

In [24]:
# Keep only the byte sequences whose text starts with the lead byte "c3".
filtered_utf8_data = list(filter(lambda code: code[:2] == 'c3', utf8_data))
print(filtered_utf8_data)
len(filtered_utf8_data)


['c3 80', 'c3 81', 'c3 82', 'c3 83', 'c3 84', 'c3 85', 'c3 86', 'c3 87', 'c3 88', 'c3 89', 'c3 8a', 'c3 8b', 'c3 8c', 'c3 8d', 'c3 8e', 'c3 8f', 'c3 90', 'c3 91', 'c3 92', 'c3 93', 'c3 94', 'c3 95', 'c3 96', 'c3 97', 'c3 98', 'c3 99', 'c3 9a', 'c3 9b', 'c3 9c', 'c3 9d', 'c3 9e', 'c3 9f', 'c3 a0', 'c3 a1', 'c3 a2', 'c3 a3', 'c3 a4', 'c3 a5', 'c3 a6', 'c3 a7', 'c3 a8', 'c3 a9', 'c3 aa', 'c3 ab', 'c3 ac', 'c3 ad', 'c3 ae', 'c3 af', 'c3 b0', 'c3 b1', 'c3 b2', 'c3 b3', 'c3 b4', 'c3 b5', 'c3 b6', 'c3 b7', 'c3 b8', 'c3 b9', 'c3 ba', 'c3 bb', 'c3 bc', 'c3 bd', 'c3 be', 'c3 bf']


64

### Combine lists into a DataFrame

In [17]:
# The code list and the character list were filtered independently, so verify
# that they really line up before pairing them by position: every hex byte
# sequence must decode (as UTF-8) to the character it is paired with.
assert len(filtered_utf8_data) == len(filtered_char_data), (
    "UTF-8 code list and character list differ in length"
)
mismatched_pairs = [
    (code, char)
    for code, char in zip(filtered_utf8_data, filtered_char_data)
    # bytes.fromhex ignores the space between the two hex bytes ("c3 80").
    if bytes.fromhex(code).decode('utf-8') != char
]
assert not mismatched_pairs, f"Misaligned code/character pairs: {mismatched_pairs[:5]}"

# One row per (hex byte sequence, character) pair.
df = pd.DataFrame({ 'UTF-8': filtered_utf8_data, 
                   'Char': filtered_char_data,})
df


Unnamed: 0,UTF-8,Char
0,c3 80,À
1,c3 81,Á
2,c3 82,Â
3,c3 83,Ã
4,c3 84,Ä
...,...,...
59,c3 bb,û
60,c3 bc,ü
61,c3 bd,ý
62,c3 be,þ


### Convert DataFrame into a dictionary

In [18]:
# Map each hex byte string in the 'UTF-8' column to the character in the
# same row of the 'Char' column.
utf8_char_dict = dict(zip(df['UTF-8'], df['Char']))
utf8_char_dict


{'c3 80': 'À',
 'c3 81': 'Á',
 'c3 82': 'Â',
 'c3 83': 'Ã',
 'c3 84': 'Ä',
 'c3 85': 'Å',
 'c3 86': 'Æ',
 'c3 87': 'Ç',
 'c3 88': 'È',
 'c3 89': 'É',
 'c3 8a': 'Ê',
 'c3 8b': 'Ë',
 'c3 8c': 'Ì',
 'c3 8d': 'Í',
 'c3 8e': 'Î',
 'c3 8f': 'Ï',
 'c3 90': 'Ð',
 'c3 91': 'Ñ',
 'c3 92': 'Ò',
 'c3 93': 'Ó',
 'c3 94': 'Ô',
 'c3 95': 'Õ',
 'c3 96': 'Ö',
 'c3 97': '×',
 'c3 98': 'Ø',
 'c3 99': 'Ù',
 'c3 9a': 'Ú',
 'c3 9b': 'Û',
 'c3 9c': 'Ü',
 'c3 9d': 'Ý',
 'c3 9e': 'Þ',
 'c3 9f': 'ß',
 'c3 a0': 'à',
 'c3 a1': 'á',
 'c3 a2': 'â',
 'c3 a3': 'ã',
 'c3 a4': 'ä',
 'c3 a5': 'å',
 'c3 a6': 'æ',
 'c3 a7': 'ç',
 'c3 a8': 'è',
 'c3 a9': 'é',
 'c3 aa': 'ê',
 'c3 ab': 'ë',
 'c3 ac': 'ì',
 'c3 ad': 'í',
 'c3 ae': 'î',
 'c3 af': 'ï',
 'c3 b0': 'ð',
 'c3 b1': 'ñ',
 'c3 b2': 'ò',
 'c3 b3': 'ó',
 'c3 b4': 'ô',
 'c3 b5': 'õ',
 'c3 b6': 'ö',
 'c3 b7': '÷',
 'c3 b8': 'ø',
 'c3 b9': 'ù',
 'c3 ba': 'ú',
 'c3 bb': 'û',
 'c3 bc': 'ü',
 'c3 bd': 'ý',
 'c3 be': 'þ',
 'c3 bf': 'ÿ'}