<a href="https://colab.research.google.com/github/bncolorado/Processing-ELTeC-corpus/blob/main/COLAB_notebooks/ELTeC_CodeSwitching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extracting code switching

One of the TEI tags used in the ELTeC corpus is `foreign`, which marks code switching. This notebook shows how to extract the fragments written in languages other than Spanish (Castilian) from ELTeC-SPA and how to generate a word cloud for each language.

## Loading ELTeC-SPA corpus in Colab

In [None]:
import zipfile

!wget "https://github.com/COST-ELTeC/ELTeC-spa/archive/refs/heads/master.zip" # paste here corpus url

zip_ref = zipfile.ZipFile('master.zip', 'r') #Opens the zip file in read mode
zip_ref.extractall() #Extracts files here (/content/)
zip_ref.close() 
!rm master.zip #Removes ZIP to save space

## Open each file and extract information about code switching

---



In [None]:
import os
import re
from bs4 import BeautifulSoup

dir_in = "/content/ELTeC-spa-master/level1/"
FILE_PREFIX = "SPA" # Language ID. Change if you are processing texts from another collection.
UNKNOWN_LANG = "und" # Label used when a <foreign> tag carries no xml:lang attribute.

# Punctuation marks that are deleted from every fragment.
PUNCTUATION_RE = re.compile(r'[!¡?¿,.;«»():]')
# Any run of whitespace: spaces, tabs and line breaks.
WHITESPACE_RE = re.compile(r'\s+')


def clean_fragment(text):
  """Normalise the text content of one <foreign> element.

  The text is lower-cased, the punctuation matched by PUNCTUATION_RE is
  deleted, every run of whitespace (including line breaks and indentation
  inside the XML) is collapsed to ONE space so that neighbouring words are
  never glued together, and leading/trailing spaces and hyphens are stripped.

  Args:
    text: raw text of the element.

  Returns:
    The cleaned fragment; an empty string if nothing but punctuation and
    whitespace was present.
  """
  text = PUNCTUATION_RE.sub('', text.lower())
  text = WHITESPACE_RE.sub(' ', text)
  return text.strip(' -')


foreign_lexicon = {} # Maps each language code to the list of its cleaned fragments.
print('Processing', dir_in)

for base, directorios, ficheros in os.walk(dir_in):
  for fichero in ficheros:
    if not fichero.startswith(FILE_PREFIX):
      continue
    # os.walk yields sub-directory paths without a trailing separator, so the
    # path has to be joined rather than concatenated.
    ficheroEntrada = os.path.join(base, fichero)
    with open(ficheroEntrada, 'r', encoding='utf-8') as tei: #Opens the file
      soup = BeautifulSoup(tei, 'xml') #Parse the XML
    # find_all returns an empty list when the file has no <foreign> tag,
    # so no separate existence check is needed.
    for item in soup.find_all('foreign'): #Extract all tags "foreign"
      lang = item.get("xml:lang", UNKNOWN_LANG) # Name of the language (fallback if the attribute is missing)
      spam = clean_fragment(item.text)
      if spam: # Fragments that are empty after cleaning carry no words.
        foreign_lexicon.setdefault(lang, []).append(spam)

for lang, fragments in foreign_lexicon.items():
  print(lang, fragments) # Show languages and words


## Word list

In [None]:
print("Generating frequent list")

freqlist = [] # Not used by this cell; kept in case another cell references it.
NumberOfWords = {} # Maps each language code to its total number of words.

for lang, fragments in foreign_lexicon.items():
  # An extremely simple tokenization. split() without an argument splits on any
  # run of whitespace and never returns empty strings, so empty fragments and
  # repeated/trailing spaces are not counted as words.
  tokens = [token for fragment in fragments for token in fragment.split()]
  NumberOfWords[lang] = len(tokens)
  # One word per line; "with" closes the file even if writing fails.
  with open(lang+'_WordList.txt', 'w', encoding='utf-8') as out:
    out.write(''.join(token + '\n' for token in tokens))
print('Done!')

print('Results. Number of words for each language:')
results = 'Number of words for each language:\n'
for lang, count in NumberOfWords.items():
  print(lang, count)
  results += lang+':\t'+str(count)+'\n'

# Summary file: one "language<TAB>count" line per language.
with open('results.txt', 'w', encoding='utf-8') as out_results:
  out_results.write(results)

### Plotting

In [None]:
import matplotlib.pyplot as plt

# Bar chart with one bar per language; the bar height is the number of words
# counted for that language in NumberOfWords.
languages = NumberOfWords.keys()
word_counts = NumberOfWords.values()

plt.bar(languages, word_counts)
plt.title('Number of word for each language')
plt.xlabel('Language')
plt.show()

## Show words as WordCloud

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

#A custom stopwords list:
stopwords = ['por', 'ch', 'ei', 'que','ei', 'io', 'se', 'de', 'est', 'ad', 'et', 'non', 'hoc', 'ex', 'le', 'la', 'qui', 'il', 'di', 'per', 'che', 'les', 'des', 'si', 'un']
stopword_set = set(stopwords) # Set for fast membership tests; duplicates collapse.

for lang, fragments in foreign_lexicon.items():
  # split() without an argument ignores repeated/trailing spaces, so no empty
  # tokens end up in the text.
  kept_words = [w for fragment in fragments for w in fragment.split() if w not in stopword_set]
  text = ' '.join(kept_words) #All words in a string

  try:
    wordcloud = WordCloud().generate(text) #Creates the wordcloud, one for each language
  except ValueError:
    # WordCloud raises ValueError when it is left with no word to draw (e.g.
    # every word of this language was filtered out). Skip this language
    # instead of aborting the clouds of the remaining ones.
    print(lang, '(no words left to plot)')
    continue

  # Display the generated image:
  plt.imshow(wordcloud, interpolation='bilinear')
  plt.axis("off")
  print(lang) #Show the language
  plt.show() #Show the cloud

