# LaTeX to text

In [1]:
# Make Google Drive visible inside the Colab runtime; the LaTeX source is read from it below.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


### Preprocessing LaTeX code

In [2]:
import re

In [3]:
# Location of the LaTeX source on the mounted Google Drive.
latex_path = "/content/drive/MyDrive/SPIDER/docs/Paper.tex"

# Decode explicitly as UTF-8 rather than relying on the platform's default
# encoding, so the same text is read regardless of where the notebook runs.
with open(latex_path, 'r', encoding='utf-8') as file:
    content = file.read()

In [4]:
# Strip LaTeX comments: an unescaped % and everything after it on the same line.
# The negative lookbehind keeps escaped percent signs such as "50\%".
content = re.sub(r'(?<!\\)%.*', '', content)

# Drop whole environments whose bodies are not running prose (tables, figures,
# equations, the ACM CCSXML block). The optional "*" is captured and must
# reappear in the matching \end, so both the plain and the starred variant of
# every listed environment (table*, figure*, equation*, ...) are removed.
for env_name in ('table', 'figure', 'equation', 'CCSXML'):
    env_pattern = r'\\begin\{' + env_name + r'(\*?)\}.*?\\end\{' + env_name + r'\1\}'
    # DOTALL lets "." cross line breaks, since environments span several lines.
    content = re.sub(env_pattern, '', content, flags=re.DOTALL)

# Remove references and citations that are attached with a non-breaking space:
# ~\ref{...} & ~\cite{...}
content = re.sub(r'~\\ref{.*?}', '', content)
content = re.sub(r'~\\cite{.*?}', '', content)

### Converting LaTeX to text

In [5]:
!pip install pylatexenc

Collecting pylatexenc
  Downloading pylatexenc-2.10.tar.gz (162 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pylatexenc
  Building wheel for pylatexenc (setup.py) ... [?25l[?25hdone
  Created wheel for pylatexenc: filename=pylatexenc-2.10-py3-none-any.whl size=136816 sha256=95966b8065edcb715891b45e509db807b03c82b85da7fac84622d81682dee868
  Stored in directory: /root/.cache/pip/wheels/d3/31/8b/e09b0386afd80cfc556c00408c9aeea5c35c4d484a9c762fd5
Successfully built pylatexenc
Installing collected packages: pylatexenc
Successfully installed pylatexenc-2.10


In [6]:
from pylatexenc.latex2text import LatexNodes2Text

In [7]:
# Render the pre-cleaned LaTeX source as plain text.
converter = LatexNodes2Text()
content = converter.latex_to_text(content)

# Collapse any whitespace run that contains two or more newlines into a single
# newline (this also swallows the indentation that follows such a run).
blank_run = r'(\n\s*){2,}'
content = re.sub(blank_run, r'\n', content)

print(content)


copyrightspace
arpe@itu.dk
IT University of Copenhagen
  Rued Langgaards Vej 7
  Copenhagen
  Denmark
  2300
luai@itu.dk
IT University of Copenhagen
  Rued Langgaards Vej 7
  Copenhagen
  Denmark
  2300
Public discourse on critical issues such as climate change is progressively shifting to social media platforms that prioritize short-form video content. To improve our understanding of this transition, we studied the video content produced by 21 prominent YouTube creators who have expanded their influence to TikTok as information disseminators. Using dictionary-based tools and BERT-based embeddings, we analyzed the transcripts of nearly 7k climate-related videos across both platforms and the 574k comments they received. We found that, when using TikTok, creators use a more emotionally resonant, self-referential, and action-oriented language compared to YouTube. We also observed a strong semantic alignment between videos and comments, with creators who excel at diversifying their TikTok

In [8]:
# Output file for the plain-text version of the paper (relative to the working directory).
text_path = 'modified_file.txt'

with open(text_path, mode='w') as text_file:
    text_file.write(content)

# Word Frequency Estimation

### (a) Word Frequency Estimation using Counter class from collections library

In [9]:
import nltk

# word_tokenize needs the Punkt sentence-tokenizer data. Newer NLTK releases
# (3.9+) look for the 'punkt_tab' package instead of the pickled 'punkt' one,
# so both are fetched to keep the notebook runnable with either generation.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
from nltk.tokenize import word_tokenize as wt
from collections import Counter
import string
import pandas as pd

In [11]:
def calc_word_freq(text):
    """Count token occurrences in ``text``.

    Every character listed in ``string.punctuation`` is deleted first
    (: . , ! ? ( ) - ...), the remaining text is tokenized with ``wt`` and the
    tokens are tallied.

    Args:
        text: The string to analyse.

    Returns:
        A ``Counter`` mapping each token to the number of times it occurs.
    """
    # Translation table that maps every punctuation code point to None (= delete).
    punctuation_to_none = dict.fromkeys(map(ord, string.punctuation))
    cleaned = text.translate(punctuation_to_none)

    return Counter(wt(cleaned))

In [12]:
# calc_word_freq returns a Counter; it is stored as a plain dict here
# (it could equally be converted to other formats, e.g. a list of pairs).
freqs_1 = dict(calc_word_freq(content))

# len() of the mapping is the number of distinct words (vocabulary size),
# not the number of word occurrences, so it is labelled accordingly.
print(f"Unique words: {len(freqs_1)}\n")

freqs_1

Total words: 955



{'copyrightspace': 1,
 'arpeitudk': 1,
 'IT': 2,
 'University': 2,
 'of': 100,
 'Copenhagen': 4,
 'Rued': 2,
 'Langgaards': 2,
 'Vej': 2,
 '7': 2,
 'Denmark': 2,
 '2300': 2,
 'luaiitudk': 1,
 'Public': 1,
 'discourse': 10,
 'on': 55,
 'critical': 3,
 'issues': 1,
 'such': 6,
 'as': 17,
 'climate': 27,
 'change': 20,
 'is': 11,
 'progressively': 1,
 'shifting': 1,
 'to': 72,
 'social': 4,
 'media': 2,
 'platforms': 29,
 'that': 21,
 'prioritize': 1,
 'shortform': 1,
 'video': 27,
 'content': 47,
 'To': 11,
 'improve': 2,
 'our': 9,
 'understanding': 3,
 'this': 10,
 'transition': 1,
 'we': 30,
 'studied': 2,
 'the': 150,
 'produced': 3,
 'by': 11,
 '21': 4,
 'prominent': 2,
 'YouTube': 17,
 'creators': 30,
 'who': 6,
 'have': 7,
 'expanded': 1,
 'their': 22,
 'influence': 2,
 'TikTok': 19,
 'information': 3,
 'disseminators': 2,
 'Using': 1,
 'dictionarybased': 1,
 'tools': 3,
 'and': 100,
 'BERTbased': 1,
 'embeddings': 5,
 'analyzed': 2,
 'transcripts': 9,
 'nearly': 1,
 '7k': 1,
 'cl

### (b) Word Frequency Estimation from scratch

In [13]:
def count_frequency(path):
    """Count how often each token occurs in the text file at ``path``.

    The file is processed one line at a time: every character listed in
    ``string.punctuation`` is deleted (: . , ! ? ( ) - ...), the remainder is
    tokenized with ``wt`` and each token is tallied by hand in a plain dict.

    Args:
        path: Path of the text file to read.

    Returns:
        A dict mapping each token to its number of occurrences; keys appear in
        the order in which the tokens were first seen.
    """
    # Build the punctuation-deleting table once rather than once per line.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    dictionary = {}

    with open(path, "r") as file:
        # Iterating over the handle yields the file line by line until EOF.
        for line in file:
            for token in wt(line.translate(strip_punctuation)):
                # Missing tokens start from 0, so first sight stores 1.
                dictionary[token] = dictionary.get(token, 0) + 1

    return dictionary

In [14]:
# Same count as above, but computed by the hand-written counter from the saved text file.
freqs_2 = count_frequency(text_path)

# len() of the mapping is the number of distinct words (vocabulary size),
# not the number of word occurrences, so it is labelled accordingly.
print(f"Unique words: {len(freqs_2)}\n")

freqs_2

Total words: 955



{'copyrightspace': 1,
 'arpeitudk': 1,
 'IT': 2,
 'University': 2,
 'of': 100,
 'Copenhagen': 4,
 'Rued': 2,
 'Langgaards': 2,
 'Vej': 2,
 '7': 2,
 'Denmark': 2,
 '2300': 2,
 'luaiitudk': 1,
 'Public': 1,
 'discourse': 10,
 'on': 55,
 'critical': 3,
 'issues': 1,
 'such': 6,
 'as': 17,
 'climate': 27,
 'change': 20,
 'is': 11,
 'progressively': 1,
 'shifting': 1,
 'to': 72,
 'social': 4,
 'media': 2,
 'platforms': 29,
 'that': 21,
 'prioritize': 1,
 'shortform': 1,
 'video': 27,
 'content': 47,
 'To': 11,
 'improve': 2,
 'our': 9,
 'understanding': 3,
 'this': 10,
 'transition': 1,
 'we': 30,
 'studied': 2,
 'the': 150,
 'produced': 3,
 'by': 11,
 '21': 4,
 'prominent': 2,
 'YouTube': 17,
 'creators': 30,
 'who': 6,
 'have': 7,
 'expanded': 1,
 'their': 22,
 'influence': 2,
 'TikTok': 19,
 'information': 3,
 'disseminators': 2,
 'Using': 1,
 'dictionarybased': 1,
 'tools': 3,
 'and': 100,
 'BERTbased': 1,
 'embeddings': 5,
 'analyzed': 2,
 'transcripts': 9,
 'nearly': 1,
 '7k': 1,
 'cl

In [15]:
len(freqs_1), len(freqs_2) # compares only the number of distinct tokens found by each approach; the two should match

(955, 955)

# Exporting data to csv file

In [16]:
# Tabulate the Counter-based frequencies: one (word, count) row per dictionary
# entry, in the dictionary's iteration order.
word_count_pairs = list(freqs_1.items())
df = pd.DataFrame(data=word_count_pairs, columns=['Word', 'Frequency'])

df

Unnamed: 0,Word,Frequency
0,copyrightspace,1
1,arpeitudk,1
2,IT,2
3,University,2
4,of,100
...,...,...
950,performs,1
951,crossplatform,1
952,delves,1
953,relationship,1


In [17]:
# Save the word/frequency table as CSV; index=False leaves out the row-number column.
csv_path = "EngFreqs.csv"

df.to_csv(path_or_buf=csv_path, index=False)

# Links

1. LaTeX file: https://drive.google.com/file/d/1Cqb1hEsCSlwjxhlp_g7xBI0dCmOWHn5Q/view?usp=sharing

(The LaTeX file is taken from the paper: https://arxiv.org/abs/2312.04974)