In [1]:
import pandas as pd
import re
# import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# python -m spacy download en # in terminal

In [3]:
df = pd.read_csv('stackexchange_812k.csv')

In [4]:
df.columns

Index(['post_id', 'parent_id', 'comment_id', 'text', 'category'], dtype='object')

In [5]:
df.sample(3)

Unnamed: 0,post_id,parent_id,comment_id,text,category
494679,381701,,717918.0,@NickCox Thank you for this clarification and ...,comment
25745,408432,,,dirichlet distribution and excessively large n...,title
683315,328876,,623192.0,"Sorry, I tried to copy my output from SAS usin...",comment


In [6]:
df[df["text"].str.len() > 2000]["text"].shape

(28171,)

* post_id: A post can be a question or an answer to the original question. Original questions have a title. 
* parent_id: Posts with a parent_id are answers to the original post.
* comment_id: Comments
* text: A title, post, or comment
* category:
* * title: Titles are usually one sentence long and may be similar to the following examples:
        Question on how to construct a confidence interval
        Choosing model from Walk-Forward CV for Time Series
* * post: HTML-formatted paragraphs that contain the following:
        HTML tags
        URLs
        math equations in LaTeX
        sometimes large tables of numbers
* comment: Comments are shorter than posts. They are simple text paragraphs that can also contain URLs, math equations, and numbers but no HTML tags.

In [7]:
# Pattern to remove
regexp = re.compile(    r"<.*?>|"     # HTML tags
    r"\$.*?\$|"   # LaTeX expressions
    r"http[s]?://\S+|"  # URLs
    r"\d+|"       # Digits
    r"\n|"        # Line returns
    r"[^\w\s.,!?:]"    # Non-word characters
    )

df['text'] = df['text'].apply(lambda text: regexp.sub("", text))

In [8]:
# Check if there is empty text
(df['text'] == "").any()

True

In [9]:
# Remove empty texts
count = df.shape[0]
df = df.drop(df[df["text"]==""].index)
print(f"{count - df.shape[0]} col dropped")

1114 col dropped


In [10]:
# Remove texts that are extremely large or too short
min_text = 255 # char
max_text = 2000 # char

count = df.shape[0]
df = df.drop(df[(df['text'].str.len() < min_text)|(df['text'].str.len() > max_text)].index)
print(f"{count - df.shape[0]} col dropped")

499933 col dropped


In [11]:
test_text = df['text'].sample().values[0]
test_text

'. I do not know much about forecasting and especially not about forecasting with Gaussian Processes. But, I imagine that there will be certain special kernels involved that allow you to project the trend outside the domain of the sample like in the link of your questions the periodic kernel is used to project the period to outside the range. If you construct some confidencecredible interval then wouldnt this interval increase in size for predictions further away from the measurement points?'

In [12]:
# Use a tokenizer to create a version of the original text that is a string of space-separated lowercase tokens
import spacy

# Load the English model
nlp = spacy.load('en_core_web_sm')

# Process the text
doc = nlp(test_text)

# Tokenize and convert to lowercase
tokens = [token.text.lower() for token in doc]

# Join the tokens back into a string
processed_text = ' '.join(tokens)


In [13]:
from tqdm import tqdm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  # Disable unnecessary components
def process_texts(texts):
    docs = nlp.pipe(texts)
    return [[token.text.lower() for token in doc] for doc in docs].join(" ")

df['tokens'] = process_texts(tqdm(df['text']))

 17%|█▋        | 53248/311085 [04:46<23:03, 186.31it/s]

In [45]:
df.sample(3)

Unnamed: 0,post_id,parent_id,comment_id,text,category,tokens
115024,46129,40641.0,,I think the measure you are looking for is cal...,post,"[i, think, the, measure, you, are, looking, fo..."
653696,124719,,237719.0,Correct Xs and Y. The response Y is whether t...,comment,"[correct, , xs, and, y., the, response, y, is..."
146972,416674,,,It is relatively straightforward when I want t...,post,"[it, is, relatively, straightforward, when, i,..."


In [46]:
df.to_csv("output.csv")

In [None]:
df.