In [2]:
# %pip install contractions
# %pip install textblob
# %pip install nltk
# %pip install scikit-learn
# %pip install transformers
# %pip install emoji

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from transformers import AutoTokenizer
import re
from sklearn.feature_extraction import text
import string
import contractions
import emoji
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

Loading the data:

In [4]:
# Read the raw export, keep the identifier/title/body columns, and expose the
# post body under the name "text" that the rest of the notebook uses.
csv = pd.read_csv("combined_data.csv")
data = csv[["post_id", "comment_id", "title", "body"]].rename(columns={"body": "text"})
data.head()

  csv = pd.read_csv("combined_data.csv")


Unnamed: 0,post_id,comment_id,title,text
0,1hr4hc6,,"Weekly Self-Promotional Mega Thread 49, 01.01....",All the self-promotional posts about your AI p...
1,1ggixzy,,"AMA with OpenAI’s Sam Altman, Kevin Weil, Srin...",Consider this AMA our Reddit launch.\n\nAsk us...
2,1id5l47,,OpenAI Pleads That It Can’t Make Money Without...,"# ""It would be impossible to train today’s lea..."
3,1icyjx6,,Remember When OpenAI Threatened Your Job? A Fr...,
4,1icvvjq,,"""My AI just absolutely roasted me and I'm ques...",


Removing rows with missing text:

In [5]:
# Drop every row whose "text" value is missing (NaN) and renumber the index
# from 0 so positional and label-based lookups agree again.
data = data.dropna(subset=["text"]).reset_index(drop=True)
data.head()

Unnamed: 0,post_id,comment_id,title,text
0,1hr4hc6,,"Weekly Self-Promotional Mega Thread 49, 01.01....",All the self-promotional posts about your AI p...
1,1ggixzy,,"AMA with OpenAI’s Sam Altman, Kevin Weil, Srin...",Consider this AMA our Reddit launch.\n\nAsk us...
2,1id5l47,,OpenAI Pleads That It Can’t Make Money Without...,"# ""It would be impossible to train today’s lea..."
3,1iclecj,,Already DeepSick of us.,Why are we like this.
4,1idb44a,,I used ChatGPT to beat addiction,Today I'm proud to say that I am 10 days free ...


Converting texts to lowercase:

In [6]:
# Lowercase
def lower(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

# Seed the working column with a lowercased copy of the raw text; every later
# cleaning step rewrites this column in place.
data["Cleaned Text"] = data["text"].map(lower)
data["Cleaned Text"].head()

0    all the self-promotional posts about your ai p...
1    consider this ama our reddit launch.\n\nask us...
2    # "it would be impossible to train today’s lea...
3                               why are we like this. 
4    today i'm proud to say that i am 10 days free ...
Name: Cleaned Text, dtype: object

Removing subreddit prefixes (r/), user mentions (u/...), blank-line breaks (\n\n), and links from the text:

In [7]:
# Remove unnecessary tokens
def remove_links(text):
    """Strip URLs from ``text``.

    Anything that starts with ``http://``, ``https://`` or ``www.`` is removed
    up to (not including) the next whitespace character. Whitespace around the
    removed URL is left untouched.
    """
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    return url_pattern.sub('', text)

def remove_user_mentions(text):
    """Remove Reddit user mentions of the form ``u/<name>`` from ``text``.

    A mention is only recognised when the ``u`` is not preceded by a word
    character, so ordinary words that merely contain ``u/`` (``you/me``,
    ``menu/options``) are left intact. The mention extends to the next
    whitespace character; surrounding whitespace is not collapsed.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every ``u/<name>`` mention deleted.
    """
    # The negative lookbehind stops the pattern from firing in the middle of a
    # word, which would otherwise truncate it ("you/me" -> "yo").
    return re.sub(r'(?<!\w)u/\S+', '', text)

# Strip the subreddit prefix ("r/openai" -> "openai"). The lookbehind restricts
# the match to an "r/" that starts a token; a plain literal replace would also
# eat the end of ordinary words ("or/and" -> "oand", "her/his" -> "hehis").
data["Cleaned Text"] = data["Cleaned Text"].str.replace(r'(?<!\w)r/', '', regex=True)
# Turn blank-line paragraph breaks into a single space.
data["Cleaned Text"] = data["Cleaned Text"].str.replace("\n\n", ' ', regex=False)
# Links are removed before user mentions so that a "u/..." segment inside a URL
# disappears together with the URL.
data["Cleaned Text"] = data["Cleaned Text"].apply(remove_links)
data["Cleaned Text"] = data["Cleaned Text"].apply(remove_user_mentions)

data["Cleaned Text"].head()

0    all the self-promotional posts about your ai p...
1    consider this ama our reddit launch. ask us an...
2    # "it would be impossible to train today’s lea...
3                               why are we like this. 
4    today i'm proud to say that i am 10 days free ...
Name: Cleaned Text, dtype: object

Fixing spelling errors:

In [None]:
# Fix spelling
def correct_spelling(text):
    """Return ``text`` after passing it through TextBlob's ``correct()``."""
    blob = TextBlob(text)
    corrected = blob.correct()
    return str(corrected)

# Run the spelling corrector over every cleaned document.
data["Cleaned Text"] = data["Cleaned Text"].map(correct_spelling)
data["Cleaned Text"].head()

Expanding contractions:

In [None]:
# Expand contractions
def expand_contractions(text):
    """Return ``text`` with contractions expanded by ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# Expand contractions in every cleaned document.
data["Cleaned Text"] = data["Cleaned Text"].map(expand_contractions)
data["Cleaned Text"].head()

Removing stop words:

In [None]:
# Remove stopwords: split each document on whitespace, drop every token whose
# lowercase form is in scikit-learn's English stop-word list, and re-join the
# survivors with single spaces.
stopwords = text.ENGLISH_STOP_WORDS

data["Cleaned Text"] = data["Cleaned Text"].apply(
    lambda doc: ' '.join(
        filter(lambda word: word.lower() not in stopwords, doc.split())
    )
)
data["Cleaned Text"].head()

Removing punctuation:

In [None]:
# Remove punctuations
def remove_punctuations(text):
    """Insert a space wherever punctuation joins two non-space characters.

    Hyphens are always turned into spaces. Any other run of ASCII punctuation
    that has a non-whitespace character on both sides is replaced by a single
    space, so that removing the remaining punctuation afterwards does not glue
    neighbouring words together. Punctuation at the edge of a word (followed
    or preceded by whitespace, or at either end of the string) is left as is.

    Args:
        text: The string to process.

    Returns:
        ``text`` with word-internal punctuation replaced by spaces.
    """
    # Hyphens become spaces unconditionally, even at word edges.
    text = text.replace('-', ' ')
    punct_class = '[' + re.escape(string.punctuation) + ']'
    # Lookarounds test the neighbours without consuming them. A capturing
    # pattern swallows the right-hand character, so in "a.b.c" the second
    # separator could never match and the result was "a b.c".
    return re.sub(r'(?<=\S)' + punct_class + r'+(?=\S)', ' ', text)

# First separate words that are joined only by punctuation.
data["Cleaned Text"] = data["Cleaned Text"].apply(remove_punctuations)

# Character class matching any single ASCII punctuation character. The set is
# escaped so that every member is literal: unescaped, the "\" contained in
# string.punctuation merely escapes the "]" that follows it, and backslashes
# themselves are never matched.
pattern_punctuations = r'[' + re.escape(string.punctuation) + r']'

# Delete whatever punctuation is left.
data["Cleaned Text"] = data["Cleaned Text"].str.replace(pattern_punctuations, '', regex=True)

data["Cleaned Text"].head()

In [None]:
def convert_emojis(text):
    """Return ``text`` with its emojis rewritten by ``emoji.demojize``."""
    described = emoji.demojize(text)
    return described

# Run the emoji conversion over every cleaned document.
data["Cleaned Text"] = data["Cleaned Text"].map(convert_emojis)
data["Cleaned Text"].head()

### TF-IDF Vectorizer 

In [None]:
# Load the pretrained 'roberta-base' tokenizer via Hugging Face transformers.
# NOTE(review): `tokenizer` is not referenced by any cell visible in this
# notebook (the vectorizers below use `byte_level_tokenizer`) - confirm whether
# this load is still needed.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

def byte_level_tokenizer(text):
    """Split ``text`` into string tokens covering at most two UTF-8 bytes each.

    The text is encoded as UTF-8 and cut into consecutive 2-byte slices (the
    last slice holds a single byte when the length is odd). Each slice is
    returned as the ``str()`` of the ``bytes`` object, e.g. ``"b'ab'"``.
    """
    raw = text.encode('utf-8')
    tokens = []
    for start in range(0, len(raw), 2):
        tokens.append(str(raw[start:start + 2]))
    return tokens

# Term-frequency vectorizer over the byte-pair tokens. With use_idf=False no
# inverse-document-frequency weighting is applied.
vectorizer = TfidfVectorizer(
    tokenizer=byte_level_tokenizer,
    # A custom tokenizer makes the default token_pattern unused; passing None
    # states that explicitly and avoids scikit-learn's warning about it.
    token_pattern=None,
    use_idf=False
)

doc_vectors = vectorizer.fit_transform(data["Cleaned Text"])

feature_names = vectorizer.get_feature_names_out()
# Build the table straight from the dense array. Going through
# .todense().tolist() first materialises a Python list-of-lists of every cell.
tfidf_data = pd.DataFrame(doc_vectors.toarray(), columns=feature_names)
# Preview only; the full documents-by-vocabulary table is too large to display.
tfidf_data.head()

In [None]:
# Free-text description of the topic to rank documents against. Kept as a
# one-element list because the vectorizer expects an iterable of documents.
query = ["Discussions about ChatGPT, its performance, user experiences, applications, limitations, ethical concerns, and comparisons with other AI models developed by OpenAI."]

idf_vectorizer = TfidfVectorizer(
    tokenizer=byte_level_tokenizer,
    # The custom tokenizer makes token_pattern unused; None silences the warning.
    token_pattern=None,
    use_idf=True,
    # turn off sublinear_tf to get the same results as the previous implementation
    sublinear_tf=False
)

# Weight the documents with the same IDF-fitted vectorizer that encodes the
# query, so both sides of the cosine similarity live in the same weighted space.
doc_vectors_idf = idf_vectorizer.fit_transform(data["Cleaned Text"])
# `query` is already a list of documents; wrapping it in another list would
# hand the vectorizer a list where it expects a string.
query_vector = idf_vectorizer.transform(query)

# One query row, so take row 0 of the (1, n_documents) similarity matrix.
similarity_scores = cosine_similarity(query_vector, doc_vectors_idf)[0]

data["similarity"] = similarity_scores

sorted_data = data.sort_values(by="similarity", ascending=False)

sorted_data.to_csv("similarity_scores.csv", index=False)

print("Data saved to 'similarity_scores.csv'")