In [None]:
# Install the third-party packages used by this notebook; the %pip magic
# targets the running kernel's environment.
# NOTE(review): pandas is imported below but not installed here — presumably
# it is preinstalled in the target environment; confirm. Versions are unpinned.
%pip install contractions
%pip install textblob
%pip install nltk
%pip install scikit-learn
%pip install transformers

In [29]:
import pandas as pd
from transformers import AutoTokenizer
import re
from sklearn.feature_extraction import text
import string
import contractions
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Loading the data:

In [None]:
# Read the combined export, keep the identifier/title/body columns, and expose
# the "body" column under the name "text" used by the rest of the notebook.
csv = pd.read_csv("combined_data.csv")
data = csv.loc[:, ["post_id", "comment_id", "title", "body"]].rename(columns={"body": "text"})
data.head()

Removing rows with missing text:

In [None]:
# Keep only the rows whose "text" value is present, then renumber the index
# from zero so positions and labels agree again.
data = data.loc[data["text"].notna()].reset_index(drop=True)
data.head()

Converting texts to lowercase:

In [None]:
def lower(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

# Start the working "Cleaned Text" column from the lowercased raw text.
data["Cleaned Text"] = data["text"].map(lower)
data["Cleaned Text"].head()

Removing subreddit prefixes (r/), user mentions (u/...), paragraph breaks (blank lines), and links from the texts:

In [None]:
def remove_links(text):
    """Return ``text`` with http(s):// URLs and ``www.`` addresses deleted.

    Each link is removed up to the next whitespace character; the surrounding
    whitespace is left untouched.
    """
    link_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    return link_pattern.sub('', text)

def remove_user_mentions(text):
    """Return ``text`` with Reddit user mentions (``u/name`` or ``/u/name``) removed.

    A mention is only recognised when it is not preceded by a word character,
    so ordinary words that happen to contain ``u/`` (e.g. "you/me") are left
    intact instead of being truncated. As before, the mention is removed up to
    the next whitespace character.
    """
    # (?<!\w) anchors the match to the start of a token; /? also swallows the
    # leading slash of the "/u/name" spelling.
    return re.sub(r'(?<!\w)/?u/\S+', '', text)

# Drop the subreddit prefix ("r/" or "/r/") only where it starts a token.
# A plain substring replacement of 'r/' would also eat the "r/" inside ordinary
# words such as "or/and" (-> "oand").
data["Cleaned Text"] = data["Cleaned Text"].str.replace(r'(?<!\w)/?r/', '', regex=True)
# Paragraph breaks (two consecutive newlines) become a single space.
data["Cleaned Text"] = data["Cleaned Text"].str.replace("\n\n", ' ', regex=False)
data["Cleaned Text"] = data["Cleaned Text"].apply(remove_links)
data["Cleaned Text"] = data["Cleaned Text"].apply(remove_user_mentions)

data["Cleaned Text"].head()

Fixing spelling errors:

In [None]:
def correct_spelling(text):
    """Return the string form of ``TextBlob(text).correct()``."""
    corrected_blob = TextBlob(text).correct()
    return str(corrected_blob)

# Run the spelling correction over every cleaned text, one row at a time.
data["Cleaned Text"] = data["Cleaned Text"].map(correct_spelling)
data["Cleaned Text"].head()

Expanding contractions:

In [None]:
def expand_contractions(text):
    """Return the result of ``contractions.fix`` applied to ``text``."""
    expanded = contractions.fix(text)
    return expanded

# Expand contractions in every cleaned text, one row at a time.
data["Cleaned Text"] = data["Cleaned Text"].map(expand_contractions)
data["Cleaned Text"].head()

Removing stop words:

In [None]:
# scikit-learn's built-in English stop-word set.
stopwords = text.ENGLISH_STOP_WORDS

# Punctuation is only stripped in a later cell, so a token such as "however,"
# or "the." would not match the stop-word set as-is. Compare each token with
# its surrounding punctuation trimmed (and lowercased) so those are filtered
# too. The lambda parameter is named `doc` to avoid shadowing the imported
# `text` module.
data["Cleaned Text"] = data["Cleaned Text"].apply(
    lambda doc: ' '.join(
        word for word in doc.split()
        if word.strip(string.punctuation).lower() not in stopwords
    )
)
data["Cleaned Text"].head()

Removing punctuation:

In [None]:
def remove_punctuations(text):
    """Replace word-joining punctuation in ``text`` with spaces.

    Every hyphen becomes a space, and every punctuation character that sits
    directly between two non-whitespace characters becomes a space, so that
    "a.b.c" yields "a b c" rather than gluing the parts together once the
    remaining punctuation is stripped. Punctuation next to whitespace or at
    the string edges is left in place for the caller to remove.
    """
    text = re.sub(r'[-]', ' ', text)
    # Lookarounds leave the neighbouring characters unconsumed, so consecutive
    # joiners ("a.b.c", "1,000,000") are all replaced. Capturing the neighbours
    # instead would skip every second joiner.
    text = re.sub(r'(?<=\S)[' + re.escape(string.punctuation) + r'](?=\S)', ' ', text)
    return text

data["Cleaned Text"] = data["Cleaned Text"].apply(remove_punctuations)

# Escape the punctuation before placing it in a character class: unescaped,
# the "\]" inside string.punctuation is read as an escaped "]", which leaves
# the backslash itself out of the class so backslashes are never removed.
pattern_punctuations = r'[' + re.escape(string.punctuation) + r']'

data["Cleaned Text"] = data["Cleaned Text"].str.replace(pattern_punctuations, '', regex=True)

data["Cleaned Text"].head()

### TF-IDF Vectorizer 

In [None]:
# Load the pretrained 'roberta-base' tokenizer through AutoTokenizer.
# NOTE(review): `tokenizer` is not referenced again in the visible part of this
# notebook — the vectorizers below use `byte_level_tokenizer` instead; confirm
# whether this load is still needed.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

def byte_level_tokenizer(text):
    """Split ``text`` into tokens made of consecutive two-byte UTF-8 slices.

    The text is encoded as UTF-8 and cut into slices of two bytes (the final
    slice holds a single byte when the length is odd). Each slice is returned
    as its ``str()`` form, e.g. ``"b'ab'"``.
    """
    raw = text.encode('utf-8')
    tokens = []
    for start in range(0, len(raw), 2):
        tokens.append(str(raw[start:start + 2]))
    return tokens

# Vectorize the cleaned texts over the two-byte tokens produced by
# byte_level_tokenizer, with IDF weighting switched off.
vectorizer = TfidfVectorizer(use_idf=False, tokenizer=byte_level_tokenizer)
doc_vectors = vectorizer.fit_transform(data["Cleaned Text"])

# Expand the document-term matrix into a DataFrame with one column per token
# so the weights can be inspected.
feature_names = vectorizer.get_feature_names_out()
dense_vec = doc_vectors.todense()
dense_list = dense_vec.tolist()
tfidf_data = pd.DataFrame(data=dense_list, columns=feature_names)
tfidf_data

In [None]:
# Free-text description of the topic to rank the documents against.
# NOTE(review): the query is not passed through the cleaning steps applied to
# the documents (stop-word and punctuation removal, etc.) — consider whether
# it should be, since the two-byte tokens are sensitive to exact characters.
query = ["Discussions about ChatGPT, its performance, user experiences, applications, limitations, ethical concerns, and comparisons with other AI models developed by OpenAI."]

idf_vectorizer = TfidfVectorizer(
    tokenizer=byte_level_tokenizer,
    use_idf=True
)

# Vectorize the documents with the SAME IDF-weighted vectorizer as the query.
# Comparing an IDF-weighted query against term-frequency-only document vectors
# mixes two different weightings.
idf_doc_vectors = idf_vectorizer.fit_transform(data["Cleaned Text"])

# `query` is already a list of documents; wrapping it in another list would
# hand the vectorizer a nested list, which fails during preprocessing.
query_vector = idf_vectorizer.transform(query)

# One row per query -> take row 0 to get one score per document.
similarity_scores = cosine_similarity(query_vector, idf_doc_vectors)[0]

data["similarity"] = similarity_scores

# Most similar documents first.
sorted_data = data.sort_values(by="similarity", ascending=False)

sorted_data.to_csv("similarity_scores.csv", index=False)

print("Data saved to 'similarity_scores.csv'")