In [None]:
%pip install contractions
%pip install textblob
%pip install nltk
%pip install scikit-learn
%pip install transformers
%pip install emoji
%pip install spacy
!python -m spacy download en_core_web_sm

In [2]:
import pandas as pd
import re
from sklearn.feature_extraction import text
import string
import contractions
import emoji
import spacy
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Loading the data:

In [None]:
# Read the combined dataset from the CSV in the working directory and preview it.
data = pd.read_csv(filepath_or_buffer="combined_data.csv")
data.head(n=5)

In [None]:
def _compose_text(row):
    """Build the text to analyse for one row.

    Rows that have a `comment_id` use their `body` unchanged (a missing body
    stays missing so the row can be dropped later). Rows without a
    `comment_id` use "title body", skipping whichever part is missing so the
    literal word "nan" never leaks into the text. Returns None when neither
    part is available.
    """
    if pd.notna(row["comment_id"]):
        return row["body"]
    parts = [str(row[field]) for field in ("title", "body") if pd.notna(row[field])]
    return " ".join(parts) if parts else None

data["text"] = data.apply(_compose_text, axis=1)
data["text"].head()

Removing rows with missing text:

In [None]:
# Discard rows whose "text" is missing, then renumber the index from 0.
data = data.dropna(subset=["text"]).reset_index(drop=True)
data.head()

Converting texts to lowercase:

In [None]:
def lower(text):
    """Return `text` converted to lowercase."""
    lowered = text.lower()
    return lowered

# Start the "Cleaned Text" column from the lowercased raw text.
data["Cleaned Text"] = data["text"].map(lower)
data["Cleaned Text"].head()

Removing r/, usernames, new line indicators, and links from texts:

In [None]:
def remove_links(text):
    """Delete http(s):// URLs and www.-prefixed addresses from `text`."""
    link_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    return link_pattern.sub('', text)

def remove_user_mentions(text):
    """Delete Reddit user mentions of the form ``u/name`` from `text`.

    The mention must start at a word boundary, so a "u/" that merely occurs
    inside a longer word (for example "you/they" or "menu/options") is left
    untouched.
    """
    return re.sub(r'\bu/\S+', '', text)

# Strip the "r/" subreddit prefix only where it starts a token; a plain
# substring replace would also eat the "r/" inside words like "for/against".
data["Cleaned Text"] = data["Cleaned Text"].str.replace(r'\br/', '', regex=True)
# Blank-line separators become a single space.
data["Cleaned Text"] = data["Cleaned Text"].str.replace("\n\n", ' ', regex=False)
data["Cleaned Text"] = data["Cleaned Text"].apply(remove_links)
data["Cleaned Text"] = data["Cleaned Text"].apply(remove_user_mentions)

data["Cleaned Text"].head()

Fixing spelling errors:

In [8]:
# # Fix spelling
# def correct_spelling(text):
#     return str(TextBlob(text).correct())

# data["Cleaned Text"] = data["Cleaned Text"].apply(correct_spelling)
# data["Cleaned Text"].head()

Expanding contractions:

In [None]:
def expand_contractions(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded = contractions.fix(text)
    return expanded

data["Cleaned Text"] = data["Cleaned Text"].map(expand_contractions)
data["Cleaned Text"].head()

Removing stop words:

In [None]:
# scikit-learn's built-in English stop-word set.
stopwords = text.ENGLISH_STOP_WORDS

# Split each text on whitespace, keep the tokens whose lowercase form is not
# a stop word, and join the survivors back with single spaces.
data["Cleaned Text"] = data["Cleaned Text"].map(
    lambda sentence: ' '.join(
        filter(lambda token: token.lower() not in stopwords, sentence.split())
    )
)
data["Cleaned Text"].head()

Removing punctuation:

In [None]:
def remove_punctuations(text):
    """Replace word-internal punctuation with spaces so the words stay separate.

    Hyphens always become a space. Any other punctuation character that sits
    directly between two non-whitespace characters also becomes a space.
    Lookarounds are used instead of consuming groups so that chained cases
    such as "a.b.c" are fully split ("a b c"); a consuming pattern would skip
    every second separator. Punctuation next to whitespace or at the ends of
    the string is left for the caller to strip.
    """
    text = text.replace('-', ' ')
    punct_class = '[' + re.escape(string.punctuation) + ']'
    return re.sub(r'(?<=\S)' + punct_class + r'(?=\S)', ' ', text)

data["Cleaned Text"] = data["Cleaned Text"].apply(remove_punctuations)

# Escape the punctuation before placing it in a character class: unescaped,
# the "\]" inside string.punctuation is read as an escaped bracket, so
# backslashes would never be matched.
pattern_punctuations = r'[' + re.escape(string.punctuation) + r']'

data["Cleaned Text"] = data["Cleaned Text"].str.replace(pattern_punctuations, '', regex=True)

data["Cleaned Text"].head()

Converting emojis to their descriptive names:

In [None]:
def convert_emojis(text):
    """Replace each emoji in `text` with its descriptive name.

    Spaces are used as delimiters instead of the default colons: this step
    runs after punctuation has been stripped, so colon delimiters would put
    punctuation back and glue the name to neighbouring words or emojis.
    """
    return emoji.demojize(text, delimiters=(" ", " "))

data["Cleaned Text"] = data["Cleaned Text"].apply(convert_emojis)
data["Cleaned Text"].head()

Removing numbers:

In [None]:
def remove_numbers(text):
    """Delete every run of ASCII digits from `text`."""
    digit_run = re.compile(r'[0-9]+')
    return digit_run.sub('', text)

# Strip digit runs from every cleaned text.
data["Cleaned Text"] = data["Cleaned Text"].map(remove_numbers)
data["Cleaned Text"].head()

Removing non-ASCII characters:

In [None]:
# Drop every run of characters outside the 7-bit ASCII range.
data["Cleaned Text"] = data["Cleaned Text"].str.replace(
    pat=r'[^\x00-\x7F]+',
    repl='',
    regex=True,
)
data["Cleaned Text"].head()

### TF-IDF Vectorizer 

In [None]:
nlp = spacy.load("en_core_web_sm")

def custom_tokenizer(text):
    """Tokenize `text` with spaCy for use by the vectorizers.

    Returns a list holding the text of every named entity found in the
    document (each entity as one token), followed by the lemma of every
    remaining token that is not part of an entity, not punctuation and not
    whitespace.
    """
    parsed = nlp(text)
    entity_tokens = [span.text for span in parsed.ents]

    lemma_tokens = []
    for tok in parsed:
        if tok.ent_type_ or tok.is_punct or tok.is_space:
            continue
        lemma_tokens.append(tok.lemma_)

    return entity_tokens + lemma_tokens

# Term-frequency vectorizer (IDF weighting switched off) built on the spaCy
# tokenizer, limited to 25000 features that occur in at least 5 documents.
vectorizer = TfidfVectorizer(
    min_df=5,
    max_features=25000,
    stop_words="english",
    use_idf=False,
    tokenizer=custom_tokenizer,
)

# Fit on the cleaned texts and keep the sparse matrix as float32.
doc_vectors = vectorizer.fit_transform(data["Cleaned Text"])
doc_vectors = doc_vectors.astype("float32")

# Sparse DataFrame view of the matrix, one column per vocabulary term.
feature_names = vectorizer.get_feature_names_out()
tfidf_data = pd.DataFrame.sparse.from_spmatrix(doc_vectors, columns=feature_names)
print(tfidf_data.head())

In [None]:
# The query is a one-element list: the vectorizer expects an iterable of documents.
query = ["Discussions about ChatGPT, its performance, user experiences, applications, limitations, ethical concerns, and comparisons with other AI models developed by OpenAI."]

# Same settings as the document vectorizer, but with IDF weighting enabled.
idf_vectorizer = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    use_idf=True,
    # turn off sublinear_tf to get the same results as the previous implementation
    sublinear_tf=False,
    stop_words="english",
    max_features=25000,
    min_df=5
)

idf_vectorizer.fit(data["Cleaned Text"])
# Pass `query` directly: wrapping it in another list would hand the
# vectorizer a nested list, which fails when it tries to lowercase the
# "document".
query_vector = idf_vectorizer.transform(query)

# NOTE(review): `doc_vectors` comes from the TF-only vectorizer while the
# query is IDF-weighted; the columns line up only because both vectorizers
# are fitted on the same texts with the same vocabulary settings. Confirm
# this asymmetric weighting is intended.
similarity_scores = cosine_similarity(query_vector, doc_vectors)[0]

data["similarity"] = similarity_scores

# Most similar documents first.
sorted_data = data.sort_values(by="similarity", ascending=False)

sorted_data.to_csv("similarity_scores.csv", index=False)

print("Data saved to 'similarity_scores.csv'")