In [None]:
%pip install contractions
%pip install textblob
%pip install nltk
%pip install scikit-learn

In [2]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction import text
import string
import contractions
from textblob import TextBlob
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Loading the data:

In [None]:
# Read the combined CSV export and keep only the id, title and body columns;
# "body" is exposed under the name "text".
csv = pd.read_csv("combined_data.csv")
data = csv[["post_id", "comment_id", "title", "body"]].rename(columns={"body": "text"})
data.head()

Removing blank rows:

In [None]:
# Discard rows whose "text" value is missing, then renumber the index from 0.
data = data.dropna(subset=["text"]).reset_index(drop=True)
data.head()

Converting texts to lowercase:

In [None]:
def lower(text):
  """Return ``text`` with all cased characters converted to lowercase."""
  lowered = text.lower()
  return lowered

# Create the working column from a lowercased copy of the raw text.
data["Cleaned Text"] = data["text"].map(lower)
data["Cleaned Text"].head()

Removing r/, usernames, new line indicators, and links from texts:

In [None]:
def remove_links(text):
  """Delete http(s):// URLs and ``www.``-prefixed addresses from ``text``."""
  link_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
  return link_pattern.sub('', text)

def remove_user_mentions(text):
    """Remove Reddit user mentions (``u/name`` or ``/u/name``) from ``text``.

    A mention is only recognised when it starts a token, i.e. when it is not
    preceded by a word character or a slash. Without that guard the pattern
    also matched the tail of ordinary words ("you/me" became "yo").

    Parameters:
        text: the string to clean.

    Returns:
        ``text`` with every mention (through the next whitespace) removed.
    """
    # (?<![\w/]) -> must not continue a word or a path segment
    # /?         -> also consume the leading slash of the "/u/name" form
    return re.sub(r'(?<![\w/])/?u/\S+', '', text)

# Drop the subreddit prefix ("r/" or "/r/") only where it starts a token, so that
# ordinary words containing "r/" (e.g. "or/and", "for/against") are left intact.
data["Cleaned Text"] = data["Cleaned Text"].str.replace(r'(?<![\w/])/?r/', '', regex=True)
# Collapse paragraph breaks into a single space.
data["Cleaned Text"] = data["Cleaned Text"].str.replace("\n\n", ' ', regex=False)
data["Cleaned Text"] = data["Cleaned Text"].apply(remove_links)
data["Cleaned Text"] = data["Cleaned Text"].apply(remove_user_mentions)

data["Cleaned Text"].head()

Fixing spelling errors:

In [None]:
def correct_spelling(text):
    """Return ``text`` after passing it through TextBlob's ``correct()``."""
    blob = TextBlob(text)
    corrected = blob.correct()
    return str(corrected)

# Spell-correct every document in the working column.
data["Cleaned Text"] = data["Cleaned Text"].map(correct_spelling)
data["Cleaned Text"].head()

Expanding contractions:

In [None]:
def expand_contractions(text):
    """Return ``text`` as rewritten by ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# Expand contractions in every document of the working column.
data["Cleaned Text"] = data["Cleaned Text"].map(expand_contractions)
data["Cleaned Text"].head()

Removing stop words:

In [None]:
# English stop-word set shipped with sklearn.feature_extraction.text.
stopwords = text.ENGLISH_STOP_WORDS


def _drop_stopwords(doc):
    """Return ``doc`` with its stop-word tokens removed.

    Punctuation is still present at this stage of the pipeline, so each token
    is compared with its leading/trailing punctuation stripped. Otherwise
    tokens such as "the," or "it." would not match the stop-word set and would
    survive as plain stop words once punctuation is removed later.

    Parameters:
        doc: a whitespace-separated document string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept = [
        token for token in doc.split()
        if token.strip(string.punctuation).lower() not in stopwords
    ]
    return ' '.join(kept)


data["Cleaned Text"] = data["Cleaned Text"].apply(_drop_stopwords)
data["Cleaned Text"].head()

Removing punctuation:

In [None]:
# string.punctuation contains regex-significant characters (\ ] ^ -). Placed
# unescaped inside [...], the sequence "\]" is read as an escaped bracket, so the
# backslash itself is never matched. re.escape makes every character a literal
# member of the class.
pattern_punctuations = '[' + re.escape(string.punctuation) + ']'

data["Cleaned Text"] = data["Cleaned Text"].str.replace(pattern_punctuations, '', regex=True)
data["Cleaned Text"].head()

### TF-IDF Vectorizer 

Using KMeans clustering to group the texts into 3 clusters and comparing the silhouette score of each clustering in order to find the best parameters for the TF-IDF vectorizer.

In [None]:
# Number of KMeans clusters used to score every candidate configuration.
num_clusters = 3

# Silhouette scores lie in [-1, 1]; starting at -inf guarantees that the first
# evaluated candidate is always recorded, so best_params cannot remain None.
best_score = float("-inf")
best_params = None

# Candidate TfidfVectorizer settings. Each configuration appears exactly once:
# evaluating a duplicate cannot change the winner (ties keep the earlier entry)
# and only repeats the vectorizer and KMeans fits.
param_grid = [
    {'max_df': 1.0, 'min_df': 1, 'ngram_range': (1, 1), 'use_idf': True},
    {'max_df': 0.8, 'min_df': 2, 'ngram_range': (1, 1), 'use_idf': True},
    {'max_df': 1.0, 'min_df': 1, 'ngram_range': (1, 2), 'use_idf': True},
    {'max_df': 1.0, 'min_df': 1, 'ngram_range': (1, 1), 'use_idf': False},
    {'max_df': 0.7, 'min_df': 2, 'ngram_range': (1, 2), 'use_idf': True},
    {'max_df': 1.0, 'min_df': 1, 'ngram_range': (1, 3), 'use_idf': True},
    {'max_df': 1.0, 'min_df': 1, 'ngram_range': (1, 3), 'use_idf': False},
    {'max_df': 0.5, 'min_df': 3, 'ngram_range': (1, 1), 'use_idf': True},
    {'max_df': 1.0, 'min_df': 1, 'ngram_range': (2, 3), 'use_idf': False},
    {'max_df': 0.9, 'min_df': 1, 'ngram_range': (1, 1), 'use_idf': True},
    {'max_df': 0.6, 'min_df': 4, 'ngram_range': (1, 3), 'use_idf': True},
    {'max_df': 1.0, 'min_df': 2, 'ngram_range': (2, 2), 'use_idf': False},
    {'max_df': 0.4, 'min_df': 5, 'ngram_range': (1, 1), 'use_idf': True},
    {'max_df': 0.75, 'min_df': 3, 'ngram_range': (1, 2), 'use_idf': True}
]

for params in param_grid:
    # NOTE: `vectorizer` and `tfidf_matrix` are rebound on every iteration, so
    # after the loop they belong to the LAST candidate, not to the best one.
    vectorizer = TfidfVectorizer(**params)
    tfidf_matrix = vectorizer.fit_transform(data["Cleaned Text"])

    # Fixed random_state keeps the cluster assignment reproducible across runs.
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(tfidf_matrix)

    score = silhouette_score(tfidf_matrix, cluster_labels)

    print(f"Params: {params}, Silhouette Score: {score:.4f}")

    # Strict comparison: on a tie the earlier configuration is kept.
    if score > best_score:
        best_score = score
        best_params = params

print(f"\nBest TF-IDF Parameters: {best_params} with Score: {best_score:.4f}")


In [None]:
# Refit with the winning configuration so that `vectorizer` and `vectors`
# describe the same feature space.
vectorizer = TfidfVectorizer(**best_params)
vectors = vectorizer.fit_transform(data["Cleaned Text"])
feature_names = vectorizer.get_feature_names_out()

# Build the document-term table directly from the dense NumPy array instead of
# going through todense().tolist(), which first copies the whole matrix into
# nested Python lists.
tfidf_data = pd.DataFrame(vectors.toarray(), columns=feature_names)

# Preview only the first rows; the full table is documents x vocabulary in size.
tfidf_data.head()

In [None]:
# Free-text description of the topic every document is ranked against.
query = ["Discussions about ChatGPT, its performance, user experiences, applications, limitations, ethical concerns, and comparisons with other AI models developed by OpenAI."]

query_tfidf = vectorizer.transform(query)

# Compare against `vectors`, the matrix produced by this same fitted vectorizer.
# `tfidf_matrix` is left over from the last grid-search candidate and generally
# has a different vocabulary, which makes cosine_similarity fail on mismatched
# dimensions (or score against the wrong feature space).
query_sim = cosine_similarity(query_tfidf, vectors)[0]

# One similarity value per document, in the same row order as `data`.
data["similarity"] = query_sim

sorted_data = data.sort_values(by="similarity", ascending=False)

sorted_data.to_csv("similarity_scores.csv", index=False)

print("Data saved to 'similarity_scores.csv'")