In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

df = pd.read_csv('archive/abcnews-date-text.csv')
# Download the stop words and initialize the stemmer
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [None]:
import re
#remove duplicates headlines
df = df.drop_duplicates(subset='headline_text', keep='first')

#function for process each line
def process_headline(headline):
    # Remove symbols using regex
    headline = re.sub(r'[^\w\s]', '', headline)
    # Tokenize the headline
    words = word_tokenize(headline.lower())
    # Remove stop words and perform stemming
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    # Join the words back into a single string
    return ' '.join(words)

# Apply the processing function to each headline
df['processed_headline'] = df['headline_text'].apply(process_headline)


In [3]:
import numpy as np
import tensorflow as tf

# Tokenize the headlines
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['processed_headline'])
sequences = tokenizer.texts_to_sequences(df['processed_headline'])

# Pad the sequences to have equal length
max_len = max(len(seq) for seq in sequences)
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len)


In [None]:
#Autoencoder Training
# Define the autoencoder architecture
input_dim = padded_sequences.shape[1]
latent_dim = 32
input_layer = tf.keras.layers.Input(shape=(input_dim,))
encoded = tf.keras.layers.Dense(latent_dim, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(input_layer)
decoded = tf.keras.layers.Dense(input_dim, activation='sigmoid')(encoded)

# Define the autoencoder model
autoencoder = tf.keras.models.Model(input_layer, decoded)

# Compile the model
autoencoder.compile(optimizer='adam', loss='categorical_crossentropy')

# Train the model
history = autoencoder.fit(padded_sequences, padded_sequences, batch_size=256,validation_split=0.1)


In [None]:
#Clustering
from sklearn.cluster import KMeans

# Extract the latent representation of the headlines
encoder = tf.keras.models.Model(input_layer, encoded)
latent_vectors = encoder.predict(padded_sequences)

# Cluster the data using K-means
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(latent_vectors)

# Add the cluster labels to the DataFrame
df['cluster'] = cluster_labels


In [None]:
#visualize the clusters
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import TruncatedSVD

# Reduce the dimensionality of the latent vectors for visualization
tsvd = TruncatedSVD(n_components=2, random_state=42)
latent_2d = tsvd.fit_transform(latent_vectors)

# Add the 2D representation of the latent vectors to the DataFrame
df['x'] = latent_2d[:, 0]
df['y'] = latent_2d[:, 1]

# Plot the clustering result
sns.scatterplot(x='x', y='y', hue='cluster', data=df)
plt.show()


In [None]:
#visualize loss
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.show()


In [20]:
from sklearn.metrics import silhouette_score
score = silhouette_score(latent_vectors[:5000],cluster_labels[:5000])
print(score)

0.55109006
