In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans

In [2]:
# Load the dataset from the Excel workbook (read with pd.read_excel, so the
# file must be .xlsx/.xls rather than CSV).
data = pd.read_excel('/content/chatgpt1-3.xlsx')
# Work on the first 1000 rows only. The explicit .copy() makes `df` an
# independent frame, so column assignments on it never refer back to `data`.
df = data.head(1000).copy()

In [4]:
# Data preprocessing: discard every row that has a missing value in either
# the 'Text' or the 'hashtag' column.
# NOTE(review): 'hashtag' must already exist in the loaded sheet for this to
# run, and it is recomputed from 'Text' in the next cell -- confirm that
# filtering on the original column is intended.
df = df.dropna(axis='index', how='any', subset=['Text', 'hashtag'])

In [5]:
# Extract hashtags from tweet text: each row of 'hashtag' becomes the list of
# '#word' tokens found in that row's 'Text'.
# .assign() returns a new frame instead of writing a column into a frame that
# was produced by filtering, which avoids SettingWithCopyWarning and keeps
# this cell safe to re-run.
df = df.assign(hashtag=df['Text'].str.findall(r'#\w+'))

In [6]:
# Tally how often each hashtag occurs across all tweets: flatten the per-tweet
# hashtag lists into one list, then count occurrences of each distinct value.
all_hashtags = []
for tweet_tags in df['hashtag']:
    all_hashtags.extend(tweet_tags)
hashtags_freq = pd.Series(all_hashtags).value_counts()

In [7]:
# Build the tweet-by-hashtag feature matrix.
# The vectorizer only counts tokens matching '#word'; the counts are converted
# to a dense array and every row is rescaled to unit L2 norm.
# Note: CountVectorizer lowercases text by default, so the columns of X are
# case-insensitive hashtags.
vectorizer = CountVectorizer(token_pattern=r'#\w+')
X = normalize(
    vectorizer.fit_transform(df['Text']).toarray(),
    norm='l2',
    axis=1,
)

In [9]:
# Train a K-means clustering model
k = 5  # Number of clusters
# random_state fixes the centroid initialisation so that re-running the
# notebook yields the same clusters; n_init is set explicitly because its
# default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(X)

# Analyze the clusters: for each cluster, list its 10 most frequent hashtags.
for i in range(k):
    # kmeans.labels_ has one entry per row of X (and therefore per row of df),
    # so the boolean mask selects the tweets assigned to cluster i.
    in_cluster = kmeans.labels_ == i
    cluster_tweets = df[in_cluster]
    # dtype=object keeps an empty cluster (no hashtags at all) from producing
    # an empty Series with an inferred, non-string dtype.
    cluster_hashtags = pd.Series(
        [tag for tags in cluster_tweets['hashtag'] for tag in tags],
        dtype=object,
    )
    common_hashtags = cluster_hashtags.value_counts().head(10)
    print(f"Cluster {i+1} common hashtags: {', '.join(common_hashtags.index)}")



Cluster 1 common hashtags: #ArtificialIntelligence, #bigdata, #AI, #ChatGPT, #ç, #ãƒ, #DataScience, #Analytics, #SEO, #note
Cluster 2 common hashtags: #ChatGPT, #chatgpt, #chatGPT, #ArtificialIntelligence, #IntelligenceArtificielle, #Ú, #teacher, #ChatGpt, #web3, #PR
Cluster 3 common hashtags: #Tech, #technology, #tech, #NewsFlash, #Technology, #Bot, #News, #ChatGPT, #Awesome5G, #internet
Cluster 4 common hashtags: #ChatGPT, #OpenAI, #chatgpt, #openai, #openAI, #gpt, #succÃ, #disruption, #chatGPT, #template
Cluster 5 common hashtags: #AI, #ChatGPT, #chatgpt, #ai, #GPT3, #GenerativeAI, #gptreport, #Blogging, #Google, #AIFuture
