In [16]:
import pandas as pd
import re
from collections import Counter
from nltk.corpus import stopwords
import nltk

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Load dataset
df = pd.read_csv("github_repos_filtered_language.csv")

# Drop rows where description is missing
df = df.dropna(subset=["description"])

# Convert descriptions to lowercase and concatenate them into one large text
text = " ".join(df["description"].astype(str).str.lower())

# Define stopwords (common words to ignore)
stop_words = set(stopwords.words("english"))

# Tokenize text and remove special characters, numbers, and stopwords
words = re.findall(r'\b[a-zA-Z]{2,}\b', text)  # Keep words with 2+ letters
filtered_words = [word for word in words if word not in stop_words]

# Count word frequencies
word_counts = Counter(filtered_words)

# Convert to DataFrame for easy viewing
word_freq_df = pd.DataFrame(word_counts.items(), columns=["Word", "Count"])
word_freq_df = word_freq_df.sort_values(by="Count", ascending=False)

# Display the top 20 most common words
print(word_freq_df.head(100))

# Save to CSV for further analysis
word_freq_df.to_csv("word_frequencies.csv", index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ajayp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


            Word  Count
151          aws  19288
35         cloud  19096
42         azure  11720
29         using   9166
28       project   7583
..           ...    ...
705  engineering    779
268          cdk    769
93       example    766
10      security    759
90     framework    750

[100 rows x 2 columns]
