# In [None]:
# Import necessary libraries
import json
import string
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob
from nltk.tokenize import word_tokenize

# Path to the dataset. The file is read further down one line at a time, with
# each line parsed as a JSON object that provides 'text' and 'title' keys.
file_path = 'wiki-articles.json'

# Translation table that deletes every character in string.punctuation (ASCII
# punctuation only). Built once at import time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Utility function to preprocess text
def preprocess_text(text):
    """Return *text* with ASCII punctuation removed and converted to lowercase.

    Punctuation characters are deleted, not replaced by spaces, so
    "well-known" becomes "wellknown". Characters outside
    ``string.punctuation`` (for example typographic dashes or quotes) are
    kept as they are.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    return text.translate(_PUNCTUATION_TABLE).lower()

# Load dataset and preprocess articles.
# The file is read as JSON Lines: one JSON object per line, each providing
# the 'text' and 'title' keys used here.
texts = []
titles = []
# The encoding is given explicitly because the platform default (for example
# cp1252 on Windows) can fail on, or silently mis-decode, non-ASCII text.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (such as a trailing empty line), which json.loads
        # would reject.
        if not line.strip():
            continue
        article_data = json.loads(line)
        texts.append(preprocess_text(article_data['text']))
        titles.append(article_data['title'])

# === Question 1: Distribution of article lengths with enhanced analysis ===
# Character count of every preprocessed article.
article_lengths = list(map(len, texts))

# Summary statistics, all taken from one array view of the lengths.
length_array = np.asarray(article_lengths)
mean_length = length_array.mean()
median_length = np.median(length_array)
max_length = length_array.max()
min_length = length_array.min()
std_length = length_array.std()

print(
    f"Mean Length: {mean_length}",
    f"Median Length: {median_length}",
    f"Max Length: {max_length}",
    f"Min Length: {min_length}",
    f"Standard Deviation: {std_length}",
    sep="\n",
)

# Histogram with a density curve; dashed lines mark the mean and the median.
plt.figure(figsize=(15, 6))
length_ax = sns.histplot(article_lengths, bins=50, kde=True, color='blue')
length_ax.axvline(mean_length, color='red', linestyle='--', label=f'Mean: {mean_length:.2f}')
length_ax.axvline(median_length, color='green', linestyle='--', label=f'Median: {median_length:.2f}')
length_ax.set_title("Distribution of Article Lengths")
length_ax.set_xlabel("Length of Articles (characters)")
length_ax.set_ylabel("Frequency")
length_ax.legend()
plt.show()

# === Question 2: Most frequent words with TF-IDF and word clouds ===
# NOTE: max_features keeps the 50 terms with the highest corpus-wide term
# frequency (after English stop-word removal); TF-IDF weights are then
# computed for that vocabulary only.
vectorizer = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_matrix = vectorizer.fit_transform(texts)
# Column sums give each term's total TF-IDF weight over all articles.
tfidf_scores = np.asarray(tfidf_matrix.sum(axis=0)).flatten()
feature_names = vectorizer.get_feature_names_out()

# Visualize the most important keywords.
# The vectorizer returns its vocabulary in alphabetical order, so the bars are
# ranked by score here to make the chart read from most to least important.
# A stable sort keeps tied terms in a reproducible order.
score_ranking = np.argsort(-tfidf_scores, kind="stable")
plt.figure(figsize=(12, 6))
sns.barplot(x=feature_names[score_ranking], y=tfidf_scores[score_ranking], palette="viridis")
plt.title("Top 50 Keywords by TF-IDF Score")
plt.xlabel("Keywords")
plt.ylabel("TF-IDF Score")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Generate a word cloud in which each keyword is sized by its summed score.
wordcloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(
    dict(zip(feature_names, tfidf_scores))
)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Word Cloud of Important Keywords")
plt.show()

# === Question 3: Analyze rare characters ('x', 'z', 'j', 'q') ===
# The characters are kept in a string so that counting and plotting follow a
# fixed order. Iterating the set directly ordered the bars by string hash,
# which can differ from one interpreter run to the next.
rare_char_order = "xzjq"
rare_chars = set(rare_char_order)
char_counts = Counter({char: 0 for char in rare_char_order})
article_with_all_rare_chars = 0

for text in texts:
    # Count each character once per article and reuse those counts for the
    # "contains all of them" check instead of scanning the text twice.
    counts_in_text = {char: text.count(char) for char in rare_char_order}
    if all(counts_in_text.values()):
        article_with_all_rare_chars += 1
    char_counts.update(counts_in_text)

print(f"Number of articles containing all rare characters: {article_with_all_rare_chars}")

# Bar plot for rare character frequency
plt.figure(figsize=(8, 5))
sns.barplot(x=list(char_counts.keys()), y=list(char_counts.values()), palette="magma")
plt.title("Frequency of Rare Characters ('x', 'z', 'j', 'q')")
plt.xlabel("Characters")
plt.ylabel("Frequency")
plt.show()

# === Question 4: Advanced Topic Modeling ===
# LDA is a generative model over word counts, so it is fitted on raw term
# counts rather than on TF-IDF weights. The vectorizer settings match the ones
# used for the TF-IDF keywords (English stop words removed, 50 most frequent
# terms), so the vocabulary is the same.
n_topics = 5
n_top_words = 10
count_vectorizer = CountVectorizer(stop_words='english', max_features=50)
count_matrix = count_vectorizer.fit_transform(texts)

lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(count_matrix)

# Extract topics and their words
feature_names = count_vectorizer.get_feature_names_out()
topic_words = []
for idx, topic in enumerate(lda.components_):
    # Indices of the highest-weighted terms for this topic, best first.
    top_indices = topic.argsort()[::-1][:n_top_words]
    topic_words.append([feature_names[i] for i in top_indices])
    print(f"Topic {idx + 1}: {topic_words[-1]}")

# Visualize topic importance: the summed term weights of each topic.
topic_weights = lda.components_.sum(axis=1)
plt.figure(figsize=(10, 6))
sns.barplot(x=["Topic " + str(i + 1) for i in range(n_topics)], y=topic_weights, palette="coolwarm")
plt.title("Topic Importance")
plt.xlabel("Topics")
plt.ylabel("Weight")
plt.show()

# === Question 5: Sentiment Analysis with Additional Metrics ===
polarities = []
subjectivities = []

for text in texts:
    # `sentiment` yields (polarity, subjectivity) from one analysis pass, so
    # the article does not have to be analyzed separately for each value.
    polarity, subjectivity = TextBlob(text).sentiment
    polarities.append(polarity)
    subjectivities.append(subjectivity)

# Classify articles by the sign of their polarity; exactly zero is neutral.
positive_count = sum(polarity > 0 for polarity in polarities)
negative_count = sum(polarity < 0 for polarity in polarities)
neutral_count = len(polarities) - positive_count - negative_count

print(f"Positive Articles: {positive_count}")
print(f"Negative Articles: {negative_count}")
print(f"Neutral Articles: {neutral_count}")

# Visualize sentiment polarity and subjectivity
plt.figure(figsize=(12, 6))
sns.scatterplot(x=polarities, y=subjectivities, alpha=0.6, color="purple")
plt.title("Sentiment Analysis of Articles")
plt.xlabel("Polarity")
plt.ylabel("Subjectivity")
plt.grid(True)
plt.show()

# Polarity distribution
plt.figure(figsize=(10, 5))
sns.histplot(polarities, bins=20, kde=True, color="red")
plt.title("Distribution of Polarity")
plt.xlabel("Polarity")
plt.ylabel("Frequency")
plt.show()

# Subjectivity distribution
plt.figure(figsize=(10, 5))
sns.histplot(subjectivities, bins=20, kde=True, color="blue")
plt.title("Distribution of Subjectivity")
plt.xlabel("Subjectivity")
plt.ylabel("Frequency")
plt.show()

# === Question 6: Token Length Distribution (New Analysis) ===
# Length of each article measured in NLTK word tokens.
token_lengths = [len(tokens) for tokens in map(word_tokenize, texts)]

plt.figure(figsize=(10, 5))
token_ax = sns.histplot(token_lengths, bins=30, kde=True, color="orange")
token_ax.set_title("Distribution of Token Lengths")
token_ax.set_xlabel("Number of Tokens")
token_ax.set_ylabel("Frequency")
plt.show()

# === Question 7: Average Word Length (New Analysis) ===
def _average_word_length(text):
    """Return the mean length of the whitespace-separated words in *text*.

    Returns NaN when *text* contains no words. The value is undefined in that
    case, and returning NaN explicitly avoids the RuntimeWarning that
    ``np.mean`` emits for an empty list.
    """
    word_lengths = [len(word) for word in text.split()]
    if not word_lengths:
        return np.nan
    return np.mean(word_lengths)


avg_word_lengths = [_average_word_length(text) for text in texts]

plt.figure(figsize=(10, 5))
sns.histplot(avg_word_lengths, bins=30, kde=True, color="green")
plt.title("Distribution of Average Word Lengths")
plt.xlabel("Average Word Length")
plt.ylabel("Frequency")
plt.show()

# === Question 8: Lexical Diversity (New Analysis) ===
def compute_lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in *text*.

    The text is split with NLTK's ``word_tokenize``; the result is the number
    of unique tokens divided by the total number of tokens.

    Args:
        text: The string to analyze.

    Returns:
        The ratio as a float, or 0.0 when the text yields no tokens (for
        example an empty article) instead of raising ``ZeroDivisionError``.
    """
    tokens = word_tokenize(text)
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)

# Lexical diversity of every article, in the same order as `texts`.
lexical_diversities = list(map(compute_lexical_diversity, texts))

plt.figure(figsize=(10, 5))
diversity_ax = sns.histplot(lexical_diversities, bins=30, kde=True, color="purple")
diversity_ax.set_title("Distribution of Lexical Diversity")
diversity_ax.set_xlabel("Lexical Diversity")
diversity_ax.set_ylabel("Frequency")
plt.show()
