
# # Laporan Teknis Dataset Semantic Scholar
# ## Analisis Perubahan Data

In [7]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from jinja2 import Template
import pdfkit
import os
from pathlib import Path

# Buat direktori reports jika belum ada
os.makedirs('reports', exist_ok=True)

# 1. Load data dan metrics
try:
    # Load metrics
    with open('data/processed/metrics.json') as f:
        metrics = json.load(f)
    
    # Load processed data
    df = pd.read_json('data/processed/semantic_scholar_results_clean.json')
    
    # Dapatkan daftar kolom yang tersedia
    available_columns = df.columns.tolist()
    
except Exception as e:
    print(f"Error loading data: {str(e)}")
    exit(1)

# 2. Generate report content
report_content = f"""
# Technical Report: Semantic Scholar Dataset Analysis

**Report Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Dataset Metadata
- Original records: {metrics.get('original_count', 'N/A')}
- Processed records: {metrics.get('processed_count', 'N/A')}
- Processing date: {metrics.get('processing_date', 'N/A')}
- Available columns: {', '.join(available_columns)}

## Processing Steps
1. Text cleaning (lowercasing, punctuation removal)
2. Stopword removal
3. Lemmatization
4. Title processing ('processed_title' column added)
"""

# 3. Generate visualizations
plt.figure(figsize=(12, 6))

# Visualization 1: Publication year distribution
if 'year' in df.columns:
    year_counts = df['year'].value_counts().sort_index()
    plt.subplot(1, 2, 1)
    year_counts.plot(kind='bar', color='skyblue')
    plt.title('Publication Year Distribution')
    plt.xlabel('Year')
    plt.ylabel('Count')
else:
    report_content += "\n<div class='warning'>Warning: 'year' column not found in dataset</div>\n"

# Visualization 2: Top processed words
if 'processed_title' in df.columns:
    try:
        from wordcloud import WordCloud
        from collections import Counter
        
        all_words = ' '.join(df['processed_title'].dropna()).split()
        word_freq = Counter(all_words).most_common(20)
        
        plt.subplot(1, 2, 2)
        pd.DataFrame(word_freq, columns=['Word', 'Count']).set_index('Word').plot(
            kind='barh', color='lightgreen', legend=False
        )
        plt.title('Top 20 Processed Words')
        plt.xlabel('Frequency')
    except Exception as e:
        report_content += f"\n<div class='warning'>Word cloud error: {str(e)}</div>\n"
else:
    report_content += "\n<div class='warning'>Warning: 'processed_title' column not found in dataset</div>\n"

# Save combined visualization
plt.tight_layout()
viz_path = 'reports/dataset_visualizations.png'
plt.savefig(viz_path)
plt.close()

# Add visualizations to report
report_content += f"""
## Dataset Visualizations
![Dataset Overview]({viz_path})
"""

# 4. Add basic statistics
stats_content = ""
if 'citationCount' in df.columns:
    stats_content += f"""
### Citation Statistics
- Average citations: {df['citationCount'].mean():.1f}
- Max citations: {df['citationCount'].max()}
- Min citations: {df['citationCount'].min()}
"""
if 'influentialCitationCount' in df.columns:
    stats_content += f"""
- Average influential citations: {df['influentialCitationCount'].mean():.1f}
"""

if stats_content:
    report_content += stats_content
else:
    report_content += "\n<div class='warning'>No citation statistics available</div>\n"

# 5. Generate PDF
template = Template("""
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>Semantic Scholar Dataset Report</title>
    <style>
        body { font-family: Arial, sans-serif; line-height: 1.6; margin: 2cm; }
        h1 { color: #2c3e50; border-bottom: 1px solid #eee; padding-bottom: 10px; }
        h2 { color: #3498db; margin-top: 20px; }
        h3 { color: #2c3e50; }
        .metrics { background: #f8f9fa; padding: 15px; border-radius: 5px; }
        img { max-width: 100%; height: auto; margin: 10px 0; border: 1px solid #ddd; }
        .warning { color: #e74c3c; background: #fdecea; padding: 5px; border-radius: 3px; }
        .statistics { margin-top: 20px; }
        ul { margin-top: 5px; }
    </style>
</head>
<body>
    {{ content }}
    <footer style="margin-top: 50px; font-size: 0.8em; color: #7f8c8d; text-align: center;">
        Report generated automatically on {{ date }} using DVC pipeline
    </footer>
</body>
</html>
""")

# Configure PDF options
options = {
    'page-size': 'A4',
    'margin-top': '15mm',
    'margin-right': '15mm',
    'margin-bottom': '15mm',
    'margin-left': '15mm',
    'encoding': 'UTF-8',
    'quiet': ''
}

# Generate PDF
pdf_path = 'reports/technical_report.pdf'
pdfkit.from_string(
    template.render(content=report_content, date=datetime.now().strftime('%Y-%m-%d %H:%M')),
    pdf_path,
    options=options
)

print(f"Technical report successfully generated: {pdf_path}")

Error loading data: [Errno 2] No such file or directory: 'data/processed/metrics.json'


NameError: name 'available_columns' is not defined