## 1. Setup and Imports

In [None]:
# Standard libraries
import sys
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')

# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Custom modules
from data_loader import ArxivDataLoader
from preprocessing import TextPreprocessor, FeatureExtractor, extract_top_keywords
from clustering import ClusterAnalyzer, DimensionalityReducer
from visualization import Visualizer

# Seed NumPy's global RNG so random draws repeat between full runs
np.random.seed(42)

# Pandas display: show every column and allow up to 100 characters per cell
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 100),
):
    pd.set_option(option_name, option_value)

print("Setup complete!")

## 2. Data Acquisition

**Decision Point**: The full ArXiv dataset contains 2M+ papers. For this analysis, we'll work with:
- Papers from the last 3-5 years (more recent trends)
- A sample of 50,000-100,000 papers for computational efficiency

**Note**: Adjust `SAMPLE_SIZE` and `RECENT_YEARS` based on your computational resources.

In [None]:
# Configuration
DATA_FILE = '../data/arxiv-metadata-oai-snapshot.json'  # Path to ArXiv dataset
SAMPLE_SIZE = 50000  # Number of papers to analyze
RECENT_YEARS = 3     # Focus on last 3 years

# Initialize data loader
loader = ArxivDataLoader(data_dir='../data')

# Try to load the real snapshot. Only the load itself sits inside the `try`:
# a failure while building the fallback dataset must surface as its own error
# instead of being reported as a load error and then retried.
try:
    df_raw = loader.load_from_json(DATA_FILE, sample_size=SAMPLE_SIZE, recent_years=RECENT_YEARS)
except Exception as e:  # notebook boundary: any load failure falls back to demo data
    print(f"Error loading data: {e}")
    df_raw = None

# Fall back to the demonstration dataset when loading failed or returned no rows
if df_raw is None or df_raw.empty:
    print("\nCreating sample dataset for demonstration...")
    df_raw = loader.create_sample_dataset('../data/sample_arxiv.csv', n_samples=SAMPLE_SIZE)

print(f"\nDataset shape: {df_raw.shape}")
df_raw.head()

## 3. Data Preparation

**Critical Step**: Clean and structure the data for analysis.

In [None]:
# Prepare dataset (cleaning/structuring is delegated to the project loader)
df = loader.prepare_dataset(df_raw)

# Get statistics
stats = loader.get_dataset_statistics(df)

print("\n=== Dataset Statistics ===")
for key, value in stats.items():
    print(f"{key}: {value}")

# Display sample.
# Other cells treat 'year' as an optional column, so only preview the columns
# that actually exist instead of raising KeyError on a missing one.
print("\n=== Sample Papers ===")
preview_columns = [
    col for col in ('title', 'categories', 'year', 'abstract') if col in df.columns
]
df[preview_columns].head(3)

## 4. Exploratory Data Analysis (EDA)

**Objective**: Understand the distribution and characteristics of the data before modeling.

In [None]:
# Initialize visualizer
# This single `viz` instance is reused by the plotting cells further down.
viz = Visualizer(output_dir='../output')

# Plot category distribution
# top_n=20 limits the plot to 20 categories (presumably the most frequent —
# confirm in Visualizer); save_path names the PNG target for the figure.
viz.plot_category_distribution(df, top_n=20, save_path='../output/category_distribution.png')

In [None]:
# Plot temporal trends
# Skipped entirely when the prepared data has no 'year' column.
if 'year' in df.columns:
    viz.plot_temporal_trends(df, save_path='../output/temporal_trends.png')

In [None]:
# Summary statistics of abstract length, measured in characters
print("=== Abstract Length Statistics ===")
abstract_lengths = df['abstract'].str.len()
for stat_label, stat_value in (
    ("Mean", abstract_lengths.mean()),
    ("Median", abstract_lengths.median()),
    ("Std", abstract_lengths.std()),
):
    print(f"{stat_label}: {stat_value:.0f} characters")

# Histogram of the same lengths, saved to disk and shown inline
fig, ax = plt.subplots(figsize=(10, 5))
abstract_lengths.hist(bins=50, ax=ax, edgecolor='black')
ax.set(
    xlabel='Abstract Length (characters)',
    ylabel='Frequency',
    title='Distribution of Abstract Lengths',
)
fig.tight_layout()
fig.savefig('../output/abstract_length_dist.png', dpi=300)
plt.show()

### EDA Critical Assessment

**Observations**:
1. **Category Distribution**: Identify dominant research areas
2. **Temporal Trends**: Check if data is recent and representative
3. **Abstract Quality**: Ensure sufficient text for meaningful analysis

**Decision**: Proceed with text preprocessing on abstracts as primary feature source.

## 5. Text Preprocessing

**Critical Operations**:
- Remove LaTeX, URLs, special characters
- Tokenization and lowercasing
- Stopword removal (including scientific stopwords)
- Lemmatization

**Justification**: Scientific abstracts contain domain-specific jargon and formatting that requires specialized cleaning.

In [None]:
# Initialize preprocessor
preprocessor = TextPreprocessor(use_lemmatization=True)

# Preprocess abstracts; the cleaned text is stored next to the raw abstract
print("Preprocessing abstracts...")
df['abstract_clean'] = preprocessor.preprocess_corpus(df['abstract'], show_progress=True)

# Display before/after examples.
# Clamp to the number of available papers so a dataset with fewer than two
# rows does not raise IndexError on `.iloc[i]`.
print("\n=== Preprocessing Examples ===")
for i in range(min(2, len(df))):
    print(f"\n--- Paper {i+1} ---")
    # Only the first 200 characters of each version are shown
    print(f"Original: {df['abstract'].iloc[i][:200]}...")
    print(f"Cleaned: {df['abstract_clean'].iloc[i][:200]}...")

## 6. Feature Engineering - TF-IDF Vectorization

**Approach**: TF-IDF (Term Frequency-Inverse Document Frequency)
- Captures importance of terms relative to the corpus
- Handles both unigrams and bigrams for context
- Filters very common and very rare terms

**Parameters**:
- `max_features=5000`: Balance between information and dimensionality
- `ngram_range=(1,2)`: Capture single words and two-word phrases
- `min_df=5`: Ignore very rare terms (those appearing in fewer than 5 abstracts)
- `max_df=0.8`: Ignore overly common terms (those appearing in more than 80% of abstracts)

In [None]:
# Initialize feature extractor
# The values mirror the "Parameters" list in the markdown cell above.
feature_extractor = FeatureExtractor(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8
)

# Extract features
# fit_transform returns a pair: the feature matrix and the feature names.
# NOTE(review): X_tfidf is presumably a scipy sparse matrix — confirm in FeatureExtractor.
print("Extracting TF-IDF features...")
X_tfidf, feature_names = feature_extractor.fit_transform(df['abstract_clean'])

print(f"\nFeature matrix shape: {X_tfidf.shape}")
print(f"Number of features: {len(feature_names)}")
# Only the first 20 feature names are printed as a sanity check
print(f"\nSample features: {feature_names[:20]}")

### Feature Engineering Critical Assessment

**Quality Check**:
- Matrix shape should be (n_papers, n_features)
- Features should include meaningful scientific terms
- Sparsity is expected and acceptable for text data

**Alternative Considered**: Word2Vec or BERT embeddings would capture semantic similarity better but require significantly more computational resources.

## 7. Dimensionality Reduction

**Two-Stage Approach**:
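1. **PCA**: Reduce from 5000 to 50 dimensions (keeps the highest-variance directions and speeds up clustering; check the explained-variance ratio, since 50 components of a sparse TF-IDF matrix may retain only a modest share of the total variance)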
2. **UMAP**: Reduce to 2D for visualization (preserve local structure)

**Justification**: High-dimensional clustering is computationally expensive and prone to the curse of dimensionality.

In [None]:
# Initialize dimensionality reducer
dim_reducer = DimensionalityReducer()

# Step 1: PCA for computational efficiency
# X_pca (50 components) is the input to UMAP and to every clustering step below.
X_pca = dim_reducer.reduce_pca(X_tfidf, n_components=50)

print(f"\nReduced shape: {X_pca.shape}")

In [None]:
# Step 2: UMAP for visualization
# The 2-D embedding is used only by the plot_clusters_2d cells; clustering
# itself runs on X_pca, not on X_umap.
X_umap = dim_reducer.reduce_umap(
    X_pca, 
    n_components=2, 
    n_neighbors=15, 
    min_dist=0.1,
    metric='euclidean'
)

print(f"UMAP embedding shape: {X_umap.shape}")

## 8. Determining Optimal Number of Clusters

**Methods**:
- Elbow method (inertia)
- Silhouette score (higher is better)
- Davies-Bouldin index (lower is better)
- Calinski-Harabasz score (higher is better)

**Goal**: Find k that balances cluster quality and interpretability.

In [None]:
# Initialize cluster analyzer
cluster_analyzer = ClusterAnalyzer()

# Find optimal k
# Candidate cluster counts are 3..15 inclusive (range(3, 16)).
# Returns the recommended k plus a `metrics` object that the next cell plots.
# NOTE(review): which metric decides `optimal_k` is not visible here — confirm in ClusterAnalyzer.
optimal_k, metrics = cluster_analyzer.find_optimal_k(
    X_pca, 
    k_range=range(3, 16)
)

print(f"\nRecommended number of clusters: {optimal_k}")

In [None]:
# Visualize metrics
# `metrics` is the second value returned by find_optimal_k in the previous cell.
viz.plot_elbow_analysis(metrics, save_path='../output/elbow_analysis.png')

### Cluster Number Selection - Critical Decision

**Analysis**:
- Review the elbow plot and silhouette scores
- Consider domain knowledge (expected number of major research areas)
- Balance between granularity and interpretability

**Decision**: Use the optimal k suggested by silhouette score, or adjust based on business requirements.

## 9. K-Means Clustering

**Algorithm**: K-Means
- Fast and scalable
- Works well with PCA-reduced features
- Assumes spherical clusters

In [None]:
# Perform K-Means clustering
# N_CLUSTERS is read by several later cells (word clouds, sample papers,
# growth analysis, final summary), so override it here if a different k is wanted.
N_CLUSTERS = optimal_k  # or manually set based on analysis

# Returns the per-paper labels and the fitted model
labels_kmeans, model_kmeans = cluster_analyzer.kmeans_clustering(
    X_pca, 
    n_clusters=N_CLUSTERS
)

# Add labels to dataframe
# assumes labels_kmeans is in the same row order as df — TODO confirm
df['cluster_kmeans'] = labels_kmeans

In [None]:
# Visualize cluster distribution
# method_name labels the plot with the clustering algorithm that produced the labels.
viz.plot_cluster_distribution(
    labels_kmeans, 
    method_name='K-Means',
    save_path='../output/cluster_distribution_kmeans.png'
)

In [None]:
# Visualize clusters in 2D
# Points are positioned by the UMAP embedding and grouped by the K-Means labels.
# NOTE(review): method_name here names the projection ('UMAP'), while the
# DBSCAN plot cell passes the clustering algorithm ('DBSCAN') to the same
# parameter — confirm what Visualizer.plot_clusters_2d expects and align the two.
viz.plot_clusters_2d(
    X_umap, 
    labels_kmeans,
    method_name='UMAP',
    save_path='../output/clusters_2d_kmeans.png'
)

In [None]:
# Get cluster statistics
# Computed by the project ClusterAnalyzer from the K-Means labels and the
# prepared dataframe; the result is printed as-is.
cluster_stats = cluster_analyzer.get_cluster_statistics(labels_kmeans, df)
print("\n=== Cluster Statistics ===")
print(cluster_stats)

## 10. Alternative Clustering - DBSCAN

**Algorithm**: DBSCAN (Density-Based Spatial Clustering)
- Finds arbitrarily shaped clusters
- Identifies outliers/noise
- Doesn't require predefined number of clusters

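**Note**: DBSCAN results are highly sensitive to `eps` (neighbourhood radius) and `min_samples` (minimum neighbours for a core point). Tune both for your data — for example with a k-distance plot — before interpreting the clusters.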

In [None]:
# Perform DBSCAN clustering
# Note: These parameters may need tuning based on your data
# NOTE(review): eps is a distance in the 50-dimensional PCA space. If the
# TF-IDF rows are L2-normalised (the scikit-learn TfidfVectorizer default —
# confirm in FeatureExtractor) and reduce_pca does not rescale its output,
# no two points can be more than 2.0 apart, so eps=3.0 would merge every
# paper into a single cluster. Verify with a k-distance plot before relying
# on these labels.
labels_dbscan, model_dbscan = cluster_analyzer.dbscan_clustering(
    X_pca,
    eps=3.0,
    min_samples=10
)

# Stored alongside the K-Means labels; later cells in this notebook only use
# 'cluster_kmeans' for keywords, naming, trends and export.
df['cluster_dbscan'] = labels_dbscan

In [None]:
# Visualize DBSCAN results
# Same UMAP embedding as the K-Means plot, grouped by the DBSCAN labels instead.
viz.plot_clusters_2d(
    X_umap,
    labels_dbscan,
    method_name='DBSCAN',
    save_path='../output/clusters_2d_dbscan.png'
)

## 11. Extract Cluster Keywords and Trends

**Objective**: Identify what each cluster represents by extracting top keywords.

**Method**: Calculate mean TF-IDF scores for each cluster and extract top terms.

In [None]:
# Top-20 keywords per K-Means cluster, computed from the TF-IDF matrix
cluster_keywords = extract_top_keywords(X_tfidf, feature_names, labels_kmeans, top_n=20)

# Print the ten highest-ranked (keyword, score) pairs of each cluster on one line
print("\n=== TOP KEYWORDS PER CLUSTER ===")
for cluster_id in cluster_keywords:
    print(f"\nCluster {cluster_id}:")
    formatted_terms = (
        f"{kw}({score:.3f})" for kw, score in cluster_keywords[cluster_id][:10]
    )
    print("  " + ", ".join(formatted_terms))

In [None]:
# Create word clouds for selected clusters
# Only cluster ids 0..min(5, N_CLUSTERS)-1 are plotted; this indexes
# cluster_keywords by those integer ids and so assumes each of them is a key.
for cluster_id in range(min(5, N_CLUSTERS)):  # First 5 clusters
    viz.plot_wordcloud(
        cluster_keywords[cluster_id],
        cluster_id,
        save_path=f'../output/wordcloud_cluster_{cluster_id}.png'
    )

In [None]:
# Keyword heatmap across clusters
# Takes the full per-cluster keyword mapping; top_n=15 is forwarded to the
# project plotting helper (presumably keywords per cluster — confirm in Visualizer).
viz.plot_cluster_keywords_heatmap(
    cluster_keywords,
    top_n=15,
    save_path='../output/keywords_heatmap.png'
)

## 12. Cluster Interpretation and Labeling

**Manual Step**: Based on top keywords, assign meaningful names to clusters.

In [None]:
# Create cluster interpretation dictionary
# MANUALLY UPDATE based on your keyword analysis
cluster_names = {
    0: "Machine Learning & AI",
    1: "Quantum Physics",
    2: "Astrophysics & Cosmology",
    3: "Biology & Genomics",
    4: "Materials Science",
    5: "Natural Language Processing",
    6: "Computer Vision",
    7: "Mathematical Optimization",
    # Add more based on your N_CLUSTERS
}


def _cluster_label(cluster_id):
    """Return the manual name of a cluster, or 'Cluster <id>' when none is set.

    The optimal-k search covers up to 15 clusters while only 8 names are
    defined above; mapping with the bare dict would leave NaN names for the
    remaining clusters in the summary and in the exported CSV.
    """
    return cluster_names.get(cluster_id, f'Cluster {cluster_id}')


def _dominant_category(categories):
    """Return the most frequent value of a group, or 'Unknown' if there is none."""
    modes = categories.mode()
    return modes.iloc[0] if len(modes) > 0 else 'Unknown'


# Map cluster names (with the same fallback label the growth and export cells use)
df['cluster_name'] = df['cluster_kmeans'].map(_cluster_label)

# Display cluster summary: paper count and most frequent category per cluster
print("\n=== CLUSTER SUMMARY ===")
summary = df.groupby('cluster_kmeans').agg({
    'id': 'count',
    'categories': _dominant_category,
}).rename(columns={'id': 'count', 'categories': 'dominant_category'})

summary['cluster_name'] = summary.index.map(_cluster_label)
# Share of all papers that fall into each cluster, in percent
summary['percentage'] = (summary['count'] / len(df) * 100).round(2)

print(summary[['cluster_name', 'count', 'percentage', 'dominant_category']])

## 13. Category Analysis by Cluster

**Objective**: Understand how ArXiv categories distribute across discovered clusters.

In [None]:
# Plot category distribution across clusters
# Uses the K-Means labels; top_categories=10 is forwarded to the project
# plotting helper (presumably the 10 most frequent categories — confirm in Visualizer).
viz.plot_category_by_cluster(
    df,
    labels_kmeans,
    top_categories=10,
    save_path='../output/category_by_cluster.png'
)

## 14. Sample Papers from Each Cluster

**Validation**: Review actual paper titles and abstracts to verify cluster quality.

In [None]:
# Display sample papers from each cluster
print("\n=== SAMPLE PAPERS FROM EACH CLUSTER ===")

# Only the first five clusters are shown to keep the output readable
for cluster_id in range(min(5, N_CLUSTERS)):
    print(f"\n{'='*80}")
    print(f"CLUSTER {cluster_id}: {cluster_names.get(cluster_id, 'Unknown')}")
    print(f"{'='*80}")

    cluster_papers = df[df['cluster_kmeans'] == cluster_id]
    # Up to three papers per cluster. A fixed random_state makes the selection
    # identical every time this cell is re-run, so reviewers discuss the same
    # papers regardless of how often earlier cells consumed the global RNG.
    samples = cluster_papers.sample(min(3, len(cluster_papers)), random_state=42)

    for idx, (_, paper) in enumerate(samples.iterrows(), 1):
        print(f"\n{idx}. {paper['title']}")
        print(f"   Category: {paper['categories']}")
        # Abstract is truncated to its first 200 characters
        print(f"   Abstract: {paper['abstract'][:200]}...")

## 15. Trend Analysis and Insights

**Business Value**: Extract actionable insights for academic cooperation strategy.

In [None]:
# Papers per year for every K-Means cluster, drawn as one line per cluster
if 'year' in df.columns:
    # Rows are years, columns are cluster ids; missing combinations count as 0
    yearly_counts = df.groupby(['year', 'cluster_kmeans']).size()
    cluster_yearly = yearly_counts.unstack(fill_value=0)

    fig, ax = plt.subplots(figsize=(14, 8))
    cluster_yearly.plot(ax=ax, marker='o', linewidth=2)

    ax.set_title('Cluster Trends Over Time', fontsize=14, fontweight='bold')
    ax.set_xlabel('Year', fontsize=12)
    ax.set_ylabel('Number of Papers', fontsize=12)
    ax.grid(True, alpha=0.3)
    # Legend is anchored outside the axes, to the right of the plot
    ax.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')

    fig.tight_layout()
    fig.savefig('../output/cluster_trends_overtime.png', dpi=300)
    plt.show()

In [None]:
# Calculate cluster growth rates between the first and the last year in the data
if 'year' in df.columns and df['year'].nunique() > 1:
    print("\n=== CLUSTER GROWTH ANALYSIS ===")

    # Drop missing years so the list matches what nunique() counted above and
    # NaN cannot end up as the first or last entry after sorting.
    years = sorted(df['year'].dropna().unique())
    first_year, last_year = years[0], years[-1]

    for cluster_id in range(N_CLUSTERS):
        cluster_years = df.loc[df['cluster_kmeans'] == cluster_id, 'year']
        count_first = int((cluster_years == first_year).sum())
        count_last = int((cluster_years == last_year).sum())

        # Same fallback label in every branch, so unnamed clusters read consistently
        cluster_name = cluster_names.get(cluster_id, f'Cluster {cluster_id}')

        if count_first > 0:
            # Relative change from the first to the last year, in percent
            growth = ((count_last - count_first) / count_first) * 100
            print(f"{cluster_name}: {growth:+.1f}% growth")
        elif count_last > 0:
            # Absent in the first year but present in the last one
            print(f"{cluster_name}: New emerging area")
        else:
            # No papers in either boundary year: neither growth nor emergence
            print(f"{cluster_name}: No papers in {first_year} or {last_year}")

## 16. Export Results

**Deliverables**: Save processed data and cluster assignments for further analysis.

In [None]:
# Export cluster assignments and keywords
# 'year' is treated as optional by the trend cells, so export only the
# columns that exist instead of failing with KeyError at the very last step.
export_columns = ['id', 'title', 'categories', 'year', 'cluster_kmeans', 'cluster_name']
available_columns = [col for col in export_columns if col in df.columns]
df[available_columns].to_csv(
    '../output/cluster_assignments.csv',
    index=False
)

# Export cluster keywords: one row per (cluster, keyword), at most 20 per cluster
keyword_rows = [
    {
        'cluster_id': cluster_id,
        'cluster_name': cluster_names.get(cluster_id, f'Cluster {cluster_id}'),
        'keyword': kw,
        'score': score
    }
    for cluster_id, keywords in cluster_keywords.items()
    for kw, score in keywords[:20]
]
# Explicit columns keep the CSV header intact even when there are no rows
keywords_df = pd.DataFrame(
    keyword_rows,
    columns=['cluster_id', 'cluster_name', 'keyword', 'score']
)

keywords_df.to_csv('../output/cluster_keywords.csv', index=False)

print("Results exported to ../output/")

## 17. Critical Assessment and Conclusions

### What We Did:
1. ✅ Acquired and prepared ArXiv scientific paper data
2. ✅ Performed comprehensive exploratory data analysis
3. ✅ Preprocessed text data with domain-specific cleaning
4. ✅ Engineered features using TF-IDF vectorization
5. ✅ Applied dimensionality reduction (PCA + UMAP)
6. ✅ Determined optimal cluster count using multiple metrics
7. ✅ Performed clustering (K-Means and DBSCAN)
8. ✅ Extracted and visualized cluster characteristics
9. ✅ Identified research trends and growth patterns

### Quality Assessment:

**Strengths**:
- Systematic preprocessing pipeline handles scientific text effectively
- Multiple clustering evaluation metrics provide confidence
- Clear, interpretable clusters with distinct keyword profiles
- Visualizations support both technical and business understanding

**Limitations**:
- TF-IDF doesn't capture semantic relationships (consider BERT for improvement)
- K-Means assumes spherical clusters (DBSCAN provides alternative view)
- The sample may not be representative of the entire ArXiv corpus
- Manual cluster labeling introduces subjectivity

### Recommendations for Academic Cooperation:

Based on cluster analysis, prioritize cooperation in:
1. **Fast-growing clusters**: High research momentum
2. **Large clusters**: Established communities with resources
3. **Clusters aligned with company expertise**: Strategic fit
4. **Emerging small clusters**: Early-stage opportunities

### Future Improvements:
- Incorporate citation networks for impact analysis
- Use BERT embeddings for better semantic clustering
- Implement temporal clustering to track evolving topics
- Add author collaboration network analysis
- Integrate with company's research portfolio for gap analysis

In [None]:
# Closing banner with the headline numbers of the run
separator = "=" * 80
print("\n" + separator)
print("ANALYSIS COMPLETE")
print(separator)

print(f"\nTotal papers analyzed: {len(df):,}")
print(f"Number of clusters: {N_CLUSTERS}")
# Silhouette value as recorded by the cluster analyzer for the K-Means run
kmeans_silhouette = cluster_analyzer.metrics['kmeans']['silhouette']
print(f"Silhouette score: {kmeans_silhouette:.3f}")
print("\nResults saved to: ../output/")

# Suggested follow-up actions, printed one per line
print("\nNext steps:")
for next_step in (
    "1. Review cluster interpretations with domain experts",
    "2. Validate findings against known research trends",
    "3. Prepare presentation for stakeholders",
    "4. Define cooperation strategy based on insights",
):
    print(next_step)