In [None]:
CORD-19 Metadata Analysis
This notebook explores and analyzes the CORD-19 metadata dataset.


# Import required libraries (stdlib, then third-party, then project-local)
import os

import kagglehub
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud

from utils import download_dataset, load_and_clean_data
# Apply the white-grid theme to every figure drawn below.
# `sns.set` is only an alias of `set_theme` (the preferred interface) and is
# documented as possibly being removed in a future seaborn release.
sns.set_theme(style='whitegrid')
     
Load the Dataset
# Both helpers come from the project-local `utils` module and are imported
# together with the other dependencies in the imports cell, so this section
# contains no import statements of its own.

# Download dataset if needed
# NOTE(review): presumably a no-op when a local copy already exists -- confirm in utils.
download_dataset()

# Load metadata.csv using utility function and preview the first rows
df = load_and_clean_data()
df.head()
Explore the Dataset

# Dimensions of the loaded frame as a (rows, columns) tuple
frame_shape = df.shape
print('Shape:', frame_shape)
# Per-column summary: names, non-null counts, dtypes and memory usage
df.info()
     

# Number of missing entries in each column, most incomplete columns first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)
     
Data Cleaning
# No cleaning happens in this cell: the frame returned by the loading helper is
# used as-is, and this cell only summarises what it contains.
print("Dataset Info:")
print(f"Shape: {df.shape}")

# Bind the column once and reuse it for both ends of the range
year_col = df['year']
print(f"Year range: {year_col.min()} - {year_col.max()}")

print("\nColumn data types:")
print(df.dtypes)

print("\nFirst few rows:")
df.head()
     
Publications per Year

# Count publications per year, ordered chronologically by the year value
yearly_counts = df['year'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 5))
# seaborn 0.13 deprecated passing `palette` without `hue` (removal announced
# for 0.14). The years are passed to `hue` as string labels so the palette
# stays discrete (one 'Blues' shade per bar) instead of becoming a continuous
# colormap; `legend=False` hides the legend, which would only repeat the x axis.
# NOTE(review): the `legend` keyword on barplot needs seaborn >= 0.13.
sns.barplot(
    x=yearly_counts.index,
    y=yearly_counts.values,
    hue=yearly_counts.index.astype(str),
    palette='Blues',
    legend=False,
    ax=ax,
)
ax.set(
    xlabel='Year',
    ylabel='Number of Publications',
    title='Publications per Year',
)
# Slant the year labels so neighbouring ticks do not overlap
ax.tick_params(axis='x', labelrotation=45)
plt.show()
     
Top Journals by Number of Papers

# The ten journals with the most papers (value_counts sorts by count, descending)
top_journals = df['journal'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(8, 6))
# seaborn 0.13 deprecated passing `palette` without `hue` (removal announced
# for 0.14). Assigning the journal names to `hue` keeps one 'Greens' shade per
# bar; `legend=False` hides the legend, which would only repeat the y axis.
# NOTE(review): the `legend` keyword on barplot needs seaborn >= 0.13.
sns.barplot(
    y=top_journals.index,
    x=top_journals.values,
    hue=top_journals.index,
    palette='Greens',
    legend=False,
    ax=ax,
)
ax.set(xlabel='Number of Papers', ylabel='Journal', title='Top Journals')
plt.show()
     
Word Frequency in Titles (Wordcloud)

# Wordcloud of title words
# The word-cloud layout is randomised; fixing the seed makes the figure
# identical on every Restart & Run All.
WORDCLOUD_SEED = 42

# Join all non-missing titles into one whitespace-separated string
titles = df['title'].dropna().astype(str)
text = ' '.join(titles)

wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    random_state=WORDCLOUD_SEED,
).generate(text)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')  # the rendered cloud is an image; axes carry no information
ax.set_title('Wordcloud of Title Words')
plt.show()
     
Distribution by Source

# Number of papers for each value of the `source_x` column, most common first
source_counts = df['source_x'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
# seaborn 0.13 deprecated passing `palette` without `hue` (removal announced
# for 0.14). Assigning the source names to `hue` keeps one 'Purples' shade per
# bar; `legend=False` hides the legend, which would only repeat the y axis.
# NOTE(review): the `legend` keyword on barplot needs seaborn >= 0.13.
sns.barplot(
    y=source_counts.index,
    x=source_counts.values,
    hue=source_counts.index,
    palette='Purples',
    legend=False,
    ax=ax,
)
ax.set(xlabel='Number of Papers', ylabel='Source', title='Distribution by Source')
plt.show()
     