In [8]:
# STEP BY STEP TASKS

# Part 1
# 1. Download and load the Data

# Downloading  the metadata.csv file from the CORD-19 dataset

print("metadadata.csv downloaded sucessfully from Semantic Scholar.")

metadadata.csv downloaded sucessfully from Semantic Scholar.


In [None]:
# Lading in to pandas dataframe

import pandas as pd

df=pd.read_csv(r"C:\Users\user\Desktop\my project")

In [None]:
# Examining the first few rows and data structure

# shows first few rows
print(df.head())

# show the data structure
print(df.info())

In [None]:
# 2. Basic data exploration

# Check the DataFrame dimensions (rows, columns)

print(df.shape)

In [21]:
# Identify data types of each column

print(df.dtypes)

<?xml version="1.0" encoding="UTF-8"?>    object
dtype: object


In [None]:
# Check for missing values in important columns
print(df[['title', 'abstract',
          'publish_time']].isnull().sum())



In [None]:
# Generate basic statistics for numerical columns

print(df.describe())

In [None]:
# Part 2: 
# 3.  Handle missing data

# Identify columns with many missing values

print(df.isnull().sum().sort_values(ascending= False))

In [None]:
# Decide how to handle missing values (removal or filling)

df = df.dropna()
df = df.fillna('Unknown')
df['year'] = df'year'].fillna(df['year'].mean())

In [None]:
#  Create a cleaned version of the dataset

df_clean = df.dropna().copy()
print(df_clean.info())

In [None]:
#  4. Prepare data for analysis

#  Convert date columns to datetime format


df_clean['publish_time'] = pd.to_datetime(df_clean['publish_time'], errors='coerce')

In [None]:
#  Extract year from publication date for time-based analysis

df_clean['year'] = df_clean['publish_time'].dt.year

In [None]:
#  Create new columns if needed (e.g., abstract word count)

df_clean['abstract_word_count'] = 
df_clean['abstract'].astype(str).apply(lambda x: len(x.split()))

In [None]:
#  Part 3

# 5. Perform basic analysis

#  Count papers by publication year

print(df_clean['year'].value_counts().sort_index())

In [None]:
#  Identify top journals publishing COVID-19 research

print(df['journal'].value_counts().head(10))

In [None]:
#  Find most frequent words in titles (using simple word frequency)

from collections import Counter
words = ' '.joint(df_clean['title'].dropna()).lower().split()
print(Counter(words).most_common(10))


In [None]:
# 6. Create visualizations

# Create visualizations

import matplotlib.pyplot as plt

df_clean['year'].value_counts().sort_index().plot(kind='bar')
plt.title(Number of Papers by Year')
plt.xlabel('Year')
plt.ylabel('Count')
plt.show()

In [None]:
#  Create a bar chart of top publishing journals

top_journals =
df_clean['journals'].value_counts().head(10)
top_journals.plot(kind='bar')
plt.title('Top 10 Journals Publishing COVID -19 Research')
plt.xlabel('Journal')
plt.ylabel('Number of Papers')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
#  Generate a word cloud of paper titles

from wordcloud import WordCloud

text=''.joint(df_clean['title'].dropna())

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Paper Titles')
plt.show()

In [None]:
#  Plot distribution of paper counts by source


df_clean'source_x'].value_counts().plot(kind='bar')
plt.title('Distribution of Paper Counts by Source')
plt.xlabel('Source')
plt.ylabel('Number of Papers')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Part 4

#  7. Build a simple Streamlit app

#  Create a basic layout with title and description

import streamlit as st

st.title("CORD-19 Research Data Analysis")
st.write("This app explores the COVID -19 Open Research Dataset (CORD-19), analyzing publication trends, top journals, and common research topics.")

In [None]:
#  Add interactive widgets (sliders, dropdowns)

st.title("CORD-19 Research Data Analysis")
st.write("Explore COVID-19 research trends by year, journal, and keywords.")

# Slider for selecting year range
year_range = st.slider("Select Year Range", int(df_clean['year'].min()), int(df_clean['year'].max()),(2020,2021))

# Dropdown for selecting journal
journal = st.selectbox("Selected Journal",df_clean['journal']dropna().unique())

st.write(f"Showing papers from
**{year_range[0]}-{year_range[1]}**in **{journal}**.")


In [None]:
#  Display your visualizations in the app

fig, ax = plt.subplots()
filtered_df['year'].value_counts().sort_index().plot(kind='bar', ax=ax)
ax.set_title(f"Papers Published per Year in {journal}")
ax.set_xlabel("Year")
ax.set_ylabel("Year")
ax.set_ylabel("Count")

st.pyplot(fig)

In [None]:
#  Show a sample of the data

st.subheader("Sample of the Dataset")
st.write(df_clean.head())