In [None]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's "whitegrid" theme for every plot drawn after this point.
sns.set_style("whitegrid")

print("--- Loading Data ---")
# Load the dataset from the raw data folder. The path is relative, so it is
# resolved against the current working directory of the notebook kernel.
try:
    df = pd.read_csv("../data/raw/courses.csv")
except FileNotFoundError:
    print("Error: courses.csv not found in data/raw/. Please complete Step 4.")
    # Re-raise instead of continuing: everything below needs `df`, and
    # carrying on would only fail later with a confusing NameError.
    raise
else:
    # Success path is kept out of the `try` body so that only the read
    # itself is guarded by the FileNotFoundError handler.
    print("Dataset loaded successfully.")
    print(f"Shape of the dataset: {df.shape}")

# Display the first 5 rows to understand the columns.
# `display` is the rich-output helper made available by IPython/Jupyter.
print("\n--- First 5 Rows of the Dataset ---")
display(df.head())

# --- Analysis 1: Token Distribution in Course Descriptions ---
print("\n--- Analyzing Description Length ---")
# Count the whitespace-separated words in each course description.
# Missing descriptions are replaced with "" first so they count as 0 words;
# a bare astype(str) would turn NaN into the literal "nan" and count it as 1.
# NOTE(review): these are words, not model tokens. Subword tokenizers usually
# emit more tokens than there are words, so the context-window comparison
# printed below is an approximation.
df["description_tokens"] = (
    df["description"].fillna("").astype(str).str.split().str.len()
)

# Computed once and reused for the marker line, its legend label and the
# summary message.
median_tokens = df["description_tokens"].median()

# Plot the distribution, with a dashed red line marking the median.
plt.figure(figsize=(12, 6))
sns.histplot(df["description_tokens"], bins=50, kde=True)
plt.title("Distribution of Token Count in Course Descriptions", fontsize=16)
plt.xlabel("Number of Tokens (Words)")
plt.ylabel("Number of Courses")
plt.axvline(
    median_tokens,
    color='red',
    linestyle='--',
    label=f'Median: {median_tokens:.0f} tokens',
)
plt.legend()
plt.show()

print(f"Key Insight: The median course description length is ~{median_tokens:.0f} tokens.")
# NOTE(review): this conclusion is printed unconditionally; confirm it
# against the median reported above before relying on it.
print("This is well within the context window of small LLMs like distilgpt2 (512-1024 tokens).")


# --- Analysis 2: Top Skills Mentioned in the Dataset ---
print("\n--- Analyzing Top Skills ---")
# The 'skills' column is a string of comma-separated values. We need to parse it.
# Rows without skills are dropped, each string is split into a list, and the
# lists are exploded so that every skill gets its own row.
# Splitting on "," and stripping afterwards (rather than splitting on the
# exact string ", ") keeps "Python,SQL" and "Python,  SQL" from producing
# merged or whitespace-padded duplicate categories.
skills_series = df["skills"].dropna().str.split(",").explode().str.strip()
# Discard empty entries left behind by trailing or doubled commas.
skills_series = skills_series[skills_series != ""]

# Count the frequency of each skill and keep the 15 most frequent.
top_skills = skills_series.value_counts().head(15)

# Plot the most common skills as horizontal bars (count on x, skill on y).
# NOTE(review): seaborn 0.13+ emits a FutureWarning for `palette` without
# `hue`; the suggested replacement (hue=<y variable>, legend=False) is not
# applied here because it may not be supported by older seaborn versions.
plt.figure(figsize=(12, 8))
sns.barplot(x=top_skills.values, y=top_skills.index, palette="viridis")
plt.title("Top 15 Most Common Skills on Coursera", fontsize=16)
plt.xlabel("Number of Courses")
plt.ylabel("Skill")
plt.show()

# NOTE(review): the skills named in this message are hard-coded, not derived
# from `top_skills`; check them against the chart above.
print("Key Insight: Foundational tech skills like Python, SQL, and Machine Learning are highly prevalent.")