In [11]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Cleaning and Visualization\n",
    "This notebook demonstrates data preprocessing, cleaning, and visualization using Python. It is part of my Data Science coursework."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import Libraries"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Set plot style\n",
    "sns.set(style=\"whitegrid\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Cleaned Data"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "data = pd.read_csv('../data/cleaned_data.csv')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Exploration"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Basic info\n",
    "data.info()\n",
    "\n",
    "# Summary statistics\n",
    "data.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualizations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Age Distribution"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "plt.figure(figsize=(6,4))\n",
    "sns.histplot(data['Age'], bins=5, kde=True, color='skyblue')\n",
    "plt.title('Age Distribution')\n",
    "plt.xlabel('Age')\n",
    "plt.ylabel('Count')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Score Distribution"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "plt.figure(figsize=(6,4))\n",
    "sns.histplot(data['Score'], bins=5, kde=True, color='orange')\n",
    "plt.title('Score Distribution')\n",
    "plt.xlabel('Score')\n",
    "plt.ylabel('Count')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Scores by Name"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "plt.figure(figsize=(6,4))\n",
    "sns.barplot(x='Name', y='Score', data=data, palette='viridis', hue=None, dodge=False)\n",
    "plt.title('Scores by Name')\n",
    "plt.xlabel('Name')\n",
    "plt.ylabel('Score')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Correlation Heatmap"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Select numeric columns only\n",
    "numeric_data = data.select_dtypes(include='number')\n",
    "plt.figure(figsize=(4,3))\n",
    "sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm')\n",
    "plt.title('Correlation Matrix')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optional: Save Plots for Submission"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Example: save Age distribution\n",
    "plt.figure(figsize=(6,4))\n",
    "sns.histplot(data['Age'], bins=5, kde=True, color='skyblue')\n",
    "plt.title('Age Distribution')\n",
    "plt.xlabel('Age')\n",
    "plt.ylabel('Count')\n",
    "plt.savefig('../plots/age_distribution_notebook.png')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Conclusion\n",
    "This notebook provides an interactive demonstration of data cleaning and visualization, including histograms, bar charts, and correlation analysis."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


{'cells': [{'cell_type': 'markdown',
   'metadata': {},
   'source': ['# Data Cleaning and Visualization\n',
    'This notebook demonstrates data preprocessing, cleaning, and visualization using Python. It is part of my Data Science coursework.']},
  {'cell_type': 'markdown', 'metadata': {}, 'source': ['## Import Libraries']},
  {'cell_type': 'code',
   'metadata': {},
   'source': ['import pandas as pd\n',
    'import matplotlib.pyplot as plt\n',
    'import seaborn as sns\n',
    '\n',
    '# Set plot style\n',
    'sns.set(style="whitegrid")']},
  {'cell_type': 'markdown',
   'metadata': {},
   'source': ['## Load Cleaned Data']},
  {'cell_type': 'code',
   'metadata': {},
   'source': ["data = pd.read_csv('../data/cleaned_data.csv')\n",
    'data.head()']},
  {'cell_type': 'markdown', 'metadata': {}, 'source': ['## Data Exploration']},
  {'cell_type': 'code',
   'metadata': {},
   'source': ['# Basic info\n',
    'data.info()\n',
    '\n',
    '# Summary statistics\n',
    'data.de