In [None]:
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": ["# Text Summarization Assignment\n", "## Part 1: Data Loading"]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "\n",
        "N_SAMPLES = 100  # maximum number of rows kept from each split\n",
        "SEED = 42        # passed as random_state so the same rows are drawn on every run\n",
        "\n",
        "\n",
        "def load_sample(path, n=N_SAMPLES, seed=SEED):\n",
        "    \"\"\"Read the CSV at `path` and return at most `n` randomly sampled rows.\n",
        "\n",
        "    The sample size is clamped to the number of rows in the file because\n",
        "    `DataFrame.sample` raises ValueError when asked for more rows than exist\n",
        "    (with the default `replace=False`).\n",
        "    \"\"\"\n",
        "    full_df = pd.read_csv(path)\n",
        "    return full_df.sample(min(n, len(full_df)), random_state=seed)\n",
        "\n",
        "\n",
        "# Load datasets and keep a small random subset of each\n",
        "train_df = load_sample('train.csv')\n",
        "test_df = load_sample('test.csv')\n",
        "\n",
        "print('Train shape:', train_df.shape)\n",
        "print('Test shape:', test_df.shape)\n",
        "print('First 3 examples:')\n",
        "train_df.head(3)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": ["## Part 2: Model Setup"]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {},
      "outputs": [],
      "source": [
        "from transformers import T5ForConditionalGeneration, AutoTokenizer\n",
        "import torch\n",
        "\n",
        "# Use the GPU when PyTorch reports one, otherwise run on the CPU.\n",
        "if torch.cuda.is_available():\n",
        "    device = 'cuda'\n",
        "else:\n",
        "    device = 'cpu'\n",
        "\n",
        "# Tokenizer and weights are both loaded from the 't5-small' checkpoint.\n",
        "tokenizer = AutoTokenizer.from_pretrained('t5-small')\n",
        "model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)\n",
        "model.eval()  # put the module in evaluation mode\n",
        "\n",
        "print(f'Model on {device}')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": ["## Part 3: Summarization"]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {},
      "outputs": [],
      "source": [
        "def summarize(text, max_input_length=512, max_summary_length=150):\n",
        "    \"\"\"Generate a summary of `text` with the module-level T5 model.\n",
        "\n",
        "    Args:\n",
        "        text: Document to summarize; must be a `str`.\n",
        "        max_input_length: `max_length` passed to the tokenizer; longer\n",
        "            inputs are truncated.\n",
        "        max_summary_length: `max_length` passed to `model.generate`.\n",
        "\n",
        "    Returns:\n",
        "        The first generated sequence decoded to text, special tokens removed.\n",
        "\n",
        "    Raises:\n",
        "        TypeError: If `text` is not a string (e.g. NaN from a missing CSV value).\n",
        "    \"\"\"\n",
        "    if not isinstance(text, str):\n",
        "        raise TypeError(f'summarize() expects a str, got {type(text).__name__}')\n",
        "\n",
        "    # The 'summarize: ' prefix tells T5 which task to perform.\n",
        "    inputs = tokenizer(\n",
        "        'summarize: ' + text,\n",
        "        return_tensors='pt',\n",
        "        max_length=max_input_length,\n",
        "        truncation=True,\n",
        "    ).to(device)  # inputs must live on the same device as the model\n",
        "    outputs = model.generate(**inputs, max_length=max_summary_length)\n",
        "    return tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
        "\n",
        "# Print summaries for the first few test examples as a quick sanity check.\n",
        "N_EXAMPLES = 5\n",
        "\n",
        "# head() yields fewer rows instead of raising when test_df is shorter than N_EXAMPLES.\n",
        "for idx, text in enumerate(test_df['text'].head(N_EXAMPLES), start=1):\n",
        "    summary = summarize(text)\n",
        "    print(f'Summary {idx}: {summary}')"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}