In [2]:
# Install required libraries (offline setup)
%pip install transformers torch pandas numpy scikit-learn nltk rouge-score joblib -q

# Import necessary libraries
import os
import pandas as pd
import numpy as np
import torch
import joblib
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
nltk.download('punkt', quiet=True)
try:
    from rouge import Rouge
except ImportError:
    Rouge = None


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


In [None]:
# Configuration and quick validation
import os
from pathlib import Path

# Update these paths if your files are located elsewhere
PATH_A = r"C:\Users\Admin\Desktop\Dataset\userA_chats.csv"
PATH_B = r"C:\Users\Admin\Desktop\Dataset\userB_chats.csv"

# If you prefer a single combined file (Excel/CSV), set USE_COMBINED=True and provide COMBINED_PATH
USE_COMBINED = True
COMBINED_PATH = r"C:\Users\Admin\Desktop\Dataset\conversationfile.xlsx"

# Controls
RUN_TRAINING = False  # Set to True to run training
DEV_RUN = True        # Set to True to use a tiny subset for fast tests
JOBLIB_OUT = 'Model.joblib'


def validate_paths(paths=(PATH_A, PATH_B), preview_lines=6):
    """Check existence of dataset files and print a small preview (header + few rows).
    Prints clear guidance if files are missing or unreadable.
    """
    for p in paths:
        print(f"Checking: {p}")
        if not os.path.exists(p):
            print("MISSING")
            print(" - Tip: place your CSV at this path or update PATH_A/PATH_B in the notebook.")
            print(" - Expected columns: timestamp, message (optional: user_id, conversation_id)")
        else:
            print("Exists -> preview:")
            try:
                with open(p, 'r', encoding='utf-8', errors='replace') as f:
                    for i, line in enumerate(f):
                        if i >= preview_lines:
                            break
                        print(line.rstrip())
            except Exception as e:
                print('Error reading file:', e)
        print('----')


def validate_combined(combined_path=COMBINED_PATH, preview_lines=6):
    """Validate combined Excel/CSV file existence and print a brief preview.
    Tries to read as Excel first, but falls back to CSV when read_excel fails.
    """
    print(f"Checking combined file: {combined_path}")
    if not os.path.exists(combined_path):
        print('MISSING')
        print(' - Tip: place the combined conversation file at COMBINED_PATH or update the path above.')
        return
    # Try to read with pandas: excel first, then csv fallback
    try:
        import pandas as pd
        try:
            df = pd.read_excel(combined_path)
            print(f'Loaded with pandas.read_excel: {len(df)} rows; columns: {list(df.columns[:10])}')
            print('Preview (first rows):')
            with pd.option_context('display.max_colwidth', 120):
                print(df.head(5))
        except Exception as e_excel:
            print('read_excel failed, trying read_csv fallback ->', e_excel)
            try:
                df = pd.read_csv(combined_path, encoding='utf-8')
                print(f'Loaded as CSV fallback with {len(df)} rows; columns: {list(df.columns[:10])}')
                print('Preview (first rows):')
                with pd.option_context('display.max_colwidth', 120):
                    print(df.head(5))
            except Exception as e_csv:
                print('Could not read combined file as CSV either:', e_csv)
    except Exception as e:
        print('Pandas is not available or another error occurred:', e)
    print('----')

# Run quick validation when this cell is executed
if USE_COMBINED:
    validate_combined(COMBINED_PATH)
else:
    validate_paths()

In [None]:

COMBINED_PATH = r"C:\Users\Admin\Downloads\conversationfile.xlsx - userAuserB.csv"

import pandas as pd

def split_combined_conversations(combined_path=COMBINED_PATH, path_a=PATH_A, path_b=PATH_B):
    """Load a combined conversation file and write two CSVs: one for User A and one for User B.
    Expects columns (case-insensitive): conversation_id, timestamp, sender, message
    """
    if not os.path.exists(combined_path):
        print(f"Combined file not found: {combined_path}")
        print("If you have a single combined CSV/Excel, set COMBINED_PATH accordingly or upload the file to the Dataset folder.")
        return None

    # Try Excel first, then CSV
    try:
        df = pd.read_excel(combined_path)
        print('Loaded as Excel')
    except Exception:
        try:
            df = pd.read_csv(combined_path, encoding='utf-8')
            print('Loaded as CSV')
        except Exception as e:
            print('Failed to read combined file:', e)
            return None

    # Normalize column names to lower-case keys
    cols = {c.lower(): c for c in df.columns}
    lower = {k.lower(): k for k in df.columns}

    # Detect required columns
    def find_col(key_substrs):
        for k in lower:
            lk = k.lower()
            for s in key_substrs:
                if s in lk:
                    return lower[k]
        return None

    col_conv = find_col(['conversation'])
    col_time = find_col(['time', 'timestamp', 'date'])
    col_sender = find_col(['sender', 'user', 'from'])
    col_msg = find_col(['message', 'msg', 'text', 'content'])

    print('Detected columns:')
    print(' conversation_id ->', col_conv)
    print(' timestamp       ->', col_time)
    print(' sender          ->', col_sender)
    print(' message         ->', col_msg)

    if col_msg is None or col_sender is None:
        print('Required columns not found. Ensure the file has at least "sender" and "message" columns.')
        return None

    # Keep needed columns and sanitize
    df = df.rename(columns={col_msg: 'message'}) if col_msg != 'message' else df
    if col_time:
        df = df.rename(columns={col_time: 'timestamp'}) if col_time != 'timestamp' else df
    if col_conv:
        df = df.rename(columns={col_conv: 'conversation_id'}) if col_conv != 'conversation_id' else df
    if col_sender:
        df = df.rename(columns={col_sender: 'sender'}) if col_sender != 'sender' else df

    # Clean message text
    df['message'] = df['message'].astype(str).str.replace('\n', ' ').str.strip()

    # Normalize sender values to 'A' or 'B'
    def map_sender(x):
        if pd.isna(x):
            return None
        s = str(x).strip().lower()
        if s in ('user a', 'a', 'user_a', 'user-a'):
            return 'A'
        if s in ('user b', 'b', 'user_b', 'user-b'):
            return 'B'
        # catch common patterns
        if s.startswith('user a') or s == 'a':
            return 'A'
        if s.startswith('user b') or s == 'b':
            return 'B'
        # If sender contains just a single letter
        if s == 'a':
            return 'A'
        if s == 'b':
            return 'B'
        return s  # fallback: keep original

    df['sender_norm'] = df['sender'].apply(map_sender)

    # Convert timestamp where present
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

    # Split
    dfA = df[df['sender_norm'] == 'A'].copy()
    dfB = df[df['sender_norm'] == 'B'].copy()

    print(f'Found {len(dfA)} messages for User A and {len(dfB)} messages for User B')

    # Ensure target directory exists
    for target in (path_a, path_b):
        pdir = os.path.dirname(target)
        if pdir and not os.path.exists(pdir):
            os.makedirs(pdir, exist_ok=True)

    # Save with consistent columns (timestamp, message, conversation_id)
    save_cols = [c for c in ['conversation_id', 'timestamp', 'message'] if c in df.columns]
    if save_cols == []:
        save_cols = ['message']

    dfA.to_csv(path_a, index=False, columns=save_cols)
    dfB.to_csv(path_b, index=False, columns=save_cols)

    print('Wrote:')
    print(' ', path_a)
    print(' ', path_b)
    return (path_a, path_b)

# Run splitting helper (will print results)
split_combined_conversations()

In [None]:
# Preprocessing: load and prepare conversations (accepts combined file directly)
import pandas as pd

def load_and_prepare_data(pathA=None, pathB=None, combined_path=None, min_context=3):
    
   
    def _from_combined(df):
        # normalize column names
        lower = {c.lower(): c for c in df.columns}
        def find_col(key_substrs):
            for k in lower:
                lk = k.lower()
                for s in key_substrs:
                    if s in lk:
                        return lower[k]
            return None
        col_time = find_col(['time', 'timestamp', 'date'])
        col_sender = find_col(['sender', 'user', 'from'])
        col_msg = find_col(['message', 'msg', 'text', 'content'])
        if col_msg is None or col_sender is None:
            raise ValueError('Combined file must contain at least sender and message columns')
        # rename
        if col_msg != 'message':
            df = df.rename(columns={col_msg: 'message'})
        if col_time and col_time != 'timestamp':
            df = df.rename(columns={col_time: 'timestamp'})
        if col_sender and col_sender != 'sender':
            df = df.rename(columns={col_sender: 'sender'})
        # clean message
        df['message'] = df['message'].astype(str).str.replace('\n', ' ').str.strip()
        # normalize sender to 'A'/'B'
        def map_sender(x):
            if pd.isna(x):
                return None
            s = str(x).strip().lower()
            if s in ('user a', 'a', 'user_a', 'user-a') or s == 'a':
                return 'A'
            if s in ('user b', 'b', 'user_b', 'user-b') or s == 'b':
                return 'B'
            # fallback: try to extract last character if it's A/B
            s_clean = s.replace(' ', '').replace('_','').replace('-','')
            if s_clean.endswith('a'):
                return 'A'
            if s_clean.endswith('b'):
                return 'B'
            return s
        df['speaker'] = df['sender'].apply(map_sender)
        # convert timestamp
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
        else:
            # if no timestamp, create a sequential index to preserve order
            df['timestamp'] = pd.RangeIndex(start=0, stop=len(df))
        df = df.sort_values('timestamp').reset_index(drop=True)
        dfA = df[df['speaker'] == 'A'].copy()
        dfB = df[df['speaker'] == 'B'].copy()
        return dfA, dfB, df

    # Load from combined
    if combined_path is not None:
        if not os.path.exists(combined_path):
            raise FileNotFoundError(f'Combined file not found: {combined_path}')
        try:
            df_comb = pd.read_excel(combined_path)
        except Exception:
            df_comb = pd.read_csv(combined_path, encoding='utf-8')
        dfA, dfB, merged = _from_combined(df_comb)
    else:
        # require both paths
        if pathA is None or pathB is None:
            raise ValueError('Provide either combined_path or both pathA and pathB')
        dfA = pd.read_csv(pathA)
        dfB = pd.read_csv(pathB)
        for df in (dfA, dfB):
            if 'timestamp' not in df.columns:
                df['timestamp'] = pd.RangeIndex(start=0, stop=len(df))
            if 'message' not in df.columns:
                df['message'] = ''
            df['message'] = df['message'].astype(str).str.replace('\n', ' ').str.strip()
        dfA['speaker'] = 'A'
        dfB['speaker'] = 'B'
        merged = pd.concat([dfA, dfB], ignore_index=True).sort_values('timestamp').reset_index(drop=True)

    # Build conversation sliding windows where the target response is by 'A'
    conversations = []
    for i in range(min_context, len(merged)):
        if merged.loc[i, 'speaker'] == 'A':
            context_parts = []
            for j in range(i-min_context, i):
                s = merged.loc[j, 'speaker']
                m = merged.loc[j, 'message']
                context_parts.append(f"{s}: {m}")
            context = ' | '.join(context_parts)
            response = merged.loc[i, 'message']
            conversations.append(f"{context} <SEP> A: {response}")

    return pd.DataFrame({'text': conversations})

# Load data (select combined or separated files based on config flags)
print('Loading data...')
try:
    if 'USE_COMBINED' in globals() and USE_COMBINED:
        data = load_and_prepare_data(combined_path=COMBINED_PATH)
    else:
        data = load_and_prepare_data(pathA=PATH_A, pathB=PATH_B)
    print(f'Total pairs: {len(data)}')
except Exception as e:
    print('Error while preparing data:', e)
    data = pd.DataFrame({'text': []})


In [None]:
# Tokenization, Dataset and Model Init
class ChatDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=256):
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        self.encodings = tokenizer(self.texts, truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.encodings['input_ids'][idx]
        }

print('Loading tokenizer & model...')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Create datasets (use small dev sample if DEV_RUN True)
train_texts, val_texts = train_test_split(data['text'], test_size=0.15, random_state=42)
if DEV_RUN:
    train_texts = train_texts.iloc[:32]
    val_texts = val_texts.iloc[:8]

train_dataset = ChatDataset(train_texts, tokenizer)
val_dataset = ChatDataset(val_texts, tokenizer)
print(f'Train: {len(train_dataset)} | Val: {len(val_dataset)}')

In [None]:
# Training setup (guarded)
training_args = TrainingArguments(
    output_dir='./chat_model',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_steps=50,
    evaluation_strategy='steps',
    eval_steps=200,
    save_steps=500,
    save_total_limit=1,
    fp16=torch.cuda.is_available(),
    learning_rate=5e-5
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)

if RUN_TRAINING:
    print('Starting training...')
    trainer.train()
    print('✓ Training complete!')
else:
    print('RUN_TRAINING is False — skipping trainer.train()')

In [None]:
# Generation utility
def generate_reply(context, model, tokenizer, num_replies=3, max_length=100):
    input_text = f"{context} <SEP> A:"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    if torch.cuda.is_available():
        input_ids = input_ids.to(model.device)

    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=num_replies,
        temperature=0.8,
        top_k=50,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    replies = [tokenizer.decode(out, skip_special_tokens=True).split('A:')[-1].strip() for out in outputs]
    return replies


# context = "B: How are you? | A: Good! | B: What's up?"
# print(generate_reply(context, model, tokenizer))

In [None]:
# Evaluation utilities
def calculate_metrics(references, predictions, model=None, tokenizer=None):
    smooth = SmoothingFunction()
    bleu_scores = []
    for ref, pred in zip(references, predictions):
        try:
            s = sentence_bleu([ref.split()], pred.split(), smoothing_function=smooth.method1)
        except Exception:
            s = 0.0
        bleu_scores.append(s)
    bleu = float(np.mean(bleu_scores)) if bleu_scores else 0.0

    if Rouge is not None:
        try:
            rouge_scores = Rouge().get_scores(predictions, references, avg=True)
        except Exception:
            rouge_scores = {'rouge-1': {'f': 0.0}, 'rouge-2': {'f': 0.0}, 'rouge-l': {'f': 0.0}}
    else:
        rouge_scores = {'rouge-1': {'f': 0.0}, 'rouge-2': {'f': 0.0}, 'rouge-l': {'f': 0.0}}

    if model is not None and tokenizer is not None and len(references) > 0:
        model.eval()
        total_loss = 0.0
        n = min(50, len(references))
        with torch.no_grad():
            for text in references[:n]:
                inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=256)
                if torch.cuda.is_available():
                    inputs = {k: v.to(model.device) for k, v in inputs.items()}
                outputs = model(**inputs, labels=inputs['input_ids'])
                total_loss += outputs.loss.item()
        perplexity = float(np.exp(total_loss / n)) if n > 0 else float('inf')
    else:
        perplexity = float('inf')

    return bleu, rouge_scores, perplexity

In [None]:

def save_pipeline_package(save_path=JOBLIB_OUT, bleu=0.0, rouge=None, perplexity=float('inf')):
    rouge = rouge or {'rouge-1': {'f': 0.0}, 'rouge-2': {'f': 0.0}, 'rouge-l': {'f': 0.0}}
    pipeline_package = {
        'model_name': 'gpt2-chat-finetuned',
        'tokenizer': tokenizer,
        'generation_config': {
            'max_length': 100,
            'temperature': 0.8,
            'top_k': 50,
            'top_p': 0.95
        },
        'metrics': {
            'bleu': float(bleu),
            'rouge_1': float(rouge['rouge-1']['f']),
            'rouge_2': float(rouge['rouge-2']['f']),
            'rouge_l': float(rouge['rouge-l']['f']),
            'perplexity': float(perplexity)
        }
    }
    joblib.dump(pipeline_package, save_path)
    print(f'\nModel package saved as "{save_path}"')

NameError: name 'JOBLIB_OUT' is not defined