In [2]:
# Install required libraries (offline setup)
%pip install transformers torch pandas numpy scikit-learn nltk rouge-score joblib -q

# Import necessary libraries
import os
import pandas as pd
import numpy as np
import torch
import joblib
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
nltk.download('punkt', quiet=True)
try:
    from rouge import Rouge
except ImportError:
    Rouge = None


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


In [3]:
# Configuration and quick validation
import os
from pathlib import Path

# Update these paths if your files are located elsewhere
PATH_A = r"C:\Users\Admin\Desktop\Dataset\userA_chats.csv"
PATH_B = r"C:\Users\Admin\Desktop\Dataset\userB_chats.csv"

# If you prefer a single combined file (Excel/CSV), set USE_COMBINED=True and provide COMBINED_PATH
USE_COMBINED = True
COMBINED_PATH = r"C:\Users\Admin\Desktop\Dataset\conversationfile.xlsx"

# Controls
RUN_TRAINING = False  # Set to True to run training
DEV_RUN = True        # Set to True to use a tiny subset for fast tests
JOBLIB_OUT = 'Model.joblib'


def validate_paths(paths=(PATH_A, PATH_B), preview_lines=6):
    for p in paths:
        print(f"Checking: {p}")
        if not os.path.exists(p):
            print("MISSING")
            print(" - Tip: place your CSV at this path or update PATH_A/PATH_B in the notebook.")
            print(" - Expected columns: timestamp, message (optional: user_id, conversation_id)")
        else:
            print("Exists -> preview:")
            try:
                with open(p, 'r', encoding='utf-8', errors='replace') as f:
                    for i, line in enumerate(f):
                        if i >= preview_lines:
                            break
                        print(line.rstrip())
            except Exception as e:
                print('Error reading file:', e)
        print('----')


def validate_combined(combined_path=COMBINED_PATH, preview_lines=6):
    print(f"Checking combined file: {combined_path}")
    if not os.path.exists(combined_path):
        print('MISSING')
        return
    # Try to read with pandas: excel first, then csv fallback
    try:
        import pandas as pd
        try:
            df = pd.read_excel(combined_path)
            print(f'Loaded with pandas.read_excel: {len(df)} rows; columns: {list(df.columns[:10])}')
            print('Preview (first rows):')
            with pd.option_context('display.max_colwidth', 120):
                print(df.head(5))
        except Exception as e_excel:
            print('read_excel failed, trying read_csv fallback ->', e_excel)
            try:
                df = pd.read_csv(combined_path, encoding='utf-8')
                print(f'Loaded as CSV fallback with {len(df)} rows; columns: {list(df.columns[:10])}')
                print('Preview (first rows):')
                with pd.option_context('display.max_colwidth', 120):
                    print(df.head(5))
            except Exception as e_csv:
                print('Could not read combined file as CSV either:', e_csv)
    except Exception as e:
        print('Pandas is not available or another error occurred:', e)
    print('----')

# Run quick validation when this cell is executed
if USE_COMBINED:
    validate_combined(COMBINED_PATH)
else:
    validate_paths()

Checking combined file: C:\Users\Admin\Desktop\Dataset\conversationfile.xlsx
read_excel failed, trying read_csv fallback -> Excel file format cannot be determined, you must specify an engine manually.
Loaded as CSV fallback with 22 rows; columns: ['Conversation ID', 'Timestamp', 'Sender', 'Message']
Preview (first rows):
   Conversation ID            Timestamp  Sender  \
0                1  2025-10-07 10:15:12  User B   
1                1  2025-10-07 10:15:45  User A   
2                1  2025-10-07 10:16:05  User B   
3                1  2025-10-07 10:16:38  User A   
4                1  2025-10-07 10:17:01  User B   

                                                                       Message  
0                       Hey, did you see the client's feedback on the mockups?  
1                 Just saw it. They want a lot of changes to the color scheme.  
2  Yeah, that's what I was thinking. It's a big shift from the original brief.  
3            I'll start on the revisions. Can 

In [4]:
COMBINED_PATH = r"C:\Users\Admin\Downloads\conversationfile.xlsx - userAuserB.csv"

import os, re
import pandas as pd

def split_combined_conversations(combined_path=COMBINED_PATH, path_a=PATH_A, path_b=PATH_B):
    if not os.path.exists(combined_path):
        print(f"Combined file not found: {combined_path}")
        print("If you have a single combined CSV/Excel, set COMBINED_PATH accordingly or upload the file to the Dataset folder.")
        return None

    # Try Excel first, then CSV
    try:
        df = pd.read_excel(combined_path)
        print("Loaded as Excel")
    except Exception:
        try:
            df = pd.read_csv(combined_path, encoding="utf-8")
            print("Loaded as CSV")
        except Exception as e:
            print("Failed to read combined file:", e)
            return None

    # helper to find columns by substring
    def find_col(df, *keys):
        keys = [k.lower() for k in keys]
        return next((c for c in df.columns if any(k in c.lower() for k in keys)), None)

    col_conv = find_col(df, "conversation")
    col_time = find_col(df, "time", "timestamp", "date")
    col_sender = find_col(df, "sender", "user", "from")
    col_msg = find_col(df, "message", "msg", "text", "content")

    print("Detected columns:")
    print(" conversation_id ->", col_conv)
    print(" timestamp       ->", col_time)
    print(" sender          ->", col_sender)
    print(" message         ->", col_msg)

    if col_msg is None or col_sender is None:
        print('Required columns not found. Ensure the file has at least "sender" and "message" columns.')
        return None

    # rename detected columns to canonical names
    rename_map = {}
    if col_msg and col_msg != "message":     rename_map[col_msg] = "message"
    if col_time and col_time != "timestamp": rename_map[col_time] = "timestamp"
    if col_conv and col_conv != "conversation_id": rename_map[col_conv] = "conversation_id"
    if col_sender and col_sender != "sender": rename_map[col_sender] = "sender"
    if rename_map:
        df = df.rename(columns=rename_map)

    # clean text
    df["message"] = df["message"].astype(str).str.replace("\n", " ").str.strip()

    # normalize sender to 'A' or 'B'
    def map_sender(x):
        if pd.isna(x):
            return None
        s = re.sub(r"[\s_\-]+", "", str(x).strip().lower())
        if not s:
            return None
        if s[0] == "a":
            return "A"
        if s[0] == "b":
            return "B"
        return str(x)

    df["sender_norm"] = df["sender"].apply(map_sender)

    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

    dfA = df[df["sender_norm"] == "A"].copy()
    dfB = df[df["sender_norm"] == "B"].copy()
    print(f"Found {len(dfA)} messages for User A and {len(dfB)} messages for User B")

    # ensure directories exist
    for target in (path_a, path_b):
        pdir = os.path.dirname(target)
        if pdir and not os.path.exists(pdir):
            os.makedirs(pdir, exist_ok=True)

    save_cols = [c for c in ("conversation_id", "timestamp", "message") if c in df.columns] or ["message"]
    dfA.to_csv(path_a, index=False, columns=save_cols)
    dfB.to_csv(path_b, index=False, columns=save_cols)

    print("Wrote:")
    print(" ", path_a)
    print(" ", path_b)
    return (path_a, path_b)

# Run splitting helper (will print results)
split_combined_conversations()

Loaded as CSV
Detected columns:
 conversation_id -> Conversation ID
 timestamp       -> Timestamp
 sender          -> Sender
 message         -> Message
Found 0 messages for User A and 0 messages for User B
Wrote:
  C:\Users\Admin\Desktop\Dataset\userA_chats.csv
  C:\Users\Admin\Desktop\Dataset\userB_chats.csv


('C:\\Users\\Admin\\Desktop\\Dataset\\userA_chats.csv',
 'C:\\Users\\Admin\\Desktop\\Dataset\\userB_chats.csv')

In [5]:
# Preprocessing: load and prepare conversations (accepts combined file directly)
import pandas as pd

def load_and_prepare_data(pathA=None, pathB=None, combined_path=None, min_context=3):
    
   
    def _from_combined(df):
        # normalize column names
        lower = {c.lower(): c for c in df.columns}
        def find_col(key_substrs):
            for k in lower:
                lk = k.lower()
                for s in key_substrs:
                    if s in lk:
                        return lower[k]
            return None
        col_time = find_col(['time', 'timestamp', 'date'])
        col_sender = find_col(['sender', 'user', 'from'])
        col_msg = find_col(['message', 'msg', 'text', 'content'])
        if col_msg is None or col_sender is None:
            raise ValueError('Combined file must contain at least sender and message columns')
        # rename
        if col_msg != 'message':
            df = df.rename(columns={col_msg: 'message'})
        if col_time and col_time != 'timestamp':
            df = df.rename(columns={col_time: 'timestamp'})
        if col_sender and col_sender != 'sender':
            df = df.rename(columns={col_sender: 'sender'})
        # clean message
        df['message'] = df['message'].astype(str).str.replace('\n', ' ').str.strip()
        # normalize sender to 'A'/'B'
        def map_sender(x):
            if pd.isna(x):
                return None
            s = str(x).strip().lower()
            if s in ('user a', 'a', 'user_a', 'user-a') or s == 'a':
                return 'A'
            if s in ('user b', 'b', 'user_b', 'user-b') or s == 'b':
                return 'B'
            # fallback: try to extract last character if it's A/B
            s_clean = s.replace(' ', '').replace('_','').replace('-','')
            if s_clean.endswith('a'):
                return 'A'
            if s_clean.endswith('b'):
                return 'B'
            return s
        df['speaker'] = df['sender'].apply(map_sender)
        # convert timestamp
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
        else:
            # if no timestamp, create a sequential index to preserve order
            df['timestamp'] = pd.RangeIndex(start=0, stop=len(df))
        df = df.sort_values('timestamp').reset_index(drop=True)
        dfA = df[df['speaker'] == 'A'].copy()
        dfB = df[df['speaker'] == 'B'].copy()
        return dfA, dfB, df

    # Load from combined
    if combined_path is not None:
        if not os.path.exists(combined_path):
            raise FileNotFoundError(f'Combined file not found: {combined_path}')
        try:
            df_comb = pd.read_excel(combined_path)
        except Exception:
            df_comb = pd.read_csv(combined_path, encoding='utf-8')
        dfA, dfB, merged = _from_combined(df_comb)
    else:
        # require both paths
        if pathA is None or pathB is None:
            raise ValueError('Provide either combined_path or both pathA and pathB')
        dfA = pd.read_csv(pathA)
        dfB = pd.read_csv(pathB)
        for df in (dfA, dfB):
            if 'timestamp' not in df.columns:
                df['timestamp'] = pd.RangeIndex(start=0, stop=len(df))
            if 'message' not in df.columns:
                df['message'] = ''
            df['message'] = df['message'].astype(str).str.replace('\n', ' ').str.strip()
        dfA['speaker'] = 'A'
        dfB['speaker'] = 'B'
        merged = pd.concat([dfA, dfB], ignore_index=True).sort_values('timestamp').reset_index(drop=True)

    # Build conversation sliding windows where the target response is by 'A'
    conversations = []
    for i in range(min_context, len(merged)):
        if merged.loc[i, 'speaker'] == 'A':
            context_parts = []
            for j in range(i-min_context, i):
                s = merged.loc[j, 'speaker']
                m = merged.loc[j, 'message']
                context_parts.append(f"{s}: {m}")
            context = ' | '.join(context_parts)
            response = merged.loc[i, 'message']
            conversations.append(f"{context} <SEP> A: {response}")

    return pd.DataFrame({'text': conversations})

# Load data (select combined or separated files based on config flags)
print('Loading data...')
try:
    if 'USE_COMBINED' in globals() and USE_COMBINED:
        data = load_and_prepare_data(combined_path=COMBINED_PATH)
    else:
        data = load_and_prepare_data(pathA=PATH_A, pathB=PATH_B)
    print(f'Total pairs: {len(data)}')
except Exception as e:
    print('Error while preparing data:', e)
    data = pd.DataFrame({'text': []})


Loading data...
Total pairs: 10


In [6]:
# Tokenization, Dataset and Model Init (short fallback)
class ChatDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=256):
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        if self.texts:
            self.encodings = tokenizer(self.texts, truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')
        else:
            import torch as _t
            self.encodings = {'input_ids': _t.empty((0, max_len), dtype=_t.long), 'attention_mask': _t.empty((0, max_len), dtype=_t.long)}
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, i):
        return {'input_ids': self.encodings['input_ids'][i], 'attention_mask': self.encodings['attention_mask'][i], 'labels': self.encodings['input_ids'][i]}

print('Loading tokenizer & model...')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')


if 'data' not in globals() or data is None:
    data = None
    if 'load_and_prepare_data' in globals():
        try:
            data = load_and_prepare_data(combined_path=COMBINED_PATH) if globals().get('USE_COMBINED') else load_and_prepare_data(pathA=PATH_A, pathB=PATH_B)
        except Exception as e:
            print('helper failed ->', e)
            data = None
    if data is None:
        try:
            if globals().get('USE_COMBINED') and os.path.exists(COMBINED_PATH):
                try:
                    df = pd.read_excel(COMBINED_PATH)
                except Exception:
                    df = pd.read_csv(COMBINED_PATH, encoding='utf-8')
                msg = next((c for c in df.columns if 'message' in c.lower() or 'text' in c.lower()), None)
                snd = next((c for c in df.columns if 'sender' in c.lower() or 'user' in c.lower()), None)
                ts  = next((c for c in df.columns if 'time' in c.lower() or 'date' in c.lower()), None)
                if msg and snd:
                    df['message'] = df[msg].astype(str).str.replace('\n',' ').str.strip()
                    df['speaker'] = df[snd].astype(str).str.strip().str.lower().str[0].map({'a':'A','b':'B'})
                    if ts is not None:
                        try:
                            df[ts] = pd.to_datetime(df[ts], errors='coerce')
                            df = df.sort_values(ts).reset_index(drop=True)
                        except Exception:
                            pass
                    merged = df
                    convs = []
                    for i in range(3, len(merged)):
                        if merged.loc[i, 'speaker'] == 'A':
                            ctx = ' | '.join(f"{merged.loc[j,'speaker']}: {merged.loc[j,'message']}" for j in range(i-3, i))
                            convs.append(f"{ctx} <SEP> A: {merged.loc[i,'message']}")
                    data = pd.DataFrame({'text': convs})
                else:
                    data = pd.DataFrame({'text': []})
            else:
                if os.path.exists(PATH_A) and os.path.exists(PATH_B):
                    dfA = pd.read_csv(PATH_A); dfB = pd.read_csv(PATH_B)
                    for d in (dfA, dfB):
                        if 'message' not in d.columns:
                            d['message'] = ''
                    dfA['speaker'] = 'A'; dfB['speaker'] = 'B'
                    merged = pd.concat([dfA, dfB], ignore_index=True).reset_index(drop=True)
                    convs = []
                    for i in range(3, len(merged)):
                        if merged.loc[i, 'speaker'] == 'A':
                            ctx = ' | '.join(f"{merged.loc[j,'speaker']}: {merged.loc[j,'message']}" for j in range(i-3, i))
                            convs.append(f"{ctx} <SEP> A: {merged.loc[i,'message']}")
                    data = pd.DataFrame({'text': convs})
                else:
                    data = pd.DataFrame({'text': []})
        except Exception as e:
            print('Fallback load failed ->', e)
            data = pd.DataFrame({'text': []})

texts = pd.Series(data['text']) if isinstance(data, pd.DataFrame) and 'text' in data.columns else pd.Series(list(data) if data is not None else [], dtype=object)
texts = texts.reset_index(drop=True)
# Split safely
if len(texts) < 2:
    train_texts, val_texts = texts, texts.iloc[:0]
else:
    train_texts, val_texts = map(pd.Series, train_test_split(texts, test_size=0.15, random_state=42))
if globals().get('DEV_RUN'):
    train_texts, val_texts = train_texts.iloc[:32], val_texts.iloc[:8]
train_dataset, val_dataset = ChatDataset(train_texts, tokenizer), ChatDataset(val_texts, tokenizer)
print(f'Train: {len(train_dataset)} | Val: {len(val_dataset)}')

Loading tokenizer & model...
Train: 8 | Val: 2
Train: 8 | Val: 2


In [7]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import torch

_dev = globals().get('DEV_RUN', False)
RUN_TRAINING = globals().get('RUN_TRAINING', False)

if _dev and not RUN_TRAINING:
    print('DEV_RUN=True and RUN_TRAINING=False — enabling a quick smoke run for this session')
    _epochs = 1
    _batch = 1
    _logging = 1
    RUN_TRAINING = True
else:
    _epochs = 3
    _batch = 4
    _logging = 50

training_args = TrainingArguments(
    output_dir='./chat_model',
    num_train_epochs=_epochs,
    per_device_train_batch_size=_batch,
    per_device_eval_batch_size=_batch,
    warmup_steps=10 if _dev else 100,
    weight_decay=0.01,
    logging_steps=_logging,
    do_eval=True,
    save_steps=500,
    save_total_limit=1,
    fp16=torch.cuda.is_available(),
    learning_rate=5e-5
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)

if RUN_TRAINING:
    print("🚀 Starting training (smoke-run settings if DEV_RUN=True)...")
    trainer.train()
    print("✅ Training complete!")
else:
    print("RUN_TRAINING is False — skipping trainer.train()")

DEV_RUN=True and RUN_TRAINING=False — enabling a quick smoke run for this session
🚀 Starting training (smoke-run settings if DEV_RUN=True)...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,3.3738
2,3.8652
3,4.0246
4,3.7578
5,3.6158
6,3.2471
7,3.1896
8,3.2079


✅ Training complete!


In [9]:
# Generation utility
def generate_reply(context, model, tokenizer, num_replies=3, max_length=100):
    input_text = f"{context} <SEP> A:"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    if torch.cuda.is_available():
        input_ids = input_ids.to(model.device)

    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=num_replies,
        temperature=0.8,
        top_k=50,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    replies = [tokenizer.decode(out, skip_special_tokens=True).split('A:')[-1].strip() for out in outputs]
    return replies


context = "B: How are you? | A: Good! | B: What's up?"
print(generate_reply(context, model, tokenizer))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


["Well, it's a good idea, so you", "Okay, we're", 'Well, I\'m really confused. I have a little bit of a story.\n\n<SE: Yeah, I can tell you, "I\'m a little bit of a kid. And I am a kid. I have a good little story, and I\'m happy to talk to you."\n\n<P> I guess that\'s the point. I think you could']


In [10]:
# Evaluation utilities
def calculate_metrics(references, predictions, model=None, tokenizer=None):
    smooth = SmoothingFunction()
    bleu_scores = []
    for ref, pred in zip(references, predictions):
        try:
            s = sentence_bleu([ref.split()], pred.split(), smoothing_function=smooth.method1)
        except Exception:
            s = 0.0
        bleu_scores.append(s)
    bleu = float(np.mean(bleu_scores)) if bleu_scores else 0.0

    if Rouge is not None:
        try:
            rouge_scores = Rouge().get_scores(predictions, references, avg=True)
        except Exception:
            rouge_scores = {'rouge-1': {'f': 0.0}, 'rouge-2': {'f': 0.0}, 'rouge-l': {'f': 0.0}}
    else:
        rouge_scores = {'rouge-1': {'f': 0.0}, 'rouge-2': {'f': 0.0}, 'rouge-l': {'f': 0.0}}

    if model is not None and tokenizer is not None and len(references) > 0:
        model.eval()
        total_loss = 0.0
        n = min(50, len(references))
        with torch.no_grad():
            for text in references[:n]:
                inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=256)
                if torch.cuda.is_available():
                    inputs = {k: v.to(model.device) for k, v in inputs.items()}
                outputs = model(**inputs, labels=inputs['input_ids'])
                total_loss += outputs.loss.item()
        perplexity = float(np.exp(total_loss / n)) if n > 0 else float('inf')
    else:
        perplexity = float('inf')

    return bleu, rouge_scores, perplexity

In [11]:

def save_pipeline_package(save_path=JOBLIB_OUT, bleu=0.0, rouge=None, perplexity=float('inf')):
    rouge = rouge or {'rouge-1': {'f': 0.0}, 'rouge-2': {'f': 0.0}, 'rouge-l': {'f': 0.0}}
    pipeline_package = {
        'model_name': 'gpt2-chat-finetuned',
        'tokenizer': tokenizer,
        'generation_config': {
            'max_length': 100,
            'temperature': 0.8,
            'top_k': 50,
            'top_p': 0.95
        },
        'metrics': {
            'bleu': float(bleu),
            'rouge_1': float(rouge['rouge-1']['f']),
            'rouge_2': float(rouge['rouge-2']['f']),
            'rouge_l': float(rouge['rouge-l']['f']),
            'perplexity': float(perplexity)
        }
    }
    joblib.dump(pipeline_package, save_path)
    print(f'\nModel package saved as "{save_path}"')

In [12]:
# Quick preview: dataset sizes and up to 3 training examples
print('Train dataset length:', len(train_dataset))
print('Val dataset length:  ', len(val_dataset))

try:
    for i in range(min(3, len(train_dataset))):
        ids = train_dataset[i]['input_ids']
        txt = tokenizer.decode(ids, skip_special_tokens=True).strip()
        print(f"Example {i+1}: {txt}")
except Exception as e:
    print('Could not decode preview examples:', e)

print('\nTo run training now: set RUN_TRAINING=True and (optionally) DEV_RUN=True for a quick smoke run, then re-run the Training cell.')

Train dataset length: 8
Val dataset length:   2
Example 1: A: "How about around 3 PM?" | A: "Are you free? My laptop just went blank." | B: "Oh no. Did you try a hard reboot?" <SEP> A: "Tried it twice. Nothing."
Example 2: B: "Hey, did you see the client's feedback on the mockups?" | A: "Just saw it. They want a lot of changes to the color scheme." | B: "Yeah, that's what I was thinking. It's a big shift from the original brief." <SEP> A: "I'll start on the revisions. Can you update the project timeline?"
Example 3: B: "Okay, try connecting it to an external monitor. Maybe the display is the issue." | A: "Good idea, let me find a cable." | B: "Let me know if that works. If not, we might have to call IT." <SEP> A: "Finally watched that new sci-fi movie everyone's talking about."

To run training now: set RUN_TRAINING=True and (optionally) DEV_RUN=True for a quick smoke run, then re-run the Training cell.
