<a href="https://colab.research.google.com/github/b2220356179/finetune/blob/main/distilbert_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
import tensorflow as tf
from sklearn.metrics import confusion_matrix
import tf_keras as keras

# Load and preprocess dataset
df = pd.read_csv('/content/drive/MyDrive/mbti_1.csv', names=["type", "posts"])

# Convert your target labels into one-hot encoded format (you have 16 classes)
df['label'] = pd.Categorical(df['type'])
df['label'] = df['label'].cat.codes

# Check for missing values
print(df.isnull().sum())

# Optionally drop missing or invalid rows
df.dropna(inplace=True)

# Check for empty strings in posts
empty_posts = df['posts'].apply(lambda x: len(str(x).strip()) == 0).sum()
print(f'Empty posts: {empty_posts}')

# Remove empty posts if found
df = df[df['posts'].apply(lambda x: len(str(x).strip()) > 0)]

# Split data
X_train, X_test, y_train, y_test = train_test_split(df['posts'], df['label'], test_size=0.2, random_state=42)

print(f'Min label: {df["label"].min()}, Max label: {df["label"].max()}')

# Initialize tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=16)

# Tokenize text
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=512)


# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test))

# Batch and shuffle datasets
train_dataset = train_dataset.batch(16).shuffle(len(X_train))
test_dataset = test_dataset.batch(16)

# Compile model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Train model
model.fit(train_dataset, epochs=3, validation_data=test_dataset)

# Evaluate model
model.evaluate(test_dataset)

# Make predictions
predictions = model.predict(test_dataset)
y_pred = tf.argmax(predictions.logits, axis=-1)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

type     0
posts    0
label    0
dtype: int64
Empty posts: 0
Min label: 0, Max label: 16


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/3
Epoch 2/3
Epoch 3/3
[[ 14   0   0   0   0   0   0   0   8   4   1   2   1   0   1   1   0]
 [  1  75   1   4   1   0   0   1  10  24   4   5   2   3   2   4   0]
 [  3   3  19   2   0   0   0   0   6   6   2   6   0   0   0   2   0]
 [  1  11   3  57   2   0   0   0  22  13   8  11   4   0   1   7   0]
 [  0   0   0   0   3   0   0   0   0   0   0   0   2   0   0   0   0]
 [  0   1   0   1   0   0   0   0   1   1   1   0   1   1   0   3   0]
 [  0   0   0   0   0   0   0   1   1   2   1   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   3   3   2   3   0   0   0   1   4   0]
 [  0  11   1   2   1   0   0   0 176  56  10   7   1   2   4   9   0]
 [  4  13   0   1   1   0   0   0  58 274  11   9   1   6   3   6   0]
 [  1   9   3   6   1   0   0   1  32  25 119  17   2   1   1   5   0]
 [  1   3   7  11   1   0   0   0  40  48  18 132   5   1   0   7   0]
 [  1   0   0   0   0   0   0   0   6   3   3   1  19   0   0   0   0]
 [  0   2   0   0   0   0   0   1   1  23   1  

In [8]:
import os

# Create directories for the model and tokenizer
os.makedirs('/content/fine_tuned_model/model', exist_ok=True)
os.makedirs('/content/fine_tuned_model/tokenizer', exist_ok=True)

# Save model and tokenizer to separate subdirectories
model.save_pretrained('/content/fine_tuned_model/model')
tokenizer.save_pretrained('/content/fine_tuned_model/tokenizer')

('/content/fine_tuned_model/tokenizer/tokenizer_config.json',
 '/content/fine_tuned_model/tokenizer/special_tokens_map.json',
 '/content/fine_tuned_model/tokenizer/vocab.txt',
 '/content/fine_tuned_model/tokenizer/added_tokens.json',
 '/content/fine_tuned_model/tokenizer/tokenizer.json')