In [None]:
import pandas as pd

df = pd.read_csv("data/twcs.csv")

user_tweets = df[df['inbound'] == True].copy()

user_tweets[['author_id', 'text', 'created_at']].head()

抽样部分数据,手动标注

In [3]:
sample = user_tweets[['text']].sample(200, random_state=42)
sample.to_csv("labels/labeled_sample.csv", index=False)

In [5]:
add_column = pd.read_csv("labels/labeled_sample.csv")

add_column['label'] = ""
add_column.to_csv("labels/labeled_sample_")

Some data have been sampled (200, random), and 60 have been manually annotated

评估模型效果(labeled_sample_)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, learning_curve
import numpy as np

# Loading data
df = pd.read_csv("labels/labeled_sample_.csv")
df.dropna(subset=['text', 'label'], inplace=True)

# TF-IDF Vectorizing
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['text']).toarray()

# Label Encoding
le = LabelEncoder()
y = le.fit_transform(df['label'])

# Split into training/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Using Apple MLX

In [10]:
import mlx.core as mx

X_train_mx = mx.array(X_train)
y_train_mx = mx.array(y_train)
X_test_mx = mx.array(X_test)
y_test_mx = mx.array(y_test)

STEP 1: model, optimizer

In [32]:
import mlx.nn as nn
import mlx.optimizers as optim

n_classes = len(set(y_test)) # Number of label categories
input_dim = X_train.shape[1] # TF-IDF vector dimensions

model = nn.Sequential(
    nn.Linear(input_dim, 128),
    nn.ReLU(),
    nn.Linear(128, n_classes)
)

# loss_fn = nn.losses.cross_entropy(logits=X_train_mx, targets=y_train_mx, reduction="mean")

# loss+gradient func
loss_and_grad_fn = nn.value_and_grad(
    model,
    lambda m, X, y: nn.losses.cross_entropy(
        logits=m(X),
        targets=y,
        reduction="mean"
    )
)

optimizer = optim.Adam(learning_rate=0.01)
mx.eval(model.parameters())
mx.eval(optimizer.state)

In [24]:
print(model(X_train_mx))
print(loss_fn)

array([[0.015317, -0.0823755, -0.0036931, -0.0576102, 0.0240737],
       [0.0227535, -0.07477, 0.00722791, -0.0663485, 0.0122911],
       [0.00730453, -0.0843332, 0.00105042, -0.0602917, 0.0337839],
       ...,
       [0.0142277, -0.0773441, 0.0138322, -0.0672997, 0.00961285],
       [0.018615, -0.0760249, 0.0133837, -0.046209, 0.00635851],
       [0.0114859, -0.0767276, 0.0015609, -0.0571076, -0.00077112]], dtype=float32)
array(6.42905, dtype=float32)


STEP 2: Training model + printing loss

In [34]:
import mlx.core as mx
import matplotlib.pyplot as plt

# Preparing Parameter
epochs = 20
loss_history = []

model.train()

for epoch in range(epochs) :
    # Forward Propagation
    # logits = model(X_train_mx)

    # Compute loss and grads
    loss_val, grads = loss_and_grad_fn(model, X_test_mx, y_test_mx)

    # Parameter update
    optimizer.update(model, grads)
    # Force execution of lazy updates
    mx.eval(model.parameters(), optimizer.state)

    # Save loss
    loss_history.append(loss_val)

    print(type(loss_val), loss_val.shape if hasattr(loss_val, "shape") else "")
    print(f"Epoch {epoch + 1}/{epochs} - Loss: {loss_val:.4f}")

<class 'mlx.core.array'> ()
Epoch 1/20 - Loss: -73.8160
<class 'mlx.core.array'> ()
Epoch 2/20 - Loss: -83.0614
<class 'mlx.core.array'> ()
Epoch 3/20 - Loss: -92.9538
<class 'mlx.core.array'> ()
Epoch 4/20 - Loss: -103.4153
<class 'mlx.core.array'> ()
Epoch 5/20 - Loss: -114.5154
<class 'mlx.core.array'> ()
Epoch 6/20 - Loss: -126.2358
<class 'mlx.core.array'> ()
Epoch 7/20 - Loss: -138.5824
<class 'mlx.core.array'> ()
Epoch 8/20 - Loss: -151.5489
<class 'mlx.core.array'> ()
Epoch 9/20 - Loss: -165.1765
<class 'mlx.core.array'> ()
Epoch 10/20 - Loss: -179.4348
<class 'mlx.core.array'> ()
Epoch 11/20 - Loss: -194.3154
<class 'mlx.core.array'> ()
Epoch 12/20 - Loss: -209.8525
<class 'mlx.core.array'> ()
Epoch 13/20 - Loss: -225.9667
<class 'mlx.core.array'> ()
Epoch 14/20 - Loss: -242.6726
<class 'mlx.core.array'> ()
Epoch 15/20 - Loss: -260.0201
<class 'mlx.core.array'> ()
Epoch 16/20 - Loss: -277.9582
<class 'mlx.core.array'> ()
Epoch 17/20 - Loss: -296.4980
<class 'mlx.core.array'> (