# Linear-Log Model 

## 0.导入库

In [1]:
import math
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [2]:
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading omw-1.4: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno 11001] getaddrinfo failed>


False

## 1.数据导入与预处理

In [3]:
df = pd.read_csv(
    "ag_news_csv/train.csv", header=None, names=["label", "title", "description"]
)

In [4]:
print(df.head())

   label                                              title  \
0      3  Wall St. Bears Claw Back Into the Black (Reuters)   
1      3  Carlyle Looks Toward Commercial Aerospace (Reu...   
2      3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
3      3  Iraq Halts Oil Exports from Main Southern Pipe...   
4      3  Oil prices soar to all-time record, posing new...   

                                         description  
0  Reuters - Short-sellers, Wall Street's dwindli...  
1  Reuters - Private investment firm Carlyle Grou...  
2  Reuters - Soaring crude prices plus worries\ab...  
3  Reuters - Authorities have halted oil export\f...  
4  AFP - Tearaway world oil prices, toppling reco...  


In [5]:
def replace_space(word):
    return re.sub(r"[-\\/&]", " ", word)

In [6]:
df["title"] = df["title"].apply(replace_space)
df["description"] = df["description"].apply(replace_space)

In [7]:
def replace_num(word):
    return re.sub(r"\d+", "<NUM>", word)

In [8]:
df["title"] = df["title"].apply(replace_num)
df["description"] = df["description"].apply(replace_num)

In [9]:
def separate_num(word):
    return re.sub(r"(<NUM>)", r" \1 ", word)

In [10]:
df["title"] = df["title"].apply(separate_num)
df["description"] = df["description"].apply(separate_num)

In [11]:
print(df)

        label                                              title  \
0           3  Wall St. Bears Claw Back Into the Black (Reuters)   
1           3  Carlyle Looks Toward Commercial Aerospace (Reu...   
2           3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
3           3  Iraq Halts Oil Exports from Main Southern Pipe...   
4           3  Oil prices soar to all time record, posing new...   
...       ...                                                ...   
119995      1  Pakistan's Musharraf Says Won't Quit as Army C...   
119996      2                  Renteria signing a top shelf deal   
119997      2                    Saban not going to Dolphins yet   
119998      2                                  Today's NFL games   
119999      2                       Nets get Carter from Raptors   

                                              description  
0       Reuters   Short sellers, Wall Street's dwindli...  
1       Reuters   Private investment firm Carlyle Grou...  
2  

In [12]:
def tokenize(text):
    return text.split()

In [13]:
df["tokens"] = df["title"].apply(tokenize) + df["description"].apply(tokenize)

In [14]:
print(df["tokens"])

0         [Wall, St., Bears, Claw, Back, Into, the, Blac...
1         [Carlyle, Looks, Toward, Commercial, Aerospace...
2         [Oil, and, Economy, Cloud, Stocks', Outlook, (...
3         [Iraq, Halts, Oil, Exports, from, Main, Southe...
4         [Oil, prices, soar, to, all, time, record,, po...
                                ...                        
119995    [Pakistan's, Musharraf, Says, Won't, Quit, as,...
119996    [Renteria, signing, a, top, shelf, deal, Red, ...
119997    [Saban, not, going, to, Dolphins, yet, The, Mi...
119998    [Today's, NFL, games, PITTSBURGH, at, NY, GIAN...
119999    [Nets, get, Carter, from, Raptors, INDIANAPOLI...
Name: tokens, Length: 120000, dtype: object


In [15]:
df.drop("description", axis=1, inplace=True)
df.drop("title", axis=1, inplace=True)

In [16]:
print(df)

        label                                             tokens
0           3  [Wall, St., Bears, Claw, Back, Into, the, Blac...
1           3  [Carlyle, Looks, Toward, Commercial, Aerospace...
2           3  [Oil, and, Economy, Cloud, Stocks', Outlook, (...
3           3  [Iraq, Halts, Oil, Exports, from, Main, Southe...
4           3  [Oil, prices, soar, to, all, time, record,, po...
...       ...                                                ...
119995      1  [Pakistan's, Musharraf, Says, Won't, Quit, as,...
119996      2  [Renteria, signing, a, top, shelf, deal, Red, ...
119997      2  [Saban, not, going, to, Dolphins, yet, The, Mi...
119998      2  [Today's, NFL, games, PITTSBURGH, at, NY, GIAN...
119999      2  [Nets, get, Carter, from, Raptors, INDIANAPOLI...

[120000 rows x 2 columns]


In [17]:
def lower(tokens):
    return [word.lower() for word in tokens]

In [18]:
df["tokens"] = df["tokens"].apply(lower)

In [19]:
print(df["tokens"])

0         [wall, st., bears, claw, back, into, the, blac...
1         [carlyle, looks, toward, commercial, aerospace...
2         [oil, and, economy, cloud, stocks', outlook, (...
3         [iraq, halts, oil, exports, from, main, southe...
4         [oil, prices, soar, to, all, time, record,, po...
                                ...                        
119995    [pakistan's, musharraf, says, won't, quit, as,...
119996    [renteria, signing, a, top, shelf, deal, red, ...
119997    [saban, not, going, to, dolphins, yet, the, mi...
119998    [today's, nfl, games, pittsburgh, at, ny, gian...
119999    [nets, get, carter, from, raptors, indianapoli...
Name: tokens, Length: 120000, dtype: object


In [20]:
def remove_word_suffixes(word):
    if word.endswith("'s"):
        word = word[:-2]
    # elif word.endswith("s"):
    #    word = word[:-1]
    else:
        return re.sub(r'[.,:()\'"?;#$!]', "", word)

In [21]:
def remove_suffixes(tokens):
    return [remove_word_suffixes(word) for word in tokens]

In [22]:
df["tokens"] = df["tokens"].apply(remove_suffixes)

In [23]:
print(df["tokens"])

0         [wall, st, bears, claw, back, into, the, black...
1         [carlyle, looks, toward, commercial, aerospace...
2         [oil, and, economy, cloud, stocks, outlook, re...
3         [iraq, halts, oil, exports, from, main, southe...
4         [oil, prices, soar, to, all, time, record, pos...
                                ...                        
119995    [None, musharraf, says, wont, quit, as, army, ...
119996    [renteria, signing, a, top, shelf, deal, red, ...
119997    [saban, not, going, to, dolphins, yet, the, mi...
119998    [None, nfl, games, pittsburgh, at, ny, giants,...
119999    [nets, get, carter, from, raptors, indianapoli...
Name: tokens, Length: 120000, dtype: object


In [24]:
def remove_stopwords(tokens):
    return [word for word in tokens if (word not in stopwords) and (word is not None)]

In [25]:
with open("stopwords.txt") as file:
    stopwords = file.read().split(",")

In [26]:
print(stopwords)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'A', 'about', 'above', 'across', 'after', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'an', 'and', 'another', 'any', 'anyone', 'anything', 'anywhere', 'are', "aren't", 'around', 'as', 'at', 'b', 'B', 'back', 'be', 'became', 'because', 'become', 'becomes', 'been', 'before', 'behind', 'being', 'below', 'between', 'both', 'but', 'by', 'c', 'C', 'can', 'cannot', "can't", 'could', "couldn't", 'd', 'D', 'did', "didn't", 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'during', 'e', 'E', 'each', 'either', 'enough', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'f', 'F', 'few', 'find', 'first', 'for', 'four', 'from', 'full', 'further', 'g', 'G', 'get', 'give', 'go', 'h', 'H', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', "here's", 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 

In [27]:
df["tokens"] = df["tokens"].apply(remove_stopwords)

In [28]:
print(df["tokens"])

0         [wall, st, bears, claw, black, reuters, reuter...
1         [carlyle, looks, commercial, aerospace, reuter...
2         [oil, economy, cloud, stocks, outlook, reuters...
3         [iraq, halts, oil, exports, main, southern, pi...
4         [oil, prices, soar, time, record, posing, new,...
                                ...                        
119995    [musharraf, says, wont, quit, army, chief, kar...
119996    [renteria, signing, top, shelf, deal, red, sox...
119997    [saban, going, dolphins, miami, dolphins, cour...
119998    [nfl, games, pittsburgh, ny, giants, time, <nu...
119999    [nets, carter, raptors, indianapolis, star, vi...
Name: tokens, Length: 120000, dtype: object


In [29]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith("J"):
        return wordnet.ADJ
    elif treebank_tag.startswith("V"):
        return wordnet.VERB
    elif treebank_tag.startswith("N"):
        return wordnet.NOUN
    elif treebank_tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [30]:
def lemmatize_with_pos(tokens):
    pos_tagged = pos_tag(tokens)
    return [
        lemmatizer.lemmatize(token, get_wordnet_pos(pos)) for token, pos in pos_tagged
    ]

In [31]:
lemmatizer = WordNetLemmatizer()

df["tokens"] = df["tokens"].apply(lemmatize_with_pos)

In [32]:
print(df["tokens"])

0         [wall, st, bear, claw, black, reuters, reuters...
1         [carlyle, look, commercial, aerospace, reuters...
2         [oil, economy, cloud, stock, outlook, reuters,...
3         [iraq, halt, oil, export, main, southern, pipe...
4         [oil, price, soar, time, record, pose, new, me...
                                ...                        
119995    [musharraf, say, wont, quit, army, chief, kara...
119996    [renteria, sign, top, shelf, deal, red, sox, g...
119997    [saban, go, dolphin, miami, dolphin, courtship...
119998    [nfl, game, pittsburgh, ny, giant, time, <num>...
119999    [net, carter, raptor, indianapolis, star, vinc...
Name: tokens, Length: 120000, dtype: object


In [33]:
df.to_pickle("processed_train_data.pkl")

## 2.TF-IDF编码

In [2]:
df = pd.read_pickle("processed_train_data.pkl")

In [3]:
print(df)

        label                                             tokens
0           3  [wall, st, bear, claw, black, reuters, reuters...
1           3  [carlyle, look, commercial, aerospace, reuters...
2           3  [oil, economy, cloud, stock, outlook, reuters,...
3           3  [iraq, halt, oil, export, main, southern, pipe...
4           3  [oil, price, soar, time, record, pose, new, me...
...       ...                                                ...
119995      1  [musharraf, say, wont, quit, army, chief, kara...
119996      2  [renteria, sign, top, shelf, deal, red, sox, g...
119997      2  [saban, go, dolphin, miami, dolphin, courtship...
119998      2  [nfl, game, pittsburgh, ny, giant, time, <num>...
119999      2  [net, carter, raptor, indianapolis, star, vinc...

[120000 rows x 2 columns]


In [4]:
words_counter = Counter()
for tokens in df["tokens"]:
    words_counter.update(tokens)
vocabulary = dict(words_counter)

In [5]:
counter = 0
for key, value in vocabulary.items():
    if counter < 20:
        print(f"{key}: {value}")
        counter += 1
    else:
        break

wall: 1500
st: 1679
bear: 722
claw: 36
black: 836
reuters: 17270
short: 924
seller: 105
dwindle: 48
band: 240
ultra: 81
cynic: 6
see: 1861
green: 864
carlyle: 16
look: 2786
commercial: 541
aerospace: 129
private: 721
investment: 986


In [6]:
def compute_tf(tokens):
    tf = Counter(tokens)
    for i in tf:
        tf[i] = (1 + math.log10(tf[i])) if tf[i] != 0 else 0
    return dict(tf)

In [7]:
TF = [compute_tf(tokens) for tokens in df["tokens"]]

In [8]:
counter = 0
for i in TF:
    if counter < 10:
        print(i)
        counter += 1
    else:
        break

{'wall': 1.3010299956639813, 'st': 1.0, 'bear': 1.0, 'claw': 1.0, 'black': 1.0, 'reuters': 1.3010299956639813, 'short': 1.0, 'seller': 1.0, 'dwindle': 1.0, 'band': 1.0, 'ultra': 1.0, 'cynic': 1.0, 'see': 1.0, 'green': 1.0}
{'carlyle': 1.3010299956639813, 'look': 1.0, 'commercial': 1.0, 'aerospace': 1.0, 'reuters': 1.3010299956639813, 'private': 1.0, 'investment': 1.0, 'firm': 1.0, 'group': 1.0, 'reputation': 1.0, 'make': 1.0, 'time': 1.0, 'occasionally': 1.0, 'controversial': 1.0, 'play': 1.0, 'defense': 1.0, 'industry': 1.0, 'quietly': 1.0, 'place': 1.0, 'bet': 1.0, 'market': 1.0}
{'oil': 1.0, 'economy': 1.3010299956639813, 'cloud': 1.0, 'stock': 1.3010299956639813, 'outlook': 1.3010299956639813, 'reuters': 1.3010299956639813, 'soar': 1.0, 'crude': 1.0, 'price': 1.0, 'plus': 1.0, 'worry': 1.0, 'earnings': 1.0, 'expect': 1.0, 'hang': 1.0, 'market': 1.0, 'week': 1.0, 'depth': 1.0, 'summer': 1.0, 'doldrums': 1.0}
{'iraq': 1.3010299956639813, 'halt': 1.3010299956639813, 'oil': 1.477121254

In [9]:
def compute_idf(dft, df_tokens_len):
    return math.log10(df_tokens_len / dft)

In [10]:
IDF = {word: compute_idf(dft, len(df["tokens"])) for word, dft in vocabulary.items()}

In [11]:
counter = 0
for key, value in IDF.items():
    if counter < 20:
        print(f"{key}: {value}")
        counter += 1
    else:
        break

wall: 1.9030899869919435
st: 1.8541305499095762
bear: 2.2206440484779857
claw: 3.5228787452803374
black: 2.1569749686086084
reuters: 0.8418889084801661
short: 2.113509274827518
seller: 3.057991946977687
dwindle: 3.3979400086720375
band: 2.6989700043360187
ultra: 3.170696227168975
cynic: 4.301029995663981
see: 1.809434872916858
green: 2.1426675035687315
carlyle: 3.8750612633917
look: 1.6342001339596801
commercial: 2.3459839809410554
aerospace: 2.968591535748376
private: 2.2212459813281957
investment: 2.0853043311064137


In [12]:
data = []
indices = []
indptr = [0]

In [13]:
word_list = list(IDF.keys())
word_to_index = {word: i for i, word in enumerate(word_list)}

In [14]:
for i in range(len(TF)):
    for word, tf in TF[i].items():
        if word in IDF:
            tf_idf = tf * IDF[word]
            data.append(tf_idf)
            indices.append((df["label"][i] - 1) * len(IDF) + word_to_index[word])
    indptr.append(len(data))

In [15]:
tf_idf = csr_matrix((data, indices, indptr), shape=(len(TF), len(IDF) * 4), dtype=float)

In [16]:
print(tf_idf)

  (0, 116296)	2.4759771575242944
  (0, 116297)	1.8541305499095762
  (0, 116298)	2.2206440484779857
  (0, 116299)	3.5228787452803374
  (0, 116300)	2.1569749686086084
  (0, 116301)	1.0953227229495044
  (0, 116302)	2.113509274827518
  (0, 116303)	3.057991946977687
  (0, 116304)	3.3979400086720375
  (0, 116305)	2.6989700043360187
  (0, 116306)	3.170696227168975
  (0, 116307)	4.301029995663981
  (0, 116308)	1.809434872916858
  (0, 116309)	2.1426675035687315
  (1, 116310)	5.0415709387081655
  (1, 116311)	1.6342001339596801
  (1, 116312)	2.3459839809410554
  (1, 116313)	2.968591535748376
  (1, 116301)	1.0953227229495044
  (1, 116314)	2.2212459813281957
  (1, 116315)	2.0853043311064137
  (1, 116316)	1.6824589675438515
  (1, 116317)	1.3535962737769305
  (1, 116318)	2.9822712330395684
  (1, 116319)	1.4116349065361085
  :	:
  (119998, 65816)	2.8916605252111616
  (119998, 58900)	1.7262273343375372
  (119998, 58393)	1.432875565762866
  (119998, 58234)	0.9259458273953294
  (119998, 59223)	2.13371266

In [None]:
print(tf_idf.shape)

## 3.构建并训练Log-Linear模型

In [1]:
class LogLinearModel:
    def __init__(self, n_features, n_classes):
        self.n_features = n_features
        self.n_classes = n_classes
        self.weights = np.zeros((n_classes, n_features))

    def train(self,X,y,lr=0.01,epochs=10):
        for epoch in range(epochs):
            for i in range(len(X)):
                scores = X[i] @ self.weights.T
                probs = self.mysoftmax(scores)

                delta=np.outer(probs-y[i],X[i])
                self.weights -= lr*delta

    

In [17]:
labels = df["label"]
lambda_vector = np.random.randn(tf_idf.shape[1]) * 0.01
epochs = 50
lr = 0.01
batch_size = 128
num_docs = len(TF)
num_words = len(IDF)

In [35]:
def f_i(word_index, doc_index, label):
    return tf_idf[doc_index][(label - 1) * num_words + word_index]

In [19]:
def p(doc_index, label):
    return np.exp(f_i(document, label).dot(lambda_vector)[0]) / np.exp(
        [f_i(document, label).dot(lambda_vector)[0] for label_prime in range(1, 5)]
    )

In [32]:
def gradient(batch_indices):
    return [
        np.sum([tf_idf[doc_index][lambda_index] for doc_index in range(num_docs)])
        - np.sum(
            [
                np.sum(f_i(tf_idf[doc_index], label_prime))
                * p(tf_idf[doc_index], label_prime)
                for label_prime in range(1, 5)
            ]
        )
        for lambda_index in range(tf_idf.shape[1])
    ]

In [33]:
def log_likelihood():
    return np.sum(
        [f_i(tf_idf[doc_index], labels[doc_index]).dot(lambda_vector)[0]]
        for doc_index in range(num_docs)
    ) - np.sum(
        [
            np.log10(
                np.sum(
                    [
                        f_i(tf_idf[doc_index], label_prime).dot(lambda_vector)[0]
                        for label_prime in range(1, 5)
                    ]
                )
            )
            for doc_index in range(num_docs)
        ]
    )

In [34]:
for t in tqdm(range(epochs), desc="Epochs"):
    shuffled_indices = np.random.permutation(num_docs)

    for start_idx in tqdm(range(0, num_docs, batch_size), desc="Batchs", leave=False):
        end_idx = min(start_idx + batch_size, num_docs)
        batch_indices = shuffled_indices[start_idx:end_idx]

        grad = gradient(batch_indices)
        lambda_vector += lr * np.array(grad)

    ll = log_likelihood()
    print(f"Epoch {t + 1}/{epochs}, Log-Likelihood: {ll:.4f}")

Epochs:   0%|          | 0/50 [00:00<?, ?it/s]

Batchs:   0%|          | 0/938 [00:00<?, ?it/s]

(1, 232592)


ValueError: operands could not be broadcast together with shapes (232592,) (128,) (232592,) 

In [27]:
with open("lambda.pkl", "wb") as file:
    pickle.dump(lambda_vector, file)

(58148,)


## 4.测试

In [None]:
def predict(lambda_vector, tf_idf):
    scores = tf_idf.dot(lambda_vector)
    probs = softmax(scores)
    return np.argmax(probs, axis=1) + 1

In [27]:
print(tf_idf[0].shape)

(1, 58148)


In [28]:
print(np.nonzero(tf_idf[1])[0])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
