# Linear-Log Model 

## 0.导入库

In [1]:
import numpy as np
import pandas as pd
import re
import math
from collections import Counter

import nltk
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ZDF\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ZDF\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ZDF\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

## 1.数据导入与预处理

In [3]:
df = pd.read_csv('ag_news_csv/train.csv',
                 header=None,
                 names=['label', 'title', 'description'])

In [4]:
print(df.head())

   label                                              title  \
0      3  Wall St. Bears Claw Back Into the Black (Reuters)   
1      3  Carlyle Looks Toward Commercial Aerospace (Reu...   
2      3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
3      3  Iraq Halts Oil Exports from Main Southern Pipe...   
4      3  Oil prices soar to all-time record, posing new...   

                                         description  
0  Reuters - Short-sellers, Wall Street's dwindli...  
1  Reuters - Private investment firm Carlyle Grou...  
2  Reuters - Soaring crude prices plus worries\ab...  
3  Reuters - Authorities have halted oil export\f...  
4  AFP - Tearaway world oil prices, toppling reco...  


In [5]:
def replace_space(word):
    return re.sub(r'[-\\/&]', ' ', word)

In [6]:
df['title'] = df['title'].apply(replace_space)
df['description'] = df['title'].apply(replace_space)

In [7]:
def replace_num(word):
    return re.sub(r'\d+', '<NUM>', word)

In [8]:
df['title'] = df['title'].apply(replace_num)
df['description'] = df['title'].apply(replace_num)

In [9]:
def separate_num(word):
    return re.sub(r'(<NUM>)', r' \1 ', word)

In [10]:
df['title'] = df['title'].apply(separate_num)
df['description'] = df['title'].apply(separate_num)

In [11]:
print(df)

        label                                              title  \
0           3  Wall St. Bears Claw Back Into the Black (Reuters)   
1           3  Carlyle Looks Toward Commercial Aerospace (Reu...   
2           3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
3           3  Iraq Halts Oil Exports from Main Southern Pipe...   
4           3  Oil prices soar to all time record, posing new...   
...       ...                                                ...   
119995      1  Pakistan's Musharraf Says Won't Quit as Army C...   
119996      2                  Renteria signing a top shelf deal   
119997      2                    Saban not going to Dolphins yet   
119998      2                                  Today's NFL games   
119999      2                       Nets get Carter from Raptors   

                                              description  
0       Wall St. Bears Claw Back Into the Black (Reuters)  
1       Carlyle Looks Toward Commercial Aerospace (Reu...  
2  

In [12]:
def tokenize(text):
    return text.split()

In [13]:
df['tokens'] = df['title'].apply(tokenize) + df['description'].apply(tokenize)

In [14]:
print(df['tokens'])

0         [Wall, St., Bears, Claw, Back, Into, the, Blac...
1         [Carlyle, Looks, Toward, Commercial, Aerospace...
2         [Oil, and, Economy, Cloud, Stocks', Outlook, (...
3         [Iraq, Halts, Oil, Exports, from, Main, Southe...
4         [Oil, prices, soar, to, all, time, record,, po...
                                ...                        
119995    [Pakistan's, Musharraf, Says, Won't, Quit, as,...
119996    [Renteria, signing, a, top, shelf, deal, Rente...
119997    [Saban, not, going, to, Dolphins, yet, Saban, ...
119998           [Today's, NFL, games, Today's, NFL, games]
119999    [Nets, get, Carter, from, Raptors, Nets, get, ...
Name: tokens, Length: 120000, dtype: object


In [15]:
df.drop('description', axis=1, inplace=True)
df.drop('title', axis=1, inplace=True)
df.drop(36065, axis=0, inplace=True)

In [16]:
print(df)

        label                                             tokens
0           3  [Wall, St., Bears, Claw, Back, Into, the, Blac...
1           3  [Carlyle, Looks, Toward, Commercial, Aerospace...
2           3  [Oil, and, Economy, Cloud, Stocks', Outlook, (...
3           3  [Iraq, Halts, Oil, Exports, from, Main, Southe...
4           3  [Oil, prices, soar, to, all, time, record,, po...
...       ...                                                ...
119995      1  [Pakistan's, Musharraf, Says, Won't, Quit, as,...
119996      2  [Renteria, signing, a, top, shelf, deal, Rente...
119997      2  [Saban, not, going, to, Dolphins, yet, Saban, ...
119998      2         [Today's, NFL, games, Today's, NFL, games]
119999      2  [Nets, get, Carter, from, Raptors, Nets, get, ...

[119999 rows x 2 columns]


In [17]:
def lower(tokens):
    return [word.lower() for word in tokens]

In [18]:
df['tokens'] = df['tokens'].apply(lower)

In [19]:
print(df['tokens'])

0         [wall, st., bears, claw, back, into, the, blac...
1         [carlyle, looks, toward, commercial, aerospace...
2         [oil, and, economy, cloud, stocks', outlook, (...
3         [iraq, halts, oil, exports, from, main, southe...
4         [oil, prices, soar, to, all, time, record,, po...
                                ...                        
119995    [pakistan's, musharraf, says, won't, quit, as,...
119996    [renteria, signing, a, top, shelf, deal, rente...
119997    [saban, not, going, to, dolphins, yet, saban, ...
119998           [today's, nfl, games, today's, nfl, games]
119999    [nets, get, carter, from, raptors, nets, get, ...
Name: tokens, Length: 119999, dtype: object


In [20]:
def remove_word_suffixes(word):
    if word.endswith("'s"):
        word = word[:-2]
    # elif word.endswith("s"):
    #    word = word[:-1]
    else:
        return re.sub(r'[.,:()\'"?;#$!]', '', word)

In [21]:
def remove_suffixes(tokens):
    return [remove_word_suffixes(word) for word in tokens]

In [22]:
df['tokens'] = df['tokens'].apply(remove_suffixes)

In [23]:
print(df['tokens'])

0         [wall, st, bears, claw, back, into, the, black...
1         [carlyle, looks, toward, commercial, aerospace...
2         [oil, and, economy, cloud, stocks, outlook, re...
3         [iraq, halts, oil, exports, from, main, southe...
4         [oil, prices, soar, to, all, time, record, pos...
                                ...                        
119995    [None, musharraf, says, wont, quit, as, army, ...
119996    [renteria, signing, a, top, shelf, deal, rente...
119997    [saban, not, going, to, dolphins, yet, saban, ...
119998                 [None, nfl, games, None, nfl, games]
119999    [nets, get, carter, from, raptors, nets, get, ...
Name: tokens, Length: 119999, dtype: object


In [24]:
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stopwords and word != '']

In [25]:
with open('stopwords.txt') as file:
    stopwords = file.read().split(',')

In [26]:
print(stopwords)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'A', 'about', 'above', 'across', 'after', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'an', 'and', 'another', 'any', 'anyone', 'anything', 'anywhere', 'are', "aren't", 'around', 'as', 'at', 'b', 'B', 'back', 'be', 'became', 'because', 'become', 'becomes', 'been', 'before', 'behind', 'being', 'below', 'between', 'both', 'but', 'by', 'c', 'C', 'can', 'cannot', "can't", 'could', "couldn't", 'd', 'D', 'did', "didn't", 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'during', 'e', 'E', 'each', 'either', 'enough', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'f', 'F', 'few', 'find', 'first', 'for', 'four', 'from', 'full', 'further', 'g', 'G', 'get', 'give', 'go', 'h', 'H', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', "here's", 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 

In [27]:
df['tokens'] = df['tokens'].apply(remove_stopwords)

In [28]:
print(df['tokens'])

0         [wall, st, bears, claw, black, reuters, wall, ...
1         [carlyle, looks, commercial, aerospace, reuter...
2         [oil, economy, cloud, stocks, outlook, reuters...
3         [iraq, halts, oil, exports, main, southern, pi...
4         [oil, prices, soar, time, record, posing, new,...
                                ...                        
119995    [None, musharraf, says, wont, quit, army, chie...
119996    [renteria, signing, top, shelf, deal, renteria...
119997     [saban, going, dolphins, saban, going, dolphins]
119998                 [None, nfl, games, None, nfl, games]
119999       [nets, carter, raptors, nets, carter, raptors]
Name: tokens, Length: 119999, dtype: object


In [29]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [30]:
def lemmatize_with_pos(tokens):
    pos_tagged = pos_tag(tokens)
    return [
        lemmatizer.lemmatize(token, get_wordnet_pos(pos)) for token, pos in pos_tagged
    ]

In [31]:
lemmatizer = WordNetLemmatizer()

df['tokens'] = df['tokens'].apply(lemmatize_with_pos)

NameError: name 'lemmatizer' is not defined

In [None]:
print(df['tokens'])

## 2.TF-IDF编码

In [28]:
words_counter = Counter()
for tokens in df['tokens']:
    words_counter.update(tokens)
vocabulary = dict(words_counter)

In [29]:
counter = 0
for key, value in vocabulary.items():
    if counter < 1000:
        print(f"{key}: {value}")
        counter += 1
    else:
        break

wall: 534
st: 994
bears: 320
claw: 10
black: 520
reuters: 8522
carlyle: 16
looks: 506
commercial: 104
aerospace: 34
oil: 5048
economy: 804
cloud: 86
stocks: 2934
outlook: 744
iraq: 4416
halts: 108
exports: 180
main: 58
southern: 282
pipeline: 140
prices: 2328
soar: 194
time: 1500
record: 1748
posing: 8
new: 8064
menace: 20
afp: 3934
end: 1694
near: 900
year: 1666
lows: 158
money: 338
funds: 350
fell: 92
latest: 530
week: 982
ap: 15562
fed: 682
minutes: 56
dissent: 12
inflation: 242
usatodaycom: 284
safety: 272
net: 988
forbescom: 170
need: 368
opec: 406
pump: 44
iran: 1564
gov: 108
non: 70
nations: 266
output: 284
purnomo: 4
google: 1986
ipo: 482
auction: 214
rocky: 36
start: 1116
dollar: 1608
falls: 888
broadly: 20
trade: 1030
gap: 198
rescuing: 6
old: 466
saver: 14
kids: 132
rule: 320
school: 550
market: 1314
head: 672
value: 116
deficit: 264
swells: 20
june: 74
shell: 236
target: 662
total: 134
faces: 766
playboy: 44
slip: 310
eurozone: 62
keeps: 474
growing: 188
expansion: 194
slow

In [43]:
def compute_tf(tokens):
    tf = Counter(tokens)
    for i in tf:
        tf[i] = (1 + math.log10(tf[i])) if tf[i] != 0 else 0
    return dict(tf)

In [44]:
TF = [compute_tf(tokens) for tokens in df['tokens']]

In [45]:
counter = 0
for i in TF:
    if counter < 100:
        print(i)
        counter += 1
    else:
        break

{'wall': 1.3010299956639813, 'st': 1.3010299956639813, 'bears': 1.3010299956639813, 'claw': 1.3010299956639813, 'black': 1.3010299956639813, 'reuters': 1.3010299956639813}
{'carlyle': 1.3010299956639813, 'looks': 1.3010299956639813, 'commercial': 1.3010299956639813, 'aerospace': 1.3010299956639813, 'reuters': 1.3010299956639813}
{'oil': 1.3010299956639813, 'economy': 1.3010299956639813, 'cloud': 1.3010299956639813, 'stocks': 1.3010299956639813, 'outlook': 1.3010299956639813, 'reuters': 1.3010299956639813}
{'iraq': 1.3010299956639813, 'halts': 1.3010299956639813, 'oil': 1.3010299956639813, 'exports': 1.3010299956639813, 'main': 1.3010299956639813, 'southern': 1.3010299956639813, 'pipeline': 1.3010299956639813, 'reuters': 1.3010299956639813}
{'oil': 1.3010299956639813, 'prices': 1.3010299956639813, 'soar': 1.3010299956639813, 'time': 1.3010299956639813, 'record': 1.3010299956639813, 'posing': 1.3010299956639813, 'new': 1.3010299956639813, 'menace': 1.3010299956639813, 'economy': 1.301029

In [35]:
def compute_idf(dft, df_tokens_len):
    return math.log10(df_tokens_len / dft)

In [38]:
IDF = {
    word: compute_idf(dft, len(df['tokens']))
    for word, dft in vocabulary.items()
}

In [46]:
counter = 0
for key, value in IDF.items():
    if counter < 50:
        print(f"{key}: {value}")
        counter += 1
    else:
        break

wall: 2.3516363698833063
st: 2.0817912425145493
bears: 2.5740276485919567
claw: 4.0791776269118625
black: 2.3631742832770635
reuters: 1.148636097047428
carlyle: 3.875057644255938
looks: 2.3750271100720632
commercial: 3.062144287613082
aerospace: 3.5476987098696076
oil: 1.3760582806757846
economy: 2.1739215781634114
cloud: 3.144679175668295
stocks: 1.6117175174045988
outlook: 2.207604691365984
iraq: 1.43414856219072
halts: 3.045753871424913
exports: 2.8239051218085565
main: 3.315749633348925
southern: 2.6289285185925015
pipeline: 2.9330495912336243
prices: 1.7121946509340116
soar: 2.7913758969816365
time: 1.9030863678561813
record: 1.8366361986134783
posing: 4.176087639919919
new: 1.1726271078104125
menace: 3.778147631247881
afp: 1.4843432713285447
end: 1.8502642209171745
near: 2.1249351174725377
year: 1.8575026298410937
lows: 2.88052053995744
money: 2.550260926634208
funds: 2.535109582561587
fell: 3.1153897995663073
latest: 2.3549017573110733
week: 2.087066139124913
ap: 0.8871122159325