# Linear-Log Model 

## 0.导入库

In [1]:
import numpy as np
import pandas as pd
import re
import math
from collections import Counter

import nltk
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ZDF\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ZDF\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ZDF\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## 1.数据导入与预处理

In [3]:
df = pd.read_csv('ag_news_csv/train.csv',
                 header=None,
                 names=['label', 'title', 'description'])

In [4]:
print(df.head())

   label                                              title  \
0      3  Wall St. Bears Claw Back Into the Black (Reuters)   
1      3  Carlyle Looks Toward Commercial Aerospace (Reu...   
2      3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
3      3  Iraq Halts Oil Exports from Main Southern Pipe...   
4      3  Oil prices soar to all-time record, posing new...   

                                         description  
0  Reuters - Short-sellers, Wall Street's dwindli...  
1  Reuters - Private investment firm Carlyle Grou...  
2  Reuters - Soaring crude prices plus worries\ab...  
3  Reuters - Authorities have halted oil export\f...  
4  AFP - Tearaway world oil prices, toppling reco...  


In [5]:
def replace_space(word):
    return re.sub(r'[-\\/&]', ' ', word)

In [6]:
df['title'] = df['title'].apply(replace_space)
df['description'] = df['description'].apply(replace_space)

In [7]:
def replace_num(word):
    return re.sub(r'\d+', '<NUM>', word)

In [8]:
df['title'] = df['title'].apply(replace_num)
df['description'] = df['description'].apply(replace_num)

In [9]:
def separate_num(word):
    return re.sub(r'(<NUM>)', r' \1 ', word)

In [10]:
df['title'] = df['title'].apply(separate_num)
df['description'] = df['description'].apply(separate_num)

In [11]:
print(df)

        label                                              title  \
0           3  Wall St. Bears Claw Back Into the Black (Reuters)   
1           3  Carlyle Looks Toward Commercial Aerospace (Reu...   
2           3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
3           3  Iraq Halts Oil Exports from Main Southern Pipe...   
4           3  Oil prices soar to all time record, posing new...   
...       ...                                                ...   
119995      1  Pakistan's Musharraf Says Won't Quit as Army C...   
119996      2                  Renteria signing a top shelf deal   
119997      2                    Saban not going to Dolphins yet   
119998      2                                  Today's NFL games   
119999      2                       Nets get Carter from Raptors   

                                              description  
0       Reuters   Short sellers, Wall Street's dwindli...  
1       Reuters   Private investment firm Carlyle Grou...  
2  

In [12]:
def tokenize(text):
    return text.split()

In [13]:
df['tokens'] = df['title'].apply(tokenize) + df['description'].apply(tokenize)

In [14]:
print(df['tokens'])

0         [Wall, St., Bears, Claw, Back, Into, the, Blac...
1         [Carlyle, Looks, Toward, Commercial, Aerospace...
2         [Oil, and, Economy, Cloud, Stocks', Outlook, (...
3         [Iraq, Halts, Oil, Exports, from, Main, Southe...
4         [Oil, prices, soar, to, all, time, record,, po...
                                ...                        
119995    [Pakistan's, Musharraf, Says, Won't, Quit, as,...
119996    [Renteria, signing, a, top, shelf, deal, Red, ...
119997    [Saban, not, going, to, Dolphins, yet, The, Mi...
119998    [Today's, NFL, games, PITTSBURGH, at, NY, GIAN...
119999    [Nets, get, Carter, from, Raptors, INDIANAPOLI...
Name: tokens, Length: 120000, dtype: object


In [15]:
df.drop('description', axis=1, inplace=True)
df.drop('title', axis=1, inplace=True)
df.drop(36065, axis=0, inplace=True)

In [16]:
print(df)

        label                                             tokens
0           3  [Wall, St., Bears, Claw, Back, Into, the, Blac...
1           3  [Carlyle, Looks, Toward, Commercial, Aerospace...
2           3  [Oil, and, Economy, Cloud, Stocks', Outlook, (...
3           3  [Iraq, Halts, Oil, Exports, from, Main, Southe...
4           3  [Oil, prices, soar, to, all, time, record,, po...
...       ...                                                ...
119995      1  [Pakistan's, Musharraf, Says, Won't, Quit, as,...
119996      2  [Renteria, signing, a, top, shelf, deal, Red, ...
119997      2  [Saban, not, going, to, Dolphins, yet, The, Mi...
119998      2  [Today's, NFL, games, PITTSBURGH, at, NY, GIAN...
119999      2  [Nets, get, Carter, from, Raptors, INDIANAPOLI...

[119999 rows x 2 columns]


In [17]:
def lower(tokens):
    return [word.lower() for word in tokens]

In [18]:
df['tokens'] = df['tokens'].apply(lower)

In [19]:
print(df['tokens'])

0         [wall, st., bears, claw, back, into, the, blac...
1         [carlyle, looks, toward, commercial, aerospace...
2         [oil, and, economy, cloud, stocks', outlook, (...
3         [iraq, halts, oil, exports, from, main, southe...
4         [oil, prices, soar, to, all, time, record,, po...
                                ...                        
119995    [pakistan's, musharraf, says, won't, quit, as,...
119996    [renteria, signing, a, top, shelf, deal, red, ...
119997    [saban, not, going, to, dolphins, yet, the, mi...
119998    [today's, nfl, games, pittsburgh, at, ny, gian...
119999    [nets, get, carter, from, raptors, indianapoli...
Name: tokens, Length: 119999, dtype: object


In [20]:
def remove_word_suffixes(word):
    if word.endswith("'s"):
        word = word[:-2]
    # elif word.endswith("s"):
    #    word = word[:-1]
    else:
        return re.sub(r'[.,:()\'"?;#$!]', '', word)

In [21]:
def remove_suffixes(tokens):
    return [remove_word_suffixes(word) for word in tokens]

In [22]:
df['tokens'] = df['tokens'].apply(remove_suffixes)

In [23]:
print(df['tokens'])

0         [wall, st, bears, claw, back, into, the, black...
1         [carlyle, looks, toward, commercial, aerospace...
2         [oil, and, economy, cloud, stocks, outlook, re...
3         [iraq, halts, oil, exports, from, main, southe...
4         [oil, prices, soar, to, all, time, record, pos...
                                ...                        
119995    [None, musharraf, says, wont, quit, as, army, ...
119996    [renteria, signing, a, top, shelf, deal, red, ...
119997    [saban, not, going, to, dolphins, yet, the, mi...
119998    [None, nfl, games, pittsburgh, at, ny, giants,...
119999    [nets, get, carter, from, raptors, indianapoli...
Name: tokens, Length: 119999, dtype: object


In [24]:
def remove_stopwords(tokens):
    return [
        word for word in tokens
        if (word not in stopwords) and (word is not None)
    ]

In [25]:
with open('stopwords.txt') as file:
    stopwords = file.read().split(',')

In [26]:
print(stopwords)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'A', 'about', 'above', 'across', 'after', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'an', 'and', 'another', 'any', 'anyone', 'anything', 'anywhere', 'are', "aren't", 'around', 'as', 'at', 'b', 'B', 'back', 'be', 'became', 'because', 'become', 'becomes', 'been', 'before', 'behind', 'being', 'below', 'between', 'both', 'but', 'by', 'c', 'C', 'can', 'cannot', "can't", 'could', "couldn't", 'd', 'D', 'did', "didn't", 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'during', 'e', 'E', 'each', 'either', 'enough', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'f', 'F', 'few', 'find', 'first', 'for', 'four', 'from', 'full', 'further', 'g', 'G', 'get', 'give', 'go', 'h', 'H', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', "here's", 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 

In [27]:
df['tokens'] = df['tokens'].apply(remove_stopwords)

In [28]:
print(df['tokens'])

0         [wall, st, bears, claw, black, reuters, reuter...
1         [carlyle, looks, commercial, aerospace, reuter...
2         [oil, economy, cloud, stocks, outlook, reuters...
3         [iraq, halts, oil, exports, main, southern, pi...
4         [oil, prices, soar, time, record, posing, new,...
                                ...                        
119995    [musharraf, says, wont, quit, army, chief, kar...
119996    [renteria, signing, top, shelf, deal, red, sox...
119997    [saban, going, dolphins, miami, dolphins, cour...
119998    [nfl, games, pittsburgh, ny, giants, time, <nu...
119999    [nets, carter, raptors, indianapolis, star, vi...
Name: tokens, Length: 119999, dtype: object


In [29]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [30]:
def lemmatize_with_pos(tokens):
    pos_tagged = pos_tag(tokens)
    return [
        lemmatizer.lemmatize(token, get_wordnet_pos(pos)) for token, pos in pos_tagged
    ]

In [32]:
lemmatizer = WordNetLemmatizer()

df['tokens'] = df['tokens'].apply(lemmatize_with_pos)

In [33]:
print(df['tokens'])

0         [wall, st, bear, claw, black, reuters, reuters...
1         [carlyle, look, commercial, aerospace, reuters...
2         [oil, economy, cloud, stock, outlook, reuters,...
3         [iraq, halt, oil, export, main, southern, pipe...
4         [oil, price, soar, time, record, pose, new, me...
                                ...                        
119995    [musharraf, say, wont, quit, army, chief, kara...
119996    [renteria, sign, top, shelf, deal, red, sox, g...
119997    [saban, go, dolphin, miami, dolphin, courtship...
119998    [nfl, game, pittsburgh, ny, giant, time, <num>...
119999    [net, carter, raptor, indianapolis, star, vinc...
Name: tokens, Length: 119999, dtype: object


In [42]:
df.to_csv('df.csv')

## 2.TF-IDF编码

In [34]:
words_counter = Counter()
for tokens in df['tokens']:
    words_counter.update(tokens)
vocabulary = dict(words_counter)

In [35]:
counter = 0
for key, value in vocabulary.items():
    if counter < 1000:
        print(f"{key}: {value}")
        counter += 1
    else:
        break

wall: 1500
st: 1678
bear: 722
claw: 36
black: 836
reuters: 17268
short: 924
seller: 105
dwindle: 48
band: 240
ultra: 81
cynic: 6
see: 1861
green: 864
carlyle: 16
look: 2786
commercial: 541
aerospace: 129
private: 721
investment: 986
firm: 2493
group: 5316
reputation: 125
make: 4651
time: 6634
occasionally: 16
controversial: 432
play: 3058
defense: 1262
industry: 2470
quietly: 141
place: 1655
bet: 249
market: 4750
oil: 7547
economy: 1751
cloud: 213
stock: 6954
outlook: 903
soar: 636
crude: 1371
price: 6906
plus: 347
worry: 889
earnings: 2054
expect: 3524
hang: 243
week: 6873
depth: 84
summer: 552
doldrums: 16
iraq: 5954
halt: 527
export: 585
main: 723
southern: 1186
pipeline: 264
authority: 1028
flow: 224
intelligence: 451
show: 2183
rebel: 1852
militia: 306
strike: 2011
infrastructure: 201
official: 5173
say: 29010
saturday: 3986
record: 4341
pose: 262
new: 21584
menace: 44
afp: 4527
tearaway: 1
world: 7785
toppling: 1
strain: 279
wallet: 72
present: 309
economic: 1645
barely: 165
mont

In [36]:
def compute_tf(tokens):
    tf = Counter(tokens)
    for i in tf:
        tf[i] = (1 + math.log10(tf[i])) if tf[i] != 0 else 0
    return dict(tf)

In [37]:
TF = [compute_tf(tokens) for tokens in df['tokens']]

In [38]:
counter = 0
for i in TF:
    if counter < 100:
        print(i)
        counter += 1
    else:
        break

{'wall': 1.3010299956639813, 'st': 1.0, 'bear': 1.0, 'claw': 1.0, 'black': 1.0, 'reuters': 1.3010299956639813, 'short': 1.0, 'seller': 1.0, 'dwindle': 1.0, 'band': 1.0, 'ultra': 1.0, 'cynic': 1.0, 'see': 1.0, 'green': 1.0}
{'carlyle': 1.3010299956639813, 'look': 1.0, 'commercial': 1.0, 'aerospace': 1.0, 'reuters': 1.3010299956639813, 'private': 1.0, 'investment': 1.0, 'firm': 1.0, 'group': 1.0, 'reputation': 1.0, 'make': 1.0, 'time': 1.0, 'occasionally': 1.0, 'controversial': 1.0, 'play': 1.0, 'defense': 1.0, 'industry': 1.0, 'quietly': 1.0, 'place': 1.0, 'bet': 1.0, 'market': 1.0}
{'oil': 1.0, 'economy': 1.3010299956639813, 'cloud': 1.0, 'stock': 1.3010299956639813, 'outlook': 1.3010299956639813, 'reuters': 1.3010299956639813, 'soar': 1.0, 'crude': 1.0, 'price': 1.0, 'plus': 1.0, 'worry': 1.0, 'earnings': 1.0, 'expect': 1.0, 'hang': 1.0, 'market': 1.0, 'week': 1.0, 'depth': 1.0, 'summer': 1.0, 'doldrums': 1.0}
{'iraq': 1.3010299956639813, 'halt': 1.3010299956639813, 'oil': 1.477121254

In [39]:
def compute_idf(dft, df_tokens_len):
    return math.log10(df_tokens_len / dft)

In [40]:
IDF = {
    word: compute_idf(dft, len(df['tokens']))
    for word, dft in vocabulary.items()
}

In [41]:
counter = 0
for key, value in IDF.items():
    if counter < 50:
        print(f"{key}: {value}")
        counter += 1
    else:
        break

wall: 1.9030863678561813
st: 1.854385670419181
bear: 2.2206404293422235
claw: 3.522875126144575
black: 2.156971349472846
reuters: 0.8419355869276325
short: 2.113505655691756
seller: 3.0579883278419246
dwindle: 3.3979363895362753
band: 2.6989663852002566
ultra: 3.170692608033213
cynic: 4.301026376528219
see: 1.8094312537810955
green: 2.1426638844329693
carlyle: 3.875057644255938
look: 1.634196514823918
commercial: 2.3459803618052932
aerospace: 2.9685879166126137
private: 2.2212423621924335
investment: 2.0853007119706515
firm: 1.682455348408089
group: 1.353592654641168
reputation: 2.9822676139038062
make: 1.411631287400346
time: 1.257402159728399
occasionally: 3.875057644255938
controversial: 2.4436938800969505
play: 1.5937401458355611
defense: 1.9781182720037471
industry: 1.686480673652197
quietly: 2.929958514256483
place: 1.860379628800125
bet: 2.682978279816126
market: 1.402484017286996
oil: 1.2014032769204646
economy: 1.8358914808284164
cloud: 2.750798023473125
stock: 1.2369429405646