In [125]:
import pandas as pd
import numpy as np
import string
import json
import re

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from gensim.models import Word2Vec, FastText
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F

from models import MLP, LSTM
import torch.nn as nn
import torch.optim as optim

import warnings
warnings.filterwarnings("ignore")

pd.set_option('max_colwidth', 0)

[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [126]:
train_raw_df = pd.read_csv('data/train.csv')
test_raw_df = pd.read_csv('data/test.csv')
test_raw_df = test_raw_df.drop(columns=['ID'])

In [127]:
train_raw_df

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."
1,3,Carlyle Looks Toward Commercial Aerospace (Reuters),"Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market."
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
3,3,Iraq Halts Oil Exports from Main Southern Pipeline (Reuters),"Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday."
4,3,"Oil prices soar to all-time record, posing new menace to US economy (AFP)","AFP - Tearaway world oil prices, toppling records and straining wallets, present a new economic menace barely three months before the US presidential elections."
...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army Chief,"KARACHI (Reuters) - Pakistani President Pervez Musharraf has said he will stay on as army chief, reneging on a pledge to quit the powerful post by the end of the year."
119996,2,Renteria signing a top-shelf deal,"Red Sox general manager Theo Epstein acknowledged Edgar Renteria was more a luxury for the 2005 Red Sox than a necessity. But there's nothing wrong with getting the keys to a BMW, and that's what the four-time All-Star and two-time Gold Glover is in the eyes of the Red Sox."
119997,2,Saban not going to Dolphins yet,"The Miami Dolphins will put their courtship of LSU coach Nick Saban on hold to comply with the NFL's hiring policy by interviewing at least one minority candidate, a team source told The Associated Press last night."
119998,2,Today's NFL games,"PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: Steelers by 10. Records: Steelers 12-1, Giants 5-8. Vs. spread: Steelers 10-1-2, Giants 5-8. Series: Giants lead, 43-27-3. Comments: Think the Giants knew Ben Roethlisberger was available on draft day when they broke the bank and traded for Eli Manning? . . . All Big Ben has done this year is complete ..."


In [128]:
test_raw_df

Unnamed: 0,Title,Description
0,Fears for T N pension after talks,Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
1,The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com),"SPACE.com - TORONTO, Canada -- A second\team of rocketeers competing for the #36;10 million Ansari X Prize, a contest for\privately funded suborbital space flight, has officially announced the first\launch date for its manned rocket."
2,Ky. Company Wins Grant to Study Peptides (AP),"AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins."
3,Prediction Unit Helps Forecast Wildfires (AP),"AP - It's barely dawn when Mike Fitzpatrick starts his shift with a blur of colorful maps, figures and endless charts, but already he knows what the day will bring. Lightning will strike in places he expects. Winds will pick up, moist places will dry and flames will roar."
4,Calif. Aims to Limit Farm-Related Smog (AP),"AP - Southern California's smog-fighting agency went after emissions of the bovine variety Friday, adopting the nation's first rules to reduce air pollution from dairy cow manure."
...,...,...
7595,Around the world,"Ukrainian presidential candidate Viktor Yushchenko was poisoned with the most harmful known dioxin, which is contained in Agent Orange, a scientist who analyzed his blood said Friday."
7596,Void is filled with Clement,"With the supply of attractive pitching options dwindling daily -- they lost Pedro Martinez to the Mets, missed on Tim Hudson, and are resigned to Randy Johnson becoming a Yankee -- the Red Sox struck again last night, coming to terms with free agent Matt Clement on a three-year deal that will pay the righthander in the neighborhood of \$25 ..."
7597,Martinez leaves bitter,"Like Roger Clemens did almost exactly eight years earlier, Pedro Martinez has left the Red Sox apparently bitter about the way he was treated by management."
7598,5 of arthritis patients in Singapore take Bextra or Celebrex &lt;b&gt;...&lt;/b&gt;,SINGAPORE : Doctors in the United States have warned that painkillers Bextra and Celebrex may be linked to major cardiovascular problems and should not be prescribed.


# Cleaning data

Очевидно, что удаление знаков препинания и других символов необходимо, чтобы избежать кодирования не несущих смысл символов, сделаем это для всех экспериментов. Затем сравним удаление стоп-слов, удаление цифр и их комбирование.

In [129]:
print("Nulls count:", train_raw_df.isna().sum(), sep='\n')
print()
print("Empty string count:", train_raw_df.eq('').sum(), sep='\n')

Nulls count:
Class Index    0
Title          0
Description    0
dtype: int64

Empty string count:
Class Index    0
Title          0
Description    0
dtype: int64


In [130]:
print("Nulls count:", test_raw_df.isna().sum(), sep='\n')
print()
print("Empty string count:", test_raw_df.eq('').sum(), sep='\n')

Nulls count:
Title          0
Description    0
dtype: int64

Empty string count:
Title          0
Description    0
dtype: int64


In [131]:
def clean_text(train_df, remove_digits=False, remove_stop_words=False):
    train_df = train_df.copy(deep=True)

    # train_df['text'] = train_raw_df['Title'] + " " + train_raw_df['Description']
    train_df['text'] = train_df['Description']
    train_df.drop(columns=['Title', 'Description'], inplace=True)

    train_df['text'] = train_df['text'].str.replace("\\", " ").str.lower()
    
    if remove_digits:
        train_df['text'] = train_df['text'].str.replace(r'\d', ' ', regex=True)

    if remove_stop_words:
        stop_words = stopwords.words('english')
        stop_words_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in stop_words) + r')\b'
        train_df['text'] = train_df['text'].str.replace(stop_words_pattern, ' ', regex=True)

    train_df['text'] = train_df['text'].str.replace(r'[^\d\w\s]', '', regex=True)
    train_df['text'] = train_df['text'].apply(word_tokenize, preserve_line=True)

    return train_df

In [132]:
train_removed_all = clean_text(train_raw_df, remove_digits=True, remove_stop_words=True)
test_removed_all = clean_text(test_raw_df, remove_digits=True, remove_stop_words=True)

In [133]:
train_removed_all

Unnamed: 0,Class Index,text
0,3,"[reuters, shortsellers, wall, street, dwindling, band, ultracynics, seeing, green]"
1,3,"[reuters, private, investment, firm, carlyle, group, reputation, making, welltimed, occasionally, controversial, plays, defense, industry, quietly, placed, bets, another, part, market]"
2,3,"[reuters, soaring, crude, prices, plus, worries, economy, outlook, earnings, expected, hang, stock, market, next, week, depth, summer, doldrums]"
3,3,"[reuters, authorities, halted, oil, export, flows, main, pipeline, southern, iraq, intelligence, showed, rebel, militia, could, strike, infrastructure, oil, official, said, saturday]"
4,3,"[afp, tearaway, world, oil, prices, toppling, records, straining, wallets, present, new, economic, menace, barely, three, months, us, presidential, elections]"
...,...,...
119995,1,"[karachi, reuters, pakistani, president, pervez, musharraf, said, stay, army, chief, reneging, pledge, quit, powerful, post, end, year]"
119996,2,"[red, sox, general, manager, theo, epstein, acknowledged, edgar, renteria, luxury, red, sox, necessity, nothing, wrong, getting, keys, bmw, fourtime, star, twotime, gold, glover, eyes, red, sox]"
119997,2,"[miami, dolphins, put, courtship, lsu, coach, nick, saban, hold, comply, nfl, hiring, policy, interviewing, least, one, minority, candidate, team, source, told, associated, press, last, night]"
119998,2,"[pittsburgh, ny, giants, time, p, line, steelers, records, steelers, giants, vs, spread, steelers, giants, series, giants, lead, comments, think, giants, knew, ben, roethlisberger, available, draft, day, broke, bank, traded, eli, manning, big, ben, done, year, complete]"


In [134]:
test_removed_all

Unnamed: 0,text
0,"[unions, representing, workers, turner, newall, say, disappointed, talks, stricken, parent, firm, federal, mogul]"
1,"[spacecom, toronto, canada, second, team, rocketeers, competing, million, ansari, x, prize, contest, privately, funded, suborbital, space, flight, officially, announced, first, launch, date, manned, rocket]"
2,"[ap, company, founded, chemistry, researcher, university, louisville, grant, develop, method, producing, better, peptides, short, chains, amino, acids, building, blocks, proteins]"
3,"[ap, barely, dawn, mike, fitzpatrick, starts, shift, blur, colorful, maps, figures, endless, charts, already, knows, day, bring, lightning, strike, places, expects, winds, pick, moist, places, dry, flames, roar]"
4,"[ap, southern, california, smogfighting, agency, went, emissions, bovine, variety, friday, adopting, nation, first, rules, reduce, air, pollution, dairy, cow, manure]"
...,...
7595,"[ukrainian, presidential, candidate, viktor, yushchenko, poisoned, harmful, known, dioxin, contained, agent, orange, scientist, analyzed, blood, said, friday]"
7596,"[supply, attractive, pitching, options, dwindling, daily, lost, pedro, martinez, mets, missed, tim, hudson, resigned, randy, johnson, becoming, yankee, red, sox, struck, last, night, coming, terms, free, agent, matt, clement, threeyear, deal, pay, righthander, neighborhood]"
7597,"[like, roger, clemens, almost, exactly, eight, years, earlier, pedro, martinez, left, red, sox, apparently, bitter, way, treated, management]"
7598,"[singapore, doctors, united, states, warned, painkillers, bextra, celebrex, may, linked, major, cardiovascular, problems, prescribed]"


In [135]:
print("Nulls count:")
print("\tRemoved both:", train_removed_all['text'].isna().sum())

Nulls count:
	Removed both: 0


In [136]:
def lemmatizing(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in text)

In [137]:
class Vectorizer:
    def __init__(self, vect_type, params = None) -> None:
        self.vect_type = vect_type
        self.params = params
    
    def _split_text(self, text):
        return [sentence.split() for sentence in text]
    
    def fit(self, text):
        if self.vect_type == "word2vec":
            self.vectorizer = Word2Vec(sentences=self._split_text(text), **self.params)
            
        elif self.vect_type == "fasttext":
            self.vectorizer = FastText(sentences=self._split_text(text), **self.params)
            
        elif self.vect_type == "tfidf":
            self.vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)
            self.vectorizer.fit(text)
            
    def transform(self, text):
        if self.vect_type == "tfidf":
            return self.vectorizer.transform(text).toarray()
        else:
            embeddings = []
            vector = self.vectorizer.wv
            null_vect = np.zeros(self.params['vector_size'])

            for sentence in self._split_text(text):
                vectors = [vector[token] if token in vector else null_vect for token in sentence]
                embeddings.append(np.mean(vectors, axis=0))
            
            return np.array(embeddings)

Разделяем на трейн и тест

In [138]:
data = train_removed_all
X = data[['text']].copy()
y = data[['Class Index']].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31)

In [139]:
test_data = test_removed_all
X_test_subm = test_data[['text']].copy()

Делаем лемматизацию

In [140]:
X_train['text'] = X_train['text'].apply(lemmatizing)
X_test['text'] = X_test['text'].apply(lemmatizing)

X_test_subm['text'] = X_test_subm['text'].apply(lemmatizing)

Векторизуем предложения

In [141]:
params = {'vector_size': 100,
            'window': 5,
            'min_count': 4,
            'workers': 4,
         }

vectorizer = Vectorizer("word2vec", params)

vectorizer.fit(X_train['text'])
            
X_train_vectorized = vectorizer.transform(X_train['text'])
X_test_vectorized = vectorizer.transform(X_test['text'])

X_test_subm_vectorized = vectorizer.transform(X_test_subm['text'])

Пробуем MLP

In [142]:
class NewsDataset(Dataset):
    def __init__(self, X: np.array, y: np.array=None):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x = self.X[idx]
        if self.y is not None:
            y = self.y[idx]
            y = torch.tensor(y, dtype=torch.float32)
        x = torch.tensor(x, dtype=torch.float32)
        if self.y is not None:
            return x, y
        
        return x

In [143]:
y_train_ = np.array(y_train['Class Index']) - 1
y_test_ = np.array(y_test['Class Index']) - 1

train_dataset = NewsDataset(X_train_vectorized, y_train_)
test_dataset = NewsDataset(X_test_vectorized, y_test_)

test_dataset_no_gt = NewsDataset(X_test_vectorized)

# train_size = int(0.8 * len(train_dataset))
# val_size = len(train_dataset) - train_size

# Разделение датасета
# train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
# val_loader = Dataset(val_dataset)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=True)
test_loader_no_gt = DataLoader(test_dataset_no_gt, batch_size=128, shuffle=False)

In [144]:
test_subm_dataset = NewsDataset(X_test_subm_vectorized)
test_subm_loader = DataLoader(test_subm_dataset, batch_size=128, shuffle=False)

In [145]:
def predict(model, dataloader, conv=False):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch_x in dataloader:
            batch_x = batch_x.float()
            if conv:
                batch_x = batch_x.unsqueeze(2)
            outputs = model(batch_x)
            logits = F.softmax(outputs)
            _, preds = torch.max(logits, 1)

            all_preds.extend(preds.cpu().numpy())
            
    return all_preds

In [146]:
criterion = nn.CrossEntropyLoss()
def evaluate(model, dataloader, conv=False):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        total_loss = 0.0
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.float(), batch_y.long()
            if conv:
                batch_x = batch_x.unsqueeze(2)
            outputs = model(batch_x)
            logits = F.softmax(outputs)
            _, preds = torch.max(logits, 1)
            
            loss = criterion(outputs, batch_y)
            
            total_loss += loss.item()
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_y.cpu().numpy())

    # Вычисляем F1 Score
    val_loss = total_loss / len(dataloader)
    # print(f"Val loss: {val_loss:.4f}")
    f1 = f1_score(all_labels, all_preds, average='micro')
    # print(f"F1 Score on test: {f1:.4f}")
    return f1, loss

In [147]:
def train(model, dataloader, criterion, optimizer, num_epochs=10, conv=False, name='MLP'):
    model.train()
    train_losses = []
    val_losses = []
    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch_x, batch_y in dataloader:
            
            batch_x, batch_y = batch_x.float(), batch_y.long()
            if conv:
                batch_x = batch_x.unsqueeze(2)
                
            # print(batch_x.shape)
            
            outputs = model(batch_x)
            logits = F.softmax(outputs)
            
            loss = criterion(outputs, batch_y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        train_loss = total_loss / len(dataloader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}")
        
        f1, val_loss = evaluate(model, test_loader, conv=conv)
        print(f'Val loss: {val_loss}')
        
        if len(val_losses)==0 or val_loss < min_val_loss:
            min_val_loss = val_loss
            torch.save(model.state_dict(), f'{name}.pt')
        
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        
    return train_losses, val_losses

In [110]:
input_dim = 100
hidden_dim = 128
output_dim = 4


model = MLP(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.RAdam(model.parameters(), lr=0.001)
# optimizer = optim.SGD(model.parameters(), lr=0.001)

num_epochs = 20
train_losses, val_losses = train(model, train_loader, criterion, optimizer, num_epochs=num_epochs)
f1, loss = evaluate(model, test_loader)
print(f'F1 score on test: {f1}')

Epoch [1/20], Loss: 0.5070
Val loss: 0.49181362986564636
Epoch [2/20], Loss: 0.3520
Val loss: 0.20840038359165192
Epoch [3/20], Loss: 0.3375
Val loss: 0.3488050401210785
Epoch [4/20], Loss: 0.3280
Val loss: 0.3420586884021759
Epoch [5/20], Loss: 0.3182
Val loss: 0.2371065467596054
Epoch [6/20], Loss: 0.3108
Val loss: 0.28661903738975525
Epoch [7/20], Loss: 0.3042
Val loss: 0.2170906364917755
Epoch [8/20], Loss: 0.2965
Val loss: 0.3579075336456299
Epoch [9/20], Loss: 0.2892
Val loss: 0.37132906913757324
Epoch [10/20], Loss: 0.2824
Val loss: 0.4010707139968872
Epoch [11/20], Loss: 0.2740
Val loss: 0.2295883148908615
Epoch [12/20], Loss: 0.2685
Val loss: 0.19815240800380707
Epoch [13/20], Loss: 0.2605
Val loss: 0.22525005042552948
Epoch [14/20], Loss: 0.2531
Val loss: 0.3871518075466156
Epoch [15/20], Loss: 0.2481
Val loss: 0.2863050401210785
Epoch [16/20], Loss: 0.2390
Val loss: 0.2932402491569519
Epoch [17/20], Loss: 0.2324
Val loss: 0.42701688408851624
Epoch [18/20], Loss: 0.2251
Val l

In [111]:
best_model = MLP(input_dim, hidden_dim, output_dim)
best_model.load_state_dict(torch.load('MLP.pt'))

f1, loss = evaluate(best_model, test_loader)
print(f'F1 score on test: {f1}')
print(f'Loss: {loss}')

F1 score on test: 0.8856666666666667
Loss: 0.33350616693496704


In [112]:
test_preds = predict(best_model, test_loader_no_gt)

In [None]:
f1_score(y_test_, test_preds, average='micro')

0.8856666666666667

## Submission

In [114]:
# preds = predict(best_model, test_subm_loader)

# preds = np.array(preds) + 1
# ids = [i for i in range(len(preds))]

# subm_df = pd.DataFrame({'ID': ids, 'Class Index': preds})
# subm_df.to_csv('submission.csv', index=False)

Пробуем обучить CNN

In [157]:
from models import Conv1DNet

input_dim = 100
hidden_dim = 128
output_dim = 4


model = Conv1DNet(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.RAdam(model.parameters(), lr=0.0001)
# optimizer = optim.SGD(model.parameters(), lr=0.001)

num_epochs = 20
train_losses, val_losses = train(model, train_loader, criterion, optimizer, num_epochs=num_epochs, conv=True,
                                 name='convNet')
f1, loss = evaluate(model, test_loader, conv=True)
print(f'F1 score on test: {f1}')

Epoch [1/20], Loss: 0.7910
Val loss: 0.45811891555786133
Epoch [2/20], Loss: 0.3804
Val loss: 0.4449732005596161
Epoch [3/20], Loss: 0.3578
Val loss: 0.36611101031303406
Epoch [4/20], Loss: 0.3460
Val loss: 0.32485517859458923
Epoch [5/20], Loss: 0.3376
Val loss: 0.35238224267959595
Epoch [6/20], Loss: 0.3309
Val loss: 0.31463560461997986
Epoch [7/20], Loss: 0.3253
Val loss: 0.22119060158729553
Epoch [8/20], Loss: 0.3202
Val loss: 0.29986414313316345
Epoch [9/20], Loss: 0.3158
Val loss: 0.5367271900177002
Epoch [10/20], Loss: 0.3112
Val loss: 0.2524969279766083
Epoch [11/20], Loss: 0.3077
Val loss: 0.41673150658607483
Epoch [12/20], Loss: 0.3040
Val loss: 0.5107138752937317
Epoch [13/20], Loss: 0.3006
Val loss: 0.4878309667110443
Epoch [14/20], Loss: 0.2979
Val loss: 0.2640914022922516
Epoch [15/20], Loss: 0.2951
Val loss: 0.15732546150684357
Epoch [16/20], Loss: 0.2916
Val loss: 0.3240918815135956
Epoch [17/20], Loss: 0.2887
Val loss: 0.21421320736408234
Epoch [18/20], Loss: 0.2862
Va

In [156]:
# import matplotlib.pyplot as plt

# plt.plot(range(1, num_epochs + 1), train_losses)
# plt.plot(range(1, num_epochs + 1), val_losses)

In [None]:
# f1, loss = evaluate(model, test_loader, conv=True)
# print(f'F1 score on test: {f1}')

F1 score on test: 0.8824166666666666


In [174]:
best_model = Conv1DNet(input_dim, hidden_dim, output_dim)
best_model.load_state_dict(torch.load('convNet.pt'))

f1, loss = evaluate(best_model, test_loader, conv=True)
print(f'F1 score on test: {f1}')
print(f'Loss: {loss}')

F1 score on test: 0.8849583333333333
Loss: 0.20451447367668152


In [175]:
test_preds = predict(best_model, test_loader_no_gt, conv=True)
f1_score(y_test_, test_preds, average='micro')

0.8849583333333333

Submission

In [176]:
preds = predict(best_model, test_subm_loader, conv=True)

preds = np.array(preds) + 1
ids = [i for i in range(len(preds))]

subm_df = pd.DataFrame({'ID': ids, 'Class Index': preds})
subm_df.to_csv('submission.csv', index=False)