In [1]:
import pandas as pd
import numpy as np
import json
import re

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from gensim.models import FastText, Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.filterwarnings("ignore")

pd.set_option('max_colwidth', 0)

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
[nltk_data] Downloading package wordnet to /home/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
train_raw_df = pd.read_csv('data/train.csv')
test_raw_df = pd.read_csv('data/test.csv')

In [3]:
train_raw_df

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."
1,3,Carlyle Looks Toward Commercial Aerospace (Reuters),"Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market."
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
3,3,Iraq Halts Oil Exports from Main Southern Pipeline (Reuters),"Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday."
4,3,"Oil prices soar to all-time record, posing new menace to US economy (AFP)","AFP - Tearaway world oil prices, toppling records and straining wallets, present a new economic menace barely three months before the US presidential elections."
...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army Chief,"KARACHI (Reuters) - Pakistani President Pervez Musharraf has said he will stay on as army chief, reneging on a pledge to quit the powerful post by the end of the year."
119996,2,Renteria signing a top-shelf deal,"Red Sox general manager Theo Epstein acknowledged Edgar Renteria was more a luxury for the 2005 Red Sox than a necessity. But there's nothing wrong with getting the keys to a BMW, and that's what the four-time All-Star and two-time Gold Glover is in the eyes of the Red Sox."
119997,2,Saban not going to Dolphins yet,"The Miami Dolphins will put their courtship of LSU coach Nick Saban on hold to comply with the NFL's hiring policy by interviewing at least one minority candidate, a team source told The Associated Press last night."
119998,2,Today's NFL games,"PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: Steelers by 10. Records: Steelers 12-1, Giants 5-8. Vs. spread: Steelers 10-1-2, Giants 5-8. Series: Giants lead, 43-27-3. Comments: Think the Giants knew Ben Roethlisberger was available on draft day when they broke the bank and traded for Eli Manning? . . . All Big Ben has done this year is complete ..."


# Cleaning data

Очевидно, что удаление знаков препинания и других символов необходимо, чтобы избежать кодирования не несущих смысл символов, сделаем это для всех экспериментов. Затем сравним удаление стоп-слов, удаление цифр и их комбирование.

In [4]:
print("Nulls count:", train_raw_df.isna().sum(), sep='\n')
print()
print("Empty string count:", train_raw_df.eq('').sum(), sep='\n')

Nulls count:
Class Index    0
Title          0
Description    0
dtype: int64

Empty string count:
Class Index    0
Title          0
Description    0
dtype: int64


In [5]:
def clean_text(df, remove_digits=False, remove_stop_words=False):
    train_df = train_raw_df.copy(deep=True)

    train_df['text'] = train_raw_df['Title'] + " " + train_raw_df['Description']
    train_df.drop(columns=['Title', 'Description'], inplace=True)

    train_df['text'] = train_df['text'].str.replace("\\", " ").str.lower()
    
    if remove_digits:
        train_df['text'] = train_df['text'].str.replace(r'\d', ' ', regex=True)

    if remove_stop_words:
        stop_words = stopwords.words('english')
        stop_words_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in stop_words) + r')\b'
        train_df['text'] = train_df['text'].str.replace(stop_words_pattern, ' ', regex=True)

    train_df['text'] = train_df['text'].str.replace(r'[^\d\w\s]', '', regex=True)
    train_df['text'] = train_df['text'].apply(word_tokenize, preserve_line=True)

    return train_df

In [6]:
train_removed_digits = clean_text(train_raw_df, remove_digits=True)
train_removed_stopw = clean_text(train_raw_df, remove_stop_words=True)
train_removed_all = clean_text(train_raw_df, remove_digits=True, remove_stop_words=True)

In [7]:
train_removed_all

Unnamed: 0,Class Index,text
0,3,"[wall, st, bears, claw, back, black, reuters, reuters, shortsellers, wall, street, dwindling, band, ultracynics, seeing, green]"
1,3,"[carlyle, looks, toward, commercial, aerospace, reuters, reuters, private, investment, firm, carlyle, group, reputation, making, welltimed, occasionally, controversial, plays, defense, industry, quietly, placed, bets, another, part, market]"
2,3,"[oil, economy, cloud, stocks, outlook, reuters, reuters, soaring, crude, prices, plus, worries, economy, outlook, earnings, expected, hang, stock, market, next, week, depth, summer, doldrums]"
3,3,"[iraq, halts, oil, exports, main, southern, pipeline, reuters, reuters, authorities, halted, oil, export, flows, main, pipeline, southern, iraq, intelligence, showed, rebel, militia, could, strike, infrastructure, oil, official, said, saturday]"
4,3,"[oil, prices, soar, time, record, posing, new, menace, us, economy, afp, afp, tearaway, world, oil, prices, toppling, records, straining, wallets, present, new, economic, menace, barely, three, months, us, presidential, elections]"
...,...,...
119995,1,"[pakistan, musharraf, says, quit, army, chief, karachi, reuters, pakistani, president, pervez, musharraf, said, stay, army, chief, reneging, pledge, quit, powerful, post, end, year]"
119996,2,"[renteria, signing, topshelf, deal, red, sox, general, manager, theo, epstein, acknowledged, edgar, renteria, luxury, red, sox, necessity, nothing, wrong, getting, keys, bmw, fourtime, star, twotime, gold, glover, eyes, red, sox]"
119997,2,"[saban, going, dolphins, yet, miami, dolphins, put, courtship, lsu, coach, nick, saban, hold, comply, nfl, hiring, policy, interviewing, least, one, minority, candidate, team, source, told, associated, press, last, night]"
119998,2,"[today, nfl, games, pittsburgh, ny, giants, time, p, line, steelers, records, steelers, giants, vs, spread, steelers, giants, series, giants, lead, comments, think, giants, knew, ben, roethlisberger, available, draft, day, broke, bank, traded, eli, manning, big, ben, done, year, complete]"


In [8]:
print("Nulls count:")
print("\tRemoved digits:", train_removed_digits['text'].isna().sum())
print("\tRemoved stop words:", train_removed_stopw['text'].isna().sum())
print("\tRempved both:", train_removed_all['text'].isna().sum())

Nulls count:
	Removed digits: 0
	Removed stop words: 0
	Rempved both: 0


In [9]:
def lemmatizing(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in text)

def stemming(text):
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in text)

In [10]:
class Vectorizer:
    def __init__(self, vect_type, params = None) -> None:
        self.vect_type = vect_type
        self.params = params
    
    def _split_text(self, text):
        return [sentence.split() for sentence in text]
    
    def fit(self, text):
        if self.vect_type == "word2vec":
            self.vectorizer = Word2Vec(sentences=self._split_text(text), **self.params)
            
        elif self.vect_type == "fasttext":
            self.vectorizer = FastText(sentences=self._split_text(text), **self.params)
            
        elif self.vect_type == "tfidf":
            self.vectorizer = TfidfVectorizer(ngram_range=(1, 3))
            self.vectorizer.fit(text)
            
    def transform(self, text):
        if self.vect_type == "tfidf":
            return self.vectorizer.transform(text).toarray()
        else:
            embeddings = []
            for sentence in text:
                vector = self.vectorizer.wv
                null_emb = np.zeros(self.params['vector_size'])
                embeddings.append(np.mean([vector[token] if token in vector else null_emb for token in sentence ], axis=0))
                
            return np.array(embeddings)   

In [11]:
def fit_predict(X_train, y_train, X_test):
    
    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    return y_pred

In [12]:
# data_cleaning_dict = {"Removed digits" : train_removed_digits, 
#                       "Removed stop words" : train_removed_stopw,
#                       "Removed digits and stop words" : train_removed_all}

# tokenizing_type_dict = {"lemmatizing" : lemmatizing, "stemming" : stemming}

In [13]:
# for data_cleaning_type, data in data_cleaning_dict.items():
    
#     X = data[['text']].copy()
#     y = data[['Class Index']].copy()

#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31)
#     del X, y
    
#     for tokenizing_type, tokenizing_func in tokenizing_type_dict.items():
        
#         X_train['text'] = X_train['text'].apply(tokenizing_func)
#         X_test['text'] = X_test['text'].apply(tokenizing_func)

#         for vectorization_type in ["tfidf", "word2vec", "fasttext"]:
#             if vectorization_type == "tfidf":
#                 params = None
#             else:
#                 params = {'vector_size': 100,
#                         'window': 5,
#                         'min_count': 1,
#                         'workers': 4
#                     }
                
#             vectorizer = Vectorizer(vectorization_type, params)
#             vectorizer.fit(X_train['text'])
            
#             X_train_vectorized = vectorizer.transform(X_train['text'])
#             X_test_vectorized = vectorizer.transform(X_test['text'])
            
#             y_pred = fit_predict(X_train_vectorized, y_train, X_test_vectorized)
            
#             print(f"Cleaning type: {data_cleaning_type}, tokenizing type: {tokenizing_type}, vectorization_type: {vectorization_type}")
#             print("f1 score: ", f1_score(y_test, y_pred))

In [12]:
data = train_removed_all
X = data[['text']].copy()
y = data[['Class Index']].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31)

In [13]:
X_train['text'] = X_train['text'].apply(lemmatizing)
X_test['text'] = X_test['text'].apply(lemmatizing)

In [14]:
params = {'vector_size': 100,
            'window': 5,
            'min_count': 1,
            'workers': 4
         }

vectorizer = Vectorizer("word2vec", params)

vectorizer.fit(X_train['text'])
            
X_train_vectorized = vectorizer.transform(X_train['text'])
X_test_vectorized = vectorizer.transform(X_test['text'])

In [15]:
y_pred = fit_predict(X_train_vectorized, y_train, X_test_vectorized)
f1_score(y_test, y_pred, average="micro")

0.46216666666666667

In [None]:
class NewsDataset(Dataset):
    def __init__(self, X: np.array, y: np.array):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.y[idx]
        
        x = torch.tensor(x, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)
        
        return x, y

In [80]:
y_train_ = np.array(y_train['Class Index']) - 1
y_test_ = np.array(y_test['Class Index']) - 1

train_dataset = NewsDataset(X_train_vectorized, y_train_)
test_dataset = NewsDataset(X_test_vectorized, y_test_)

train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=True)

In [57]:
from models import MLP
import torch.nn as nn
import torch.optim as optim

In [74]:
def train(model, dataloader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch_x, batch_y in dataloader:
            
            batch_x, batch_y = batch_x.float(), batch_y.long()
            
            outputs = model(batch_x)
            
            loss = criterion(outputs, batch_y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(dataloader):.4f}")

In [None]:
def evaluate(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.float(), batch_y.float()
            outputs = model(batch_x)
            _, preds = torch.max(outputs, 1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_y.cpu().numpy())

    # Вычисляем F1 Score
    f1 = f1_score(all_labels, all_preds, average='micro')
    print(f"F1 Score: {f1:.4f}")
    return f1

In [None]:
input_dim = 100
hidden_dim = 64
output_dim = 4


model = MLP(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 50
train(model, train_loader, criterion, optimizer, num_epochs=num_epochs)
f1 = evaluate(model, test_loader)

Epoch [1/50], Loss: 1.3257
Epoch [2/50], Loss: 1.2485
Epoch [3/50], Loss: 1.2287
Epoch [4/50], Loss: 1.2199
Epoch [5/50], Loss: 1.2158
Epoch [6/50], Loss: 1.2115
Epoch [7/50], Loss: 1.2084
Epoch [8/50], Loss: 1.2055
Epoch [9/50], Loss: 1.2020
Epoch [10/50], Loss: 1.1999
Epoch [11/50], Loss: 1.1968
Epoch [12/50], Loss: 1.1959
Epoch [13/50], Loss: 1.1947
Epoch [14/50], Loss: 1.1933
Epoch [15/50], Loss: 1.1872
Epoch [16/50], Loss: 1.1867
Epoch [17/50], Loss: 1.1838
Epoch [18/50], Loss: 1.1821
Epoch [19/50], Loss: 1.1814
Epoch [20/50], Loss: 1.1779
Epoch [21/50], Loss: 1.1770
Epoch [22/50], Loss: 1.1771
Epoch [23/50], Loss: 1.1746
Epoch [24/50], Loss: 1.1706
Epoch [25/50], Loss: 1.1699
Epoch [26/50], Loss: 1.1710
Epoch [27/50], Loss: 1.1687
Epoch [28/50], Loss: 1.1679
Epoch [29/50], Loss: 1.1659
Epoch [30/50], Loss: 1.1649
Epoch [31/50], Loss: 1.1637
Epoch [32/50], Loss: 1.1623
Epoch [33/50], Loss: 1.1654
Epoch [34/50], Loss: 1.1640
Epoch [35/50], Loss: 1.1605
Epoch [36/50], Loss: 1.1623
E