In [5]:
# get news file names
import os
def get_filenames(dir_name):
    """Return the names of the entries in ``dir_name`` in sorted order.

    ``os.listdir`` yields entries in arbitrary order, so the result is sorted
    to keep positional lookups on the loaded data stable between runs and
    machines.

    Args:
        dir_name: Path of the directory to list.

    Returns:
        A sorted list of bare entry names (not joined with ``dir_name``).

    Raises:
        OSError: If ``dir_name`` cannot be listed.
    """
    return sorted(os.listdir(dir_name))
    
# Directory that holds the news articles.
dir_name = 'news_data'
# Prefix every entry name with the directory so each path can be handed
# straight to open().
filenames = [os.path.join(dir_name, name) for name in get_filenames(dir_name)]
# Display how many files were found.
len(filenames)

80

In [24]:
# Given a list of filenames, return (list of texts, list of classes).
# class: 0 (baseball), 1 (soccer)
def get_contents(filenames):
    """Read every file and derive its class label from the file name.

    The label is taken from the integer prefix of the file's base name (the
    part before the first ``_``): a prefix ``<= 4`` maps to class 0
    (baseball), anything larger to class 1 (soccer).

    Args:
        filenames: Iterable of paths to text files. Only the base name is used
            for the label, so the paths may be relative or absolute and may
            contain any number of directory components.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of file contents and ``y``
        the parallel list of integer labels. Files that cannot be read,
        decoded or labelled are reported with ``print`` and skipped, so both
        lists always have the same length.
    """
    x, y = [], []
    for file in filenames:
        try:
            # Parse the label from the base name rather than from a fixed
            # position in the split path, which breaks for absolute or nested paths.
            y_class = os.path.basename(file).split('_')[0]
            y_class = 0 if int(y_class) <= 4 else 1
            # use cp949 encoding because the txt files were created on Windows
            with open(file, 'r', encoding='cp949') as f:
                text = f.read()
        except (OSError, UnicodeError, ValueError) as e:
            # Best effort: report the problematic file and carry on.
            print(file, e)
            continue

        # Append only after both steps succeeded so x and y stay aligned.
        x.append(text)
        y.append(y_class)

    return x, y
            
# Load all articles: x receives the texts and y the parallel class labels.
x, y = get_contents(filenames)
# Sanity check: show the first label together with its article text.
print(y[0], x[0])

0 It was not the shot heard around the world, it was not a playoff clinching hit or a season turning moment as far as we can tell. What we do know for certain is that Seattle Mariner first baseman Dae-Ho Lee's walk-off homer two-run homer with two outs in the bottom of the 10th inning Wednesday afternoon snapped a five game home losing streak and gave the M's (3-6) a 4-2 win over the Texas Rangers (5-5) for their first win at home.

The walk-off wonder strike by the 6-foot-4, 250-pound Korean national was a gem of a gift to the 15, 075 Mariner faithful who showed up to cheer their team despite watching one of the worst home opening series by Mariners. The M's team batting average during the home stand was a dismal .170 average with a home run percentage of 0.8 and an ERA of 4.50.

"We really, really needed it," manager Scott Servais said of the win and a players-only meeting that was held after an 8-0 loss to Texas on Tuesday. "It was an important game. The guys got together after the 

In [54]:
import string


# clean given string
def get_cleaned_text(text):
    """Normalise ``text`` into lowercase alphabetic words joined by spaces.

    Steps: delete every character in ``string.punctuation``, lowercase the
    result, split on whitespace, then keep only tokens that are longer than
    one character and consist solely of letters.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined with single spaces (``''`` if none remain).
    """
    # str.maketrans(x1, x2, x3) maps chars in x1 to chars in x2 and deletes
    # those in x3; with empty x1/x2 it only deletes punctuation.
    strip_punct = str.maketrans('', '', string.punctuation)
    lowered = text.translate(strip_punct).lower()

    kept = []
    for token in lowered.split():
        # Drop 1-char tokens (e.g. 'a') and anything containing a non-letter
        # such as digits.
        if len(token) > 1 and token.isalpha():
            kept.append(token)

    return ' '.join(kept)
    

# Given a list of documents, return a word -> index dict.
def get_corpus_dict(docs):
    """Build a vocabulary mapping each distinct cleaned word to an index.

    Every document is passed through ``get_cleaned_text`` and its words are
    pooled into one vocabulary. Indices are assigned in sorted word order so
    that the mapping is identical on every run; iterating the raw set would
    make the indices depend on string hashing, which can differ between
    interpreter sessions.

    Args:
        docs: Iterable of raw document strings.

    Returns:
        A dict ``{word: idx}`` with ``idx`` running from 0 to
        ``len(vocabulary) - 1``.
    """
    words = set()
    for doc in docs:
        words.update(get_cleaned_text(doc).split())

    # Sort before enumerating to get a deterministic word -> index assignment.
    return {word: idx for idx, word in enumerate(sorted(words))}

# Build the word -> index vocabulary from every loaded article.
corpus_dict = get_corpus_dict(x)

In [62]:
# Return the word-count vector that represents the given document.
def doc_to_vec(doc, corpus_dict):
    """Convert a document into a word-count vector over ``corpus_dict``.

    Args:
        doc: Raw document string; it is cleaned with ``get_cleaned_text``
            before counting.
        corpus_dict: Mapping ``{word: idx}`` that defines the vector layout.

    Returns:
        A list of ``len(corpus_dict)`` integers where position ``idx`` holds
        the number of occurrences of the corresponding word. Words that are
        not in ``corpus_dict`` are ignored instead of raising ``KeyError``,
        so documents outside the original corpus can be vectorised too.
    """
    vec = [0] * len(corpus_dict)
    for word in get_cleaned_text(doc).split():
        idx = corpus_dict.get(word)
        if idx is None:
            # Out-of-vocabulary word: it has no dimension in the vector.
            continue
        vec[idx] += 1

    return vec

In [71]:
import math

# Return the cosine similarity between two vectors.
def cosine_sim(v1, v2):
    """Return the cosine similarity of two numeric vectors.

    Args:
        v1: Sequence of numbers.
        v2: Sequence of numbers; elements are paired with ``v1`` via ``zip``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)`` as a float. If either vector has zero
        length (e.g. a document with no words left after cleaning), the
        similarity is undefined and 0.0 is returned instead of raising
        ``ZeroDivisionError``.
    """
    inner_prod = sum(a * b for (a, b) in zip(v1, v2))
    size_v1 = math.sqrt(sum(a ** 2 for a in v1))
    size_v2 = math.sqrt(sum(a ** 2 for a in v2))

    denom = size_v1 * size_v2
    if denom == 0:
        # A zero vector has no direction; treat it as similar to nothing.
        return 0.0

    return inner_prod / denom
    
# Sanity check: identical vectors should give a similarity of 1.0.
cosine_sim([1,0], [1,0])

1.0

In [84]:
# For each doc, find the top-n most similar documents and compare their classes: 0 (baseball), 1 (soccer)
from tqdm import tqdm

# Leave-one-out evaluation: for every document, look at its n most similar
# other documents and count how many share the document's class.
n = 10
ans = 0    # number of neighbours whose class matches the query's class
count = 0  # total number of neighbours inspected

# Vectorise every document once up front; the vectors do not depend on which
# document is the query, so rebuilding them per iteration is wasted work.
vec2s = [doc_to_vec(doc2, corpus_dict) for doc2 in x]

for i in tqdm(range(len(x))):
    vec1 = vec2s[i]
    y_class = y[i]
    sims = [cosine_sim(vec1, vec2) for vec2 in vec2s]

    # Rank the *other* documents by cosine similarity DESC. The query is
    # excluded by index rather than by assuming it sorts first, which does not
    # hold when another document ties with it (e.g. duplicates).
    topn = sorted((j for j in range(len(sims)) if j != i),
                  key=lambda j: -sims[j])[:n]

    # for accuracy
    for idx in topn:
        y_pred = y[idx]
        if y_pred == y_class:
            ans += 1
    # Fewer than n neighbours exist when the corpus has n or fewer documents.
    count += len(topn)

# Fraction of inspected neighbours that share the query's class.
print(ans / count if count else float('nan'))

100%|██████████| 80/80 [00:30<00:00,  2.69it/s]

0.70125



