# News Categorization

## Keywords
- Categorization
- One-hot Encoding
- Bag of Words
- Cosine Distance
- Corpus
- List Comprehension

## Purpose
    Create a 'News Categorization' program using only pure Python code.

## Process
    1. 파일을 불러오기
    2. 파일을 읽어서 단어사전(corpus) 만들기
    3. 단어별로 index 만들기
    4. 만들어진 index로 문서별 BOW 생성하기
    5. 비교하고자 하는 문서 비교하기
    6. 얼마나 맞는지 측정하기

In [1]:
import os

def get_file_list(dir_name):
    """Return the names of the entries found in the directory *dir_name*.

    The names are returned exactly as ``os.listdir`` reports them (bare
    names, without the directory prefix).
    """
    entries = os.listdir(dir_name)
    return entries

In [3]:
# Notebook cell: list the entries of the "news" directory (the cell output follows).
get_file_list("news")

['1_Dae-Ho Lee walk-off homer gives Mariners 4-2 win over Rangers.txt',
 '1_Korean First Baseman Dae-Ho Lee Becomes Free Agent, Interested In MLB Deal.txt',
 '1_Lee Dae-ho Announces MLB Aspirations.txt',
 '1_Lee Dae-ho to Start Spring Training in Arizona.txt',
 '1_Lee Dae-ho wins MVP in Japan Series.txt',
 "1_Mariners' Lee Dae-ho belts a walk-off homer.txt",
 '1_Mariners’ Lee Dae-ho gets 1st two-hit game, double.txt',
 '1_MLB Team Interested In Dae-Ho Lee.txt',
 "1_Seattle Mariners' Newest Signing Dae-Ho Lee Could Become Fan Favorite.txt",
 '1_SoftBank Hawks Hope to Renew Contract with Lee Dae-ho.txt',
 '2_Dodgers left with questions after latest Hyun-Jin Ryu setback.txt',
 '2_Dodgers left-hander Hyun-Jin Ryu expects to be ready for start of season.txt',
 '2_Dodgers unsure when Hyun-Jin Ryu will throw another bullpen session.txt',
 '2_Dodgers will take it slowly with pitcher Hyun-Jin Ryu, whose health could be a key to their season.txt',
 '2_Hyun-Jin Ryu downplays long break between bu

In [6]:
def get_contents(file_list):
    """Read every news file and derive its class label from the file name.

    The last path component of each entry must start with an integer
    category followed by an underscore (e.g. ``news/1_some title.txt``).
    Categories 1-4 are mapped to the label ``"0"`` and 5-8 to ``"1"``.

    Args:
        file_list: Iterable of paths to text files encoded as cp949.

    Returns:
        A tuple ``(x_text, y_class)`` of two equally long lists holding the
        file contents and the matching labels, in ``file_list`` order.
        Files that cannot be decoded are reported on stdout and skipped.

    Raises:
        ValueError: If a file name does not start with an integer category.
        KeyError: If the category is not one of 1-8.
        OSError: If a file cannot be opened.
    """
    y_class = []
    x_text = []
    class_dict = {
        1: "0", 2: "0", 3: "0", 4: "0", 5: "1", 6: "1", 7: "1", 8: "1"}

    for file_name in file_list:
        try:
            # ``with`` closes the file even when decoding fails.
            with open(file_name, "r", encoding="cp949") as f:
                # Use the base name so the path may have any directory depth.
                category = int(os.path.basename(file_name).split("_")[0])
                label = class_dict[category]
                content = f.read()
        except UnicodeDecodeError as e:
            print(e)
            print(file_name)
            continue
        # Append both values only after the read succeeded, so that the two
        # lists can never get out of step with each other.
        x_text.append(content)
        y_class.append(label)
    return x_text, y_class

In [10]:
def get_cleaned_text(text):
    """Return *text* lower-cased with every non-word character removed.

    Word characters are those matched by the regex class ``\\w`` (letters,
    digits and underscore); everything else, including whitespace and
    punctuation, is dropped. A token made only of punctuation therefore
    becomes the empty string.
    """
    import re
    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    return re.sub(r'\W+', '', text.lower())

def get_corpus_dict(text):
    """Build the word dictionary (corpus) for a collection of documents.

    Every document is split on whitespace and each token is normalised with
    ``get_cleaned_text``. Each distinct cleaned word receives a unique index
    in ``range(number_of_distinct_words)``.

    Args:
        text: Iterable of document strings.

    Returns:
        An ``OrderedDict`` mapping cleaned word -> index. Indexes follow the
        order in which words first appear, so the mapping is identical on
        every run (iterating a ``set`` of strings is not, because string
        hashing is randomised per process).
    """
    from collections import OrderedDict

    cleaned_words = [
        get_cleaned_text(word)
        for sentence in text
        for word in sentence.split()
    ]

    corpus_dict = OrderedDict()
    # dict.fromkeys removes duplicates while keeping first-occurrence order.
    for i, v in enumerate(dict.fromkeys(cleaned_words)):
        corpus_dict[v] = i
    return corpus_dict

In [14]:
def get_count_vector(text, corpus):
    """Turn each document into a bag-of-words count vector.

    Args:
        text: Sequence of document strings.
        corpus: Mapping of cleaned word -> column index; every cleaned token
            of every document must be a key of it.

    Returns:
        A list with one row per document; each row has ``len(corpus)``
        integers, where position ``corpus[word]`` holds the number of times
        that word occurs in the document.
    """
    vocabulary_size = len(corpus)
    tokenized = [document.split() for document in text]

    # Resolve every token to its column index up front.
    index_rows = [
        [corpus[get_cleaned_text(token)] for token in tokens]
        for tokens in tokenized
    ]

    x_vector = []
    for indices in index_rows:
        counts = [0] * vocabulary_size
        for column in indices:
            counts[column] += 1
        x_vector.append(counts)
    return x_vector

In [18]:
import math
def get_cosine_similarity(v1, v2):
    """Compute cosine similarity of v1 to v2: (v1 dot v2) / (||v1|| * ||v2||).

    Args:
        v1: Sequence of numbers.
        v2: Sequence of numbers, expected to have the same length as v1.
            If the lengths differ, only the leading elements that both
            vectors share are used.

    Returns:
        The cosine similarity as a float. When either vector has zero
        magnitude (e.g. the count vector of an empty document) the
        similarity is undefined and 0.0 is returned instead of raising
        ``ZeroDivisionError``.
    """
    sumxx, sumxy, sumyy = 0, 0, 0
    for x, y in zip(v1, v2):
        sumxx += x * x
        sumyy += y * y
        sumxy += x * y

    denominator = math.sqrt(sumxx * sumyy)
    if denominator == 0:
        return 0.0
    return sumxy / denominator

def get_similartiy_score(x_vector, source):
    """Score every document vector against the one at index *source*.

    Args:
        x_vector: Sequence of document vectors.
        source: Index into ``x_vector`` of the reference document.

    Returns:
        A list, parallel to ``x_vector``, of the cosine similarity between
        the reference vector and each vector (the reference included).
    """
    reference = x_vector[source]
    return [
        get_cosine_similarity(reference, candidate)
        for candidate in x_vector
    ]

def get_top_n_similarity_news(similarity_score, n):
    """Return the *n* best-scoring entries after the single highest one.

    Args:
        similarity_score: Sequence of scores; the position of a score is the
            document index.
        n: Maximum number of entries to return.

    Returns:
        A list of ``(index, score)`` tuples ordered from highest to lowest
        score, with the very first (top-ranked) entry dropped. Entries with
        equal scores appear in descending index order.
    """
    import operator

    ranked = list(enumerate(similarity_score))
    # Stable ascending sort followed by a reversal keeps the tie order.
    ranked.sort(key=operator.itemgetter(1))
    ranked.reverse()

    # Skip position 0, the top-ranked entry.
    return ranked[1:n + 1]

In [19]:
def get_accuracy(similarity_list, y_class, source_news):
    """Return the fraction of listed documents that share the source's class.

    Args:
        similarity_list: Non-empty sequence of entries whose first element
            is a document index into ``y_class``.
        y_class: Sequence of class labels, one per document.
        source_news: Index of the source document.

    Returns:
        Number of entries whose label equals the source document's label,
        divided by the number of entries.
    """
    expected_label = y_class[source_news]

    matches = 0
    for entry in similarity_list:
        if y_class[entry[0]] == expected_label:
            matches += 1
    return matches / len(similarity_list)

In [20]:
if __name__ == "__main__":
    # End-to-end run: load the articles, build the corpus and the count
    # vectors, then report the mean accuracy over every document when its
    # top-10 most similar articles are compared with its own class.
    dir_name = "news"
    file_list = get_file_list(dir_name)
    file_list = [os.path.join(dir_name, file_name) for file_name in file_list]

    x_text, y_class = get_contents(file_list)

    corpus = get_corpus_dict(x_text)

    x_vector = get_count_vector(x_text, corpus)

    result = []

    # Iterate over the documents that were actually loaded instead of a
    # hard-coded count, so the script works for any number of files.
    for source_number in range(len(x_vector)):
        similarity_score = get_similartiy_score(x_vector, source_number)
        similarity_news = get_top_n_similarity_news(similarity_score, 10)
        accuracy_score = get_accuracy(similarity_news, y_class, source_number)
        result.append(accuracy_score)

    if result:
        print(sum(result) / len(result))
    else:
        print("No documents found in {0!r}".format(dir_name))

0.6950000000000001
