In [1]:
import os
import pandas as pd
import re
import string

In [2]:
def read_txt_to_str(path:str)->str:
    data = ''
    try:
        my_file = open(path, "r")
        data = my_file.read().replace('\n', ' ')
        my_file.close()
        return data
    except Exception as e:
        print(f'error {path}, {e}')
        return data

In [3]:
folder_name = ['business','entertainment','politics','sport','tech']
res = {}
temp_to_df = []
for folder in folder_name:
    dir_path = os.path.join(os.getcwd(),'news_data',folder)
    res_temp = []
    for path in os.listdir(dir_path):
        if os.path.isfile(os.path.join(dir_path, path)):
            str_res = read_txt_to_str(os.path.join(dir_path,path))
            res_temp.append(str_res)
            temp_to_df.append([str_res,folder])
    res[folder] = res_temp 

error c:\Users\Andrey\Documents\exploration\temp\document_clustering_spending_tracker\document_classification_2\news_data\sport\199.txt, 'utf-8' codec can't decode byte 0xa3 in position 257: invalid start byte


In [4]:
df = pd.DataFrame(temp_to_df,columns=['text','cat'])

In [5]:
import nltk

nltk.download('wordnet') # for download to this path
nltk.download('punkt')
nltk.data.path.append('corpora')
nltk.data.path.append('tokenizers')

try:
    from nltk.stem.wordnet import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    def lemmatize(text : str) -> str:
        return lemmatizer.lemmatize(text)
except Exception as e:
    print(f'failed to load WordNetLemmatizer {e}')
    def lemmatize(text : str) -> str:
        return text

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Andrey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
try:
    from nltk.tokenize import word_tokenize
except Exception as e:
    print(f'error load nltk tokenize {e}')
    def word_tokenize(text:str)->list:
        return text.split()

In [7]:
def text_cleaning(text:str)->str:
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

def list_to_text(l:list)->str:
    return ' '.join(l)

In [8]:
def preprocess_text(text : str) -> str:
    res = []
    text = text_cleaning(text)
    list_text = word_tokenize(text)
    for word in list_text:
        res.append(lemmatize(word))
    return list_to_text(res)

In [9]:
df['text_cleaned'] = df['text'].apply(lambda x:preprocess_text(x))

In [10]:
def get_max_len(text:str)->int:
    return len(text.split())

df['max_len'] = df['text_cleaned'].apply(lambda x:get_max_len(x))

In [11]:
unique_mapping = {}
for row_text in df['text_cleaned']:
    for word in row_text.split():
        if word not in unique_mapping:
            unique_mapping[word] = 1
        else:
            unique_mapping[word]+=1

In [12]:
len(unique_mapping)

28095

In [13]:
df.head()

Unnamed: 0,text,cat,text_cleaned,max_len
0,Ad sales boost Time Warner profit Quarterly p...,business,ad sale boost time warner profit quarterly pro...,399
1,Dollar gains on Greenspan speech The dollar h...,business,dollar gain on greenspan speech the dollar ha ...,377
2,Yukos unit buyer faces loan claim The owners ...,business,yukos unit buyer face loan claim the owner of ...,260
3,High fuel prices hit BA's profits British Air...,business,high fuel price hit ba profit british airway h...,383
4,Pernod takeover talk lifts Domecq Shares in U...,business,pernod takeover talk lift domecq share in uk d...,252


In [16]:
import gensim.models
from time import time
from gensim import utils
import multiprocessing

tagged_df_train = []
count = 0
for sentence in df['text_cleaned']:
    tagged_df_train.append(gensim.models.doc2vec.TaggedDocument(sentence.split(), [count]))
    count+=1

cores = multiprocessing.cpu_count() # Count the number of cores in a computer
model_d2v = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=2, epochs=20, workers=cores-1)
model_d2v.build_vocab(tagged_df_train)

In [17]:
model_d2v.train(tagged_df_train, total_examples=model_d2v.corpus_count, epochs=model_d2v.epochs)

In [20]:
ranks = []
second_ranks = []
for doc_id in range(len(tagged_df_train)):
    inferred_vector = model_d2v.infer_vector(tagged_df_train[doc_id].words)
    sims = model_d2v.dv.most_similar([inferred_vector], topn=len(model_d2v.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    second_ranks.append(sims[1])

In [21]:
df

Unnamed: 0,text,cat,text_cleaned,max_len
0,Ad sales boost Time Warner profit Quarterly p...,business,ad sale boost time warner profit quarterly pro...,399
1,Dollar gains on Greenspan speech The dollar h...,business,dollar gain on greenspan speech the dollar ha ...,377
2,Yukos unit buyer faces loan claim The owners ...,business,yukos unit buyer face loan claim the owner of ...,260
3,High fuel prices hit BA's profits British Air...,business,high fuel price hit ba profit british airway h...,383
4,Pernod takeover talk lifts Domecq Shares in U...,business,pernod takeover talk lift domecq share in uk d...,252
...,...,...,...,...
2720,Mobile games come of age The BBC News website...,tech,mobile game come of age the bbc news website t...,909
2721,California sets fines for spyware The makers ...,tech,california set fine for spyware the maker of c...,291
2722,Web helps collect aid donations The web is he...,tech,web help collect aid donation the web is helpi...,465
2723,Mobiles rack up 20 years of use Mobile phones...,tech,mobile rack up year of use mobile phone in the...,425


In [29]:
doc_id = 0
print('{} Document ({}): «{}»\n'.format(df['cat'].iloc[doc_id], doc_id, ' '.join(tagged_df_train[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model_d2v)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s %s: «%s»\n' % (df['cat'].iloc[index],label, sims[index], ' '.join(tagged_df_train[sims[index][0]].words)))

business Document (0): «ad sale boost time warner profit quarterly profit at u medium giant timewarner jumped to £ for the three month to december from yearearlier the firm which is now one of the biggest investor in google benefited from sale of highspeed internet connection and higher advert sale timewarner said fourth quarter sale rose to from it profit were buoyed by oneoff gain which offset a profit dip at warner bros and le user for aol time warner said on friday that it now owns of searchengine google but it own internet business aol had ha mixed fortune it lost subscriber in the fourth quarter profit were lower than in the preceding three quarter however the company said aols underlying profit before exceptional item rose on the back of stronger internet advertising revenue it hope to increase subscriber by offering the online service free to timewarner internet customer and will try to sign up aols existing customer for highspeed broadband timewarner also ha to restate and res