## Non-Negative Matrix Factorization for Exploring the European Parliament's Topic Agenda
### Assignment 2 for Machine Learning Complements class
By Alexandra de Carvalho, Luís Costa, Nuno Pedrosa

#### Importing the needed Python libraries
We will use Pandas for dataframe manipulation.

In [60]:
import os
import re
import math
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

import pandas as pd

# for modeling 
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import NMF

# for text processing
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances

[nltk_data] Downloading package stopwords to /home/alexa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/alexa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/alexa/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/alexa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Static NMF

#### Importing the data

In [3]:
# expand pandas df column display width to enable easy inspection
pd.set_option('max_colwidth', 150)

# read the textfiles to a dataframe
dir_path = 'sample' # folder path
files = [] # list to store files

for path in os.listdir(dir_path):
    if os.path.isfile(os.path.join(dir_path, path)):
        files.append(os.path.join(dir_path, path))
    else:
        subpath = os.path.join(dir_path, path)
        for path2 in os.listdir(subpath):
            if os.path.isfile(os.path.join(subpath, path2)):
                files.append(os.path.join(subpath, path2))

#### Tokenizing
To make all of the text in the speeches as comparable as possible we need to remove punctuation, capitalization, numbers, and strange characters. We also keep the term frequency on each document.

In [4]:
text_tokens = dict()
for filename in files:
    with open(filename, 'rb') as f:
        lines = f.readlines()
        text_tokens[filename] = dict()
        
        for line in lines:
            for token in re.split('\W+', str(line)):
                token = token.lower()
                if len(token) > 3 and not token.isnumeric() and not token.lower() in stopwords.words('english'):
                    text_tokens[filename][token] = text_tokens[filename].get(token, 0) + 1

#### Lemmatizing

In [5]:
wordnet_lemmatizer = WordNetLemmatizer()   # stored function to lemmatize each word
is_noun = lambda pos: pos[:2] == 'NN'

nouns = dict()
for filename, tokens in text_tokens.items():
    if filename not in nouns:
        nouns[filename] = dict()

    for (word, pos) in pos_tag(list(tokens.keys())):
        if is_noun(pos):
            nouns[filename][wordnet_lemmatizer.lemmatize(word)] = nouns[filename].get(wordnet_lemmatizer.lemmatize(word), 0) + text_tokens[filename][word]

#### Building the matrix A

Firstly, only with the term frequency weights.

In [6]:
dictvectorizer = DictVectorizer(sparse=False)
a = dictvectorizer.fit_transform(list(nouns.values()))

Building the list of all tokens (all columns of A, in order).

In [7]:
token_list = dictvectorizer.get_feature_names()



Now calculating updating to TF-IDF weights

In [8]:
for column_idx in range(len(token_list)):
    idf = math.log(len(a[:, column_idx])/len([x for x in a[:, column_idx] if x != 0]), 10)

    for element_idx in range(len(files)):
        if a[element_idx,column_idx] != 0:
            a[element_idx,column_idx] = (math.log(a[element_idx,column_idx], 10) + 1) * idf

TODO : normalize the values

#### Finding the best value for K : TC-W2V

We used the same t as the paper.

In [58]:
t = 10

In [98]:
max_model_coherence = 0
res_k = 0

for k in range(10,26):

    nmf_model = NMF(k) 
    nmf_model.fit_transform(a)

    vocabulary = [[token_list[x[1]] for x in sorted(zip(topic,range(len(topic))), reverse = True)[:t]] for topic in nmf_model.components_]
    model = Word2Vec(sentences = vocabulary, vector_size = 200, window = 5, hs = 1, negative = 0, min_count = 1)
    
    # calculating individual topic coherence scores for each topic
    model_score = []
    for topic in vocabulary:
        topic_score = []
        for w1 in topic:
            for w2 in topic:
                if w2 > w1:
                    word_score = cosine_similarity(model.wv[w2].reshape(1,-1),model.wv[w1].reshape(1,-1))[0]
                    topic_score.append(word_score[0])
        
        topic_score = sum(topic_score)/len(topic_score) # mean of each word pair similarity in the topic
        model_score.append(topic_score)

    model_coherence = sum(model_score)/len(model_score) # mean of topic coherence in the model
    print("k = ",k, ". Model coherence:", model_coherence)

    if model_coherence > max_model_coherence:
        max_model_coherence = model_coherence
        res_k = k

k =  10 . Model coherence: 0.0023823076238234835
k =  11 . Model coherence: 0.0023658915922913533
k =  12 . Model coherence: 0.0019045398576525733




k =  13 . Model coherence: -0.0012079225394389251




k =  14 . Model coherence: 0.0007569255819636587
k =  15 . Model coherence: 0.00217567439557536
k =  16 . Model coherence: -0.00023898034823004177
k =  17 . Model coherence: 0.0030269980628432986
k =  18 . Model coherence: 0.0029971630723863462
k =  19 . Model coherence: 0.0033938294912794634
k =  20 . Model coherence: 0.0028373181233635274
k =  21 . Model coherence: 0.003928204220269989
k =  22 . Model coherence: 0.002854457509624119
k =  23 . Model coherence: 0.0002944259246887758
k =  24 . Model coherence: 0.003591534689938406
k =  25 . Model coherence: 0.0030409138510003684


#### NMF

In [100]:
nmf_model = NMF(res_k) 
w = nmf_model.fit_transform(a)

#### Results

For each topic, find the t higher weights' index and find the correpondent token (same index) in the token list. These are the descriptors of each topic.

In [101]:
for i, topic in enumerate(nmf_model.components_):
    print("Topic", i, ":",[token_list[x[1]] for x in sorted(zip(topic,range(len(topic))), reverse = True)[:t]])

Topic 0 : ['chip', 'computer', 'science', 'silicon', 'technology', 'researcher', 'supercomputer', 'engineering', 'gene', 'intel']
Topic 1 : ['player', 'club', 'game', 'football', 'team', 'season', 'chelsea', 'manager', 'champion', 'match']
Topic 2 : ['party', 'election', 'tory', 'government', 'howard', 'democrat', 'policy', 'leader', 'blair', 'immigration']
Topic 3 : ['music', 'band', 'song', 'rock', 'singer', 'single', 'album', 'artist', 'record', 'concert']
Topic 4 : ['forsyth', 'frederick', 'terrorist', 'internment', 'forsythe', 'totalitarianism', 'qaeda', 'fundamentalism', 'churchill', 'liberty']
Topic 5 : ['growth', 'economy', 'market', 'price', 'rate', 'rise', 'bank', 'dollar', 'analyst', 'profit']
Topic 6 : ['angel', 'rhapsody', 'bland', 'guy', 'pulp', 'cheesy', 'brit', 'scissor', 'joke', 'listener']
Topic 7 : ['sub', 'minute', 'goal', 'ball', 'header', 'yard', 'kick', 'cech', 'cross', 'duff']
Topic 8 : ['virus', 'program', 'security', 'mail', 'software', 'computer', 'attack', '

We can then observe the documents with bigger weights for each topic. Because the files names already tag the contained speech by topic, we can infer the validity of the model built.

In [103]:
for i in range(res_k):
    print("Topic", i, ":",[files[x[1]].split('/')[-1] for x in sorted(zip(w[:,i],range(len(w[:,i]))), reverse = True)[:t]])

Topic 0 : ['tech_161.txt', 'tech_173.txt', 'tech_019.txt', 'tech_329.txt', 'tech_137.txt', 'tech_141.txt', 'tech_320.txt', 'tech_166.txt', 'tech_165.txt', 'tech_018.txt']
Topic 1 : ['football_207.txt', 'football_087.txt', 'football_088.txt', 'football_086.txt', 'football_223.txt', 'football_202.txt', 'football_085.txt', 'football_141.txt', 'football_124.txt', 'football_221.txt']
Topic 2 : ['politics_283.txt', 'politics_213.txt', 'politics_041.txt', 'politics_220.txt', 'politics_087.txt', 'politics_122.txt', 'politics_159.txt', 'politics_272.txt', 'politics_271.txt', 'politics_275.txt']
Topic 3 : ['entertainment_262.txt', 'entertainment_131.txt', 'entertainment_244.txt', 'entertainment_229.txt', 'entertainment_263.txt', 'entertainment_142.txt', 'entertainment_145.txt', 'entertainment_133.txt', 'entertainment_236.txt', 'entertainment_153.txt']
Topic 4 : ['politics_290.txt', 'politics_052.txt', 'politics_160.txt', 'politics_263.txt', 'politics_189.txt', 'politics_155.txt', 'tech_245.txt',