In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Load the full table, then keep only the rows flagged with dummy3 == 1.
df = pd.read_csv("private/taxrisk.csv")

# reset_index() returns a new frame (so `df` is left untouched) and, because
# `drop` is not set, preserves the original row labels in an "index" column.
dummy3_df = df.loc[df['dummy3'] == 1].reset_index()
dummy3_df.head()

Unnamed: 0,index,text,company,year,file_name,dummy1,dummy2,dummy3
0,6,\x92s envi- ronmental impact and its social r...,a.p. moller-maersk,2009,A.P. Moller-Maersk_2009.TXT,1,1,1
1,9,global human rights standards associated wi...,a.p. moller-maersk,2011,A.P. Moller-Maersk_2011.TXT,1,1,1
2,10,set by OECD for multinational enterprises e...,a.p. moller-maersk,2011,A.P. Moller-Maersk_2011.TXT,1,1,1
3,11,been developed with the intention of closin...,a.p. moller-maersk,2011,A.P. Moller-Maersk_2011.TXT,1,1,1
4,12,"has a Responsible Procurement Programme, hea...",a.p. moller-maersk,2011,A.P. Moller-Maersk_2011.TXT,1,1,1


In [3]:
# Single-character punctuation marks to discard alongside the English stop words.
punc = list('.,"\'?!:;()[]{}%')

# scikit-learn's built-in English stop-word set, extended with the marks above.
stop_words = text.ENGLISH_STOP_WORDS | frozenset(punc)

# Raw documents to vectorize: the `text` column of the filtered frame.
txts = dummy3_df['text'].values

# Tokens are maximal runs of ASCII letters and apostrophes; tokenize() then
# reduces each one to its English Snowball stem.
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')
stemmer = SnowballStemmer('english')

def tokenize(txt):
    """Return the stemmed tokens of ``txt``.

    The text is lower-cased, split with the module-level ``tokenizer`` and
    every resulting token is passed through the module-level ``stemmer``.
    The order of tokens is preserved.
    """
    stems = []
    for token in tokenizer.tokenize(txt.lower()):
        stems.append(stemmer.stem(token))
    return stems

# TF-IDF matrix over the stemmed tokens, restricted to a 100-term vocabulary.
# NOTE(review): stop words are matched against the *stemmed* tokens produced by
# tokenize(), while the stop-word entries themselves are unstemmed, so some
# inflected stop words may slip through -- confirm whether that is intended.
vectorizer = TfidfVectorizer(
    # Recent scikit-learn releases validate this parameter and reject a
    # frozenset, so pass a plain list (older releases accept a list as well).
    stop_words=list(stop_words),
    tokenizer=tokenize,
    # token_pattern is ignored whenever a custom tokenizer is supplied; None
    # states that explicitly and avoids scikit-learn's "will not be used"
    # warning.
    token_pattern=None,
    max_features=100,
)
X = vectorizer.fit_transform(txts)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when available, fall back on old installations, and
# keep `words` a plain list either way.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

In [4]:
def run_k_means(n_clusters=5, random_state=None, n_top_words=10,
                company='a.p. moller-maersk'):
    """Cluster the TF-IDF matrix ``X`` with k-means and print a summary.

    Prints, in order: the highest-weighted vocabulary terms of every cluster
    centroid, the number of documents assigned to each cluster, and the
    cluster counts for the documents of one company.

    Side effect: the cluster label of every document is written to
    ``dummy3_df['cluster']``, overwriting the labels of any previous run.

    Relies on the module-level ``X``, ``words`` and ``dummy3_df``.

    Parameters
    ----------
    n_clusters : int, default 5
        Number of clusters to fit.
    random_state : int or None, default None
        Seed forwarded to ``KMeans``. With ``None`` the initialisation is
        random, so repeated calls can number and size the clusters
        differently; pass an integer for reproducible output.
    n_top_words : int, default 10
        How many top terms to print per centroid.
    company : str, default 'a.p. moller-maersk'
        Value of the ``company`` column whose cluster counts are printed.

    Returns
    -------
    None
    """
    # `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0, where
    # passing it raises TypeError, so it is no longer supplied.
    kmeans = KMeans(n_clusters=n_clusters, n_init=5,
                    random_state=random_state)
    kmeans.fit(X)

    # Per centroid: vocabulary indices ordered from largest to smallest
    # weight, truncated to the first `n_top_words`.
    top_terms = kmeans.cluster_centers_.argsort()[:, ::-1][:, :n_top_words]
    for num, centroid in enumerate(top_terms):
        print(str(num) + ': ' + ', '.join(words[word] for word in centroid))

    # X was built from dummy3_df's `text` column, so predicting on it labels
    # the same rows without tokenizing and stemming every document again.
    dummy3_df['cluster'] = kmeans.predict(X)
    print("\nCluster's distribution: ")
    print(dummy3_df['cluster'].value_counts())

    print("\nOne particular company: ")
    company_rows = dummy3_df['company'] == company
    print(dummy3_df.loc[company_rows, 'cluster'].value_counts())

In [5]:
# Fit 5 clusters and print the summary; no seed is passed, so the cluster
# numbering and sizes below are specific to this execution.
run_k_means(5)

0: x, tax, s, group, risk, manag, strategi, busi, liabil, polici
1: tax, risk, law, chang, author, group, price, result, manag, legisl
2: depart, tax, group, risk, ensur, x, monitor, s, insur, manag
3: tax, committe, xa, group, risk, provis, audit, report, manag, x
4: risk, tax, control, manag, group, polici, report, intern, compani, financi

Cluster's distribution: 
1    593
3    549
4    498
0    479
2    274
Name: cluster, dtype: int64

One particular company: 
1    14
4     2
Name: cluster, dtype: int64


In [6]:
# Same call repeated: k-means is unseeded here, so this run can number and
# size the clusters differently from the previous cell (compare the outputs).
run_k_means(5)

0: tax, risk, law, chang, author, group, manag, result, price, assess
1: tax, risk, committe, manag, group, control, report, audit, polici, board
2: depart, tax, group, risk, monitor, x, insur, legal, ensur, manag
3: xa, tax, risk, group, x, oper, manag, s, report, taxat
4: tax, x, s, group, risk, manag, strategi, polici, busi, control

Cluster's distribution: 
0    828
4    575
1    554
2    286
3    150
Name: cluster, dtype: int64

One particular company: 
0    15
4     1
Name: cluster, dtype: int64
