In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import nltk
import re
import spacy
import gensim

import time
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

### preprocessed from v0.2

In [2]:
df = pd.read_csv('preprocessed_df.csv')
df[:5]

Unnamed: 0,article,category,fileid
0,bahia cocoa review showers continued throughou...,cocoa,training/1
1,computer terminal systems lt cpml completes sa...,acq,training/10
2,n z trading bank deposit growth rises slightly...,money-supply,training/100
3,national amusements again ups viacom lt via bi...,acq,training/1000
4,rogers lt rog sees st qtr net up significantly...,earn,training/10000


### split classes & train/test

In [3]:
from sklearn.model_selection import train_test_split

df_earn = df.loc[df['category'] == 'earn']
df_acq = df.loc[df['category'] == 'acq']
df_crude = df.loc[df['category'] == 'crude']
df_trade = df.loc[df['category'] == 'trade']
df_money = df.loc[df['category'] == 'money-fx']

#set data/target for classes
X_train_earn, X_test_earn, y_train_earn, y_test_earn = train_test_split(
    df_earn['article'], df_earn['category'], test_size=0.25)

X_train_acq, X_test_acq, y_train_acq, y_test_acq = train_test_split(
    df_acq['article'], df_acq['category'], test_size=0.25)

X_train_crude, X_test_crude, y_train_crude, y_test_crude = train_test_split(
    df_crude['article'], df_crude['category'], test_size=0.25)

X_train_trade, X_test_trade, y_train_trade, y_test_trade = train_test_split(
    df_trade['article'], df_trade['category'], test_size=0.25)

X_train_money, X_test_money, y_train_money, y_test_money = train_test_split(
    df_money['article'], df_money['category'], test_size=0.25)

#earn vs acq
X_train = pd.concat([X_train_earn, X_train_acq], 0).reset_index(drop=True)
X_test = pd.concat([X_test_earn, X_test_acq], 0).reset_index(drop=True)
y_train = pd.concat([y_train_earn, y_train_acq], 0).reset_index(drop=True)
y_test = pd.concat([y_test_earn, y_test_acq], 0).reset_index(drop=True)

#crude vs trade vs money
X_train_ = pd.concat([X_train_crude, X_train_trade, X_train_money],
                     0).reset_index(drop=True)
X_test_ = pd.concat([X_test_crude, X_test_trade, X_test_money],
                     0).reset_index(drop=True)
y_train_ = pd.concat([y_train_crude, y_train_trade, y_train_money],
                     0).reset_index(drop=True)
y_test_ = pd.concat([y_test_crude, y_test_trade, y_test_money],
                     0).reset_index(drop=True)

In [None]:
#train_nlp.to_csv('train_nlp.csv', index=False)

## train own word2vec

In [4]:
train_nlp = pd.read_csv('train_nlp_subset.csv')
nlp = spacy.load('en')

In [26]:
bag2 = []
for name in train_nlp.columns:
    bag2.append(name)

#take out category, fileid, and article     
bag2 = bag2[3:]
print(len(bag2))

5515


In [14]:
from gensim.models import word2vec

model2 = word2vec.Word2Vec(bag2,
                           workers=2,
                           min_count=10,
                           window=6,
                           sg=0,
                           sample=1e-3,
                           size=300,
                           hs=1)

In [18]:
X_df = pd.DataFrame(columns=bag2)
vocab = model2.wv.vocab.keys()

for i, word in enumerate(X_df)
X_df[:3]
#for word in bag2:
#    if word not in X_df.columns and word in vocab:
#            X_df[word] = ''
#            X_df.loc[i, word] = [model2.wv[word]]

Unnamed: 0,article,category,fileid,quarter,roger,say,significantly,earning,dlrs,year,...,reportedly,apollo,pneumo,mailing,fw,brik,tc,max,cental,traf


In [None]:
for i, article in enumerate(train_nlp_subset['article']):
    #parse article
    article_nlp = nlp(article)
    #filter stopwords, punctuation
    article = [token.lemma_ for token in article_nlp 
                if not token.is_punct and not token.is_stop]
    #bag 20 most common words
    bag = ([item[0] for item in Counter(article).most_common(20)])
    #add new words as features and populate rows with word vector
    #df_temp = pd.DataFrame()
    for word in bag:
        if word not in train_nlp.columns and word in model.wv.vocab.keys():
            train_nlp[word] = ''
            train_nlp.loc[i, word] = np.mean([model.wv[word]])
train_nlp.replace('', np.mean(np.zeros(300,)), inplace=True)
print(time.time() - start_time)

In [15]:
X = train_nlp.loc[:, ~train_nlp.columns.isin(['article', 'category', 'fileid'])]
y = train_nlp['category']
print(X.shape, y.shape)

(6215, 5515) (6215,)


In [None]:
from sklearn.preprocessing import normalize, Normalizer
from sklearn.decomposition import TruncatedSVD
X_norm = normalize(X)
X_svd = TruncatedSVD(2).fit_transform(X_norm)

import umap
reducer = umap.UMAP()
X_umap = reducer.fit_transform(X_norm)

n_clust = 2

In [None]:
y_gt = pd.Categorical(y).codes

plt.figure(figsize=(12,6))
plt.subplot(121)
plt.scatter(X_svd[:,0], X_svd[:,1], c=y_gt)
plt.title('ground truth svd')

plt.subplot(122)
plt.scatter(X_umap[:,0], X_umap[:,1], c=y_gt)
plt.title('ground truth umap')
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

kmeans = KMeans(n_clusters=n_clust,
                init='k-means++',
                n_init=10)

y_pred_svd = kmeans.fit_predict(X_svd)
y_pred_umap = kmeans.fit_predict(X_umap)

plt.figure(figsize=(12,6))
plt.suptitle('k-means')
plt.subplot(121)
plt.scatter(X_svd[:, 0], X_svd[:, 1], c=y_pred_svd)
plt.title('svd, ari={:0.5}'.format(
    adjusted_rand_score(y, y_pred_svd)))


plt.subplot(122)
plt.scatter(X_umap[:, 0], X_umap[:, 1], c=y_pred_umap)
plt.title('umap, ari={:0.5}'.format(
    adjusted_rand_score(y, y_pred_umap)))

In [None]:
from sklearn.model_selection import cross_val_score
import xgboost as xgb
xgbc = xgb.XGBClassifier().fit(X, y)

print('vanilla xgboost classifier')
print('train 10 cv mean: {}'.format(cross_val_score(xgbc, X, y, cv=10)))

In [None]:
from sklearn.cluster import SpectralClustering
from sklearn.metrics import adjusted_rand_score

#rbf
sc_rbf = SpectralClustering(n_clusters=n_clust,
                            affinity='rbf').fit(X_svd)

predict_rbf_train = sc_rbf.fit_predict(X_svd)
#plots
plt.figure(figsize=(6,6))
#plt.suptitle('affinity=rbf')
#plt.subplot(121)
plt.scatter(X_svd[:,0], X_svd[:,1], c=predict_rbf_train)
plt.title('train rbf ari: {}'.format(adjusted_rand_score(
    y, predict_rbf_train)))

In [None]:
bag2 = []
for name in train_nlp.columns:
    bag2.append(name)

In [None]:
model2 = 

In [None]:
#error stop here
#zeros = np.zeros([300,], dtype=list)
start_time = time.time()

#train_nlp = pd.concat([df_earn, df_acq], 0).reset_index(drop=True)

for i, article in enumerate(train_nlp_subset['article']):
    #parse article
    article_nlp = nlp(article)
    #filter stopwords, punctuation
    article = [token.lemma_ for token in article_nlp 
                if not token.is_punct and not token.is_stop]
    #bag 20 most common words
    bag = ([item[0] for item in Counter(article).most_common(20)])
    #add new words as features and populate rows with word vector
    #df_temp = pd.DataFrame()
    for word in bag:
        if word not in train_nlp.columns and word in model.wv.vocab.keys():
            train_nlp[word] = ''
            train_nlp.loc[i, word] = np.mean([model.wv[word]])
train_nlp.replace('', np.mean(np.zeros(300,)), inplace=True)
print(time.time() - start_time)

### spacey parsing, word2vec token vectorization

In [None]:
train_nlp = pd.read_csv('train_nlp_subset.csv')
#train_nlp.head()

In [None]:
from collections import Counter
#start_time = time.time()
#parser
nlp = spacy.load('en')

#load pre trained google model
#model = gensim.models.KeyedVectors.load_word2vec_format(
#    './GoogleNews-vectors-negative300.bin', binary=True)

#print(time.time() - start_time)