In [5]:
import pickle
import os
import time

import numpy as np
import pandas as pd
import scipy.sparse.csr as csr
import scipy.sparse as sparse
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Let pandas render up to 200 columns before truncating wide frames.
pd.options.display.max_columns = 200

In [7]:
# Load the pickled dataset. The context manager closes the file handle even if
# unpickling raises.
# NOTE: pickle can execute arbitrary code while loading; only open files that
# come from a trusted source.
with open("raw_text_dataset.pickle", "rb") as dataset_file:
    raw_text = pickle.load(dataset_file)

In [8]:
# Inspect which container type the unpickled dataset uses.
type(raw_text)

tuple

In [9]:
# raw_text layout (a 4-tuple):
# (X_train_raw, y_train_labels, X_test_raw, y_test_labels)

# Pull the four parts out of the dataset tuple by position.
# No `global` declaration is needed: names assigned at the top level of a cell
# are already module-level, so the statement had no effect.
X_train_raw, y_train_labels, X_test_raw, y_test_labels = (
    raw_text[0],
    raw_text[1],
    raw_text[2],
    raw_text[3],
)

In [10]:
# Number of raw training documents.
len(X_train_raw)

4743

In [11]:
# Sanity check: document counts per split, and that the training labels line up
# one-to-one with the training documents.
print(
    'Number of train docs:', len(X_train_raw),
    'Number of test docs:', len(X_test_raw),
    'Labels in training set:', len(y_train_labels),
)
# Each entry is the list of labels attached to one document.
print('\nExample train labels:', y_train_labels[:15])

Number of train docs: 4743 Number of test docs: 4858 Labels in training set: 4743

Example train labels: [['cocoa', 'el-salvador', 'usa', 'uruguay'], ['usa'], ['usa'], ['usa', 'brazil'], ['grain', 'wheat', 'corn', 'barley', 'oat', 'sorghum', 'usa'], ['veg-oil', 'linseed', 'lin-oil', 'soy-oil', 'sun-oil', 'soybean', 'oilseed', 'corn', 'sunseed', 'grain', 'sorghum', 'wheat', 'argentina'], ['usa'], ['usa'], ['earn', 'usa'], ['acq', 'usa'], ['earn', 'usa'], ['earn', 'acq', 'usa'], ['earn', 'usa'], ['earn', 'usa'], ['usa']]


### Create an instance of TfidfVectorizer

In [12]:
# TF-IDF vectorizer shared by every later cell.
# The token pattern requires a token to start with an ASCII letter and be at
# least two characters long, so tokens that begin with a digit are skipped.
# (The alternative pattern '(?u)\\b\\w\\w+\\b' would accept any word character
# in the first position.)
vectorizer = TfidfVectorizer(
    stop_words='english',
    token_pattern='(?u)\\b[a-zA-Z]\\w+\\b',
    min_df=2,            # drop terms seen in fewer than 2 documents
    max_df=0.5,          # drop terms seen in more than half of the documents
    max_features=10000,  # keep at most 10,000 terms
    use_idf=True,
    norm='l2',
)


In [13]:
# Fit the vectorizer on the training documents and vectorize them in the same
# pass. (A module-level `global` declaration is a no-op, so it is not used.)
X_train_tfidf = vectorizer.fit_transform(X_train_raw)

In [14]:
# (number of training documents, number of retained features)
X_train_tfidf.shape

(4743, 10000)

In [15]:
# Display the matrix summary (storage format, dtype, number of stored elements).
X_train_tfidf

<4743x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 217725 stored elements in Compressed Sparse Row format>

In [16]:
# Feature (term) names learned from the training set, as a list.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    X_train_features = list(vectorizer.get_feature_names_out())
else:
    X_train_features = vectorizer.get_feature_names()

### Function doc2vec that takes a document/string and a vectorizer object

In [39]:

# Cache for the corpus-level CountVectorizer, so that repeated doc2vec calls do
# not refit it on the whole training set every time.
_corpus_counter_cache = {}


def _corpus_count_vectorizer(corpus):
    """Return a CountVectorizer fitted on `corpus`, refitting only when a different corpus object is passed."""
    cache_is_stale = (
        "counter" not in _corpus_counter_cache
        or _corpus_counter_cache["corpus"] is not corpus
    )
    if cache_is_stale:
        counter = CountVectorizer()
        counter.fit(corpus)
        # Keep a reference to the corpus so the identity check stays valid.
        # In-place edits to the same corpus object are NOT detected.
        _corpus_counter_cache["corpus"] = corpus
        _corpus_counter_cache["counter"] = counter
    return _corpus_counter_cache["counter"]


def doc2vec(doc_str, vectorizer):
    """Vectorize input document(s) with a fitted vectorizer and print diagnostics.

    Parameters
    ----------
    doc_str : str or iterable of str
        The document(s) to vectorize. A bare string is treated as a single
        document (it is wrapped in a list before being handed to scikit-learn).
    vectorizer : fitted vectorizer
        Object exposing ``transform`` and ``get_feature_names_out`` (or the
        older ``get_feature_names``), e.g. the fitted ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(doc_vec, doc_features, fc_data_vec)`` where ``doc_vec`` is the dense
        array produced by ``vectorizer.transform`` (one row per document),
        ``doc_features`` is the list of feature names of ``vectorizer``, and
        ``fc_data_vec`` is the dense array of token counts of the input
        document(s) over the vocabulary of a ``CountVectorizer`` fitted on the
        notebook-level ``X_train_raw``.

    Notes
    -----
    Reads the global ``X_train_raw``. The count vectorizer fitted on it is
    cached and reused across calls as long as ``X_train_raw`` is the same object.
    """
    # scikit-learn vectorizers expect an iterable of documents, not a bare string.
    docs = [doc_str] if isinstance(doc_str, str) else doc_str

    # use the vectorizer object to transform the input; returns a sparse matrix
    doc_vec = vectorizer.transform(docs)
    print("Input doc:", docs)
    print(doc_vec.shape)

    # Feature names of the fitted vectorizer. get_feature_names() was removed
    # in scikit-learn 1.2, so prefer get_feature_names_out() when available.
    if hasattr(vectorizer, "get_feature_names_out"):
        doc_features = list(vectorizer.get_feature_names_out())
    else:
        doc_features = vectorizer.get_feature_names()

    # Token counts using a vocabulary learned from the *input* document(s) only.
    input_counter = CountVectorizer()
    fc_input = input_counter.fit_transform(docs)
    print("feature names in input doc:", input_counter.vocabulary_)
    print("feature count in input doc:", fc_input.toarray())

    # Token counts of the input over the *training corpus* vocabulary. The
    # corpus vectorizer is fitted once and reused instead of refitted per call.
    corpus_counter = _corpus_count_vectorizer(X_train_raw)
    fc_data_vec = corpus_counter.transform(docs).toarray()
    print("training data feature count:", fc_data_vec)

    return doc_vec.toarray(), doc_features, fc_data_vec

In [37]:
# Ad-hoc query strings used to exercise the vectorizers and the recommender.
all_docs = [
    "The cocoa cadabra",
    "AAPL SE",
    "bullish stocks",
    "I walked through a random forest and earned a high premium",
]
doc1, doc2, doc3, doc4 = all_docs


### Call doc2vec function on each of the strings above

In [40]:
# Vectorize each query document; every document is wrapped in a one-element
# list because the vectorizer expects an iterable of documents.
(
    (vec_1, features_1, f_count_1),
    (vec_2, features_2, f_count_2),
    (vec_3, features_3, f_count_3),
    (vec_4, features_4, f_count_4),
) = [doc2vec([query_doc], vectorizer) for query_doc in (doc1, doc2, doc3, doc4)]

Input doc: ['The cocoa cadabra']
(1, 10000)
feature names in input doc: {'the': 2, 'cocoa': 1, 'cadabra': 0}
feature count in input doc: [[1 1 1]]
training data feature count: [[0 0 0 ... 0 0 0]]
Input doc: ['AAPL SE']
(1, 10000)
feature names in input doc: {'se': 1, 'aapl': 0}
feature count in input doc: [[1 1]]
training data feature count: [[0 0 0 ... 0 0 0]]
Input doc: ['bullish stocks']
(1, 10000)
feature names in input doc: {'stocks': 1, 'bullish': 0}
feature count in input doc: [[1 1]]
training data feature count: [[0 0 0 ... 0 0 0]]
Input doc: ['I walked through a random forest and earned a high premium']
(1, 10000)
feature names in input doc: {'premium': 4, 'high': 3, 'forest': 2, 'through': 6, 'and': 0, 'walked': 7, 'earned': 1, 'random': 5}
feature count in input doc: [[1 1 1 1 1 1 1 1]]
training data feature count: [[0 0 0 ... 0 0 0]]


### Using LSA for feature reduction

In [254]:
# Reduce the TF-IDF features to 200 latent components with truncated SVD (LSA).
svd = TruncatedSVD(n_components=200, algorithm='arpack', random_state=42)

# The pipeline holds only the SVD step; no Normalizer is applied after it.
lsa = make_pipeline(svd)

# Fit the SVD on the training TF-IDF matrix and project the training data
# onto the learned components.
X_train_lsa = lsa.fit_transform(X_train_tfidf)


In [601]:
# The explained variance is the same with and without the Normalizer step (40%).

# Sum of the per-component explained-variance ratios of the fitted SVD.
explained_variance = svd.explained_variance_ratio_.sum()
# int() truncates, so the printed percentage is rounded down.
print("  Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

# Apply the already-fitted transformations to the test data as well
# (transform only; nothing is refitted on the test set).

X_test_tfidf = vectorizer.transform(X_test_raw)
X_test_lsa = lsa.transform(X_test_tfidf)

  Explained variance of the SVD step: 40%


#### Function to project document vectors on the lsa model

In [582]:
def doc2vec_lsa(vec, lsa):
    """Project document vector(s) with a fitted LSA model.

    Parameters
    ----------
    vec : array-like
        Document vector(s) in the input space `lsa` was fitted on.
    lsa : object with a ``transform`` method
        The fitted LSA transformer or pipeline.

    Returns
    -------
    Whatever ``lsa.transform(vec)`` returns.
    """
    projected = lsa.transform(vec)
    return projected

In [587]:
# Project each query's TF-IDF vector into the LSA space.
lsa_vec_1, lsa_vec_2, lsa_vec_3, lsa_vec_4 = [
    doc2vec_lsa(tfidf_vec, lsa) for tfidf_vec in (vec_1, vec_2, vec_3, vec_4)
]

### Recommend function

In [615]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend(vec, X_model, X_corpus, top_n=10):
    """Return the corpus documents most similar to a query vector.

    Parameters
    ----------
    vec : array-like or sparse matrix
        Query vector in the same feature space as `X_model`. The similarity
        matrix is flattened, so the returned indices only map onto `X_corpus`
        when `vec` holds a single row.
    X_model : array-like or sparse matrix
        Vectors of the corpus documents, one row per document.
    X_corpus : sequence
        The documents themselves, indexable by the row positions of `X_model`.
    top_n : int, default 10
        How many of the best matches to report and return.

    Returns
    -------
    tuple
        ``(doc_vec, sim_top, idx_top, X_top)``: the flattened similarity
        scores, the `top_n` highest scores in descending order, the indices of
        those scores, and the matching entries of `X_corpus`.

    Raises
    ------
    ValueError
        If `top_n` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be a positive integer")

    # cosine similarity between the query and every row of the model
    sim_vec = cosine_similarity(vec, X_model)

    # flatten the ndarray
    doc_vec = sim_vec.ravel()
    print(doc_vec.shape)

    # A single sort drives both outputs: rank the indices (ascending argsort,
    # reversed) and read the scores through them, so scores and indices are
    # guaranteed to describe the same documents in the same order.
    idx_top = np.argsort(doc_vec, axis=-1)[::-1][:top_n]
    sim_top = doc_vec[idx_top]
    print("Top {} similarity scores:".format(top_n), sim_top)
    print("Indices of the top {} similarity scores:".format(top_n), idx_top, "\n")

    # documents that correspond to the best-scoring indices
    X_top = [X_corpus[i] for i in idx_top]

    return doc_vec, sim_top, idx_top, X_top
        
    
        

#### Using document vectors for doc1, doc2, doc3 and doc4 for the tfidf model

In [616]:
# Rank the training documents against each query in the TF-IDF space
# (X_train_tfidf is the sparse training matrix).
(
    (doc_vec_1, sim_top10_1, idx_top10_1, X_top10_1),
    (doc_vec_2, sim_top10_2, idx_top10_2, X_top10_2),
    (doc_vec_3, sim_top10_3, idx_top10_3, X_top10_3),
    (doc_vec_4, sim_top10_4, idx_top10_4, X_top10_4),
) = [
    recommend(query_vec, X_train_tfidf, X_train_raw)
    for query_vec in (vec_1, vec_2, vec_3, vec_4)
]

(4743,)
Top 10 similarity scores: [0.42235728 0.42040369 0.35255483 0.35085335 0.2988182  0.14068797
 0.11826212 0.1143153  0.10724859 0.05619974]
Indices of the top 10 similarity scores: [3009 2974    0  266 3917 3092 2318 4232 1720 4322] 

(4743,)
Top 10 similarity scores: [0.20399735 0.20365975 0.19772875 0.17447117 0.1672799  0.14959382
 0.13094691 0.11369229 0.10175478 0.04976086]
Indices of the top 10 similarity scores: [ 358 2555  376 2480  473  693 2539 2961  980 3326] 

(4743,)
Top 10 similarity scores: [0.30381139 0.30381139 0.29963665 0.25871352 0.25514106 0.20313366
 0.19031362 0.17583325 0.1702555  0.16728479]
Indices of the top 10 similarity scores: [4467 4417 1542 1687 3751  246 3747 1202 1190 1682] 

(4743,)
Top 10 similarity scores: [0.20513221 0.2010637  0.10954859 0.10767873 0.10503232 0.09503946
 0.09346369 0.08627354 0.08562531 0.08477512]
Indices of the top 10 similarity scores: [3157 3144 4014 1669  472  467 3638 4678  651 3254] 



#### Using document vectors for doc1, doc2, doc3 and doc4 for the lsa model

In [617]:
# Rank the training documents against each query in the LSA space
# (X_train_lsa is the projected training matrix).
(
    (lsa_doc_vec_1, lsa_sim_top10_1, lsa_idx_top10_1, lsa_X_top10_1),
    (lsa_doc_vec_2, lsa_sim_top10_2, lsa_idx_top10_2, lsa_X_top10_2),
    (lsa_doc_vec_3, lsa_sim_top10_3, lsa_idx_top10_3, lsa_X_top10_3),
    (lsa_doc_vec_4, lsa_sim_top10_4, lsa_idx_top10_4, lsa_X_top10_4),
) = [
    recommend(query_vec, X_train_lsa, X_train_raw)
    for query_vec in (lsa_vec_1, lsa_vec_2, lsa_vec_3, lsa_vec_4)
]

(4743,)
Top 10 similarity scores: [0.66299725 0.63583877 0.58200046 0.51773392 0.49465865 0.47773937
 0.46553139 0.44976251 0.44355088 0.43288301]
Indices of the top 10 similarity scores: [3009 2974  266 3833  894  220 3917 1741 3339 2867] 

(4743,)
Top 10 similarity scores: [0.86037122 0.80394306 0.74667568 0.74475129 0.74081637 0.69813517
 0.69508071 0.68335518 0.65656429 0.63659858]
Indices of the top 10 similarity scores: [ 473  693 1354  358  376 2818 2961 4254 3580  980] 

(4743,)
Top 10 similarity scores: [0.75442851 0.75425952 0.74885696 0.73217583 0.72090926 0.64055845
 0.60239534 0.50100177 0.45536005 0.43358076]
Indices of the top 10 similarity scores: [3751 1190 1687 1682 3747 1202 1542  670  663 2991] 

(4743,)
Top 10 similarity scores: [0.43183902 0.42397246 0.41647472 0.41646787 0.40471379 0.40289744
 0.39757218 0.39311349 0.39311349 0.39298254]
Indices of the top 10 similarity scores: [3418 4642 4659 3482 3489 4040 1324 4458 4424 4081] 



### CONCLUSION

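The similarity scores obtained by using the LSA model (with fewer features) are far higher than those obtained by using 
the tfidf matrix with its large number of features. Note that higher cosine scores in the reduced space do not, on their own, show that the recommendations are better: the two models also return different top-10 documents, so the retrieved texts should be inspected to confirm relevance.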