In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import nltk

In [2]:
us_df = pickle.load(open("us_df.p", "rb"))
it_df = pickle.load(open("it_df.p", "rb"))

In [31]:
english_words = set(nltk.corpus.stopwords.words('english'))
english_words = english_words.union(['mr', 'president'])
italian_words = set(nltk.corpus.stopwords.words('italian'))

In [32]:
#CV with single words, English
cv_1_en = CountVectorizer(stop_words=english_words, ngram_range=(1,1))
X_en_1 = cv_1_en.fit_transform(us_df['Content'])
word_counts_en_1 = pd.DataFrame(X_en_1.toarray(),columns=cv_1_en.get_feature_names())

In [33]:
cv_1_it = CountVectorizer(stop_words=italian_words, ngram_range=(1,1))
X_it_1 = cv_1_it.fit_transform(it_df['Content'])
word_counts_it_1 = pd.DataFrame(X_it_1.toarray(),columns=cv_1_it.get_feature_names())

In [34]:
cv_2_en = CountVectorizer(stop_words=english_words, ngram_range=(2,2))
X_en_2 = cv_2_en.fit_transform(us_df['Content'])
word_counts_en_2 = pd.DataFrame(X_en_2.toarray(),columns=cv_2_en.get_feature_names())

In [35]:
cv_2_it = CountVectorizer(stop_words=italian_words, ngram_range=(2,2))
X_it_2 = cv_2_it.fit_transform(it_df['Content'])
word_counts_it_2 = pd.DataFrame(X_it_2.toarray(),columns=cv_2_it.get_feature_names())

In [61]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
lsa = TruncatedSVD(5)
doc_topic = lsa.fit_transform(en_1)
lsa.explained_variance_ratio_

array([0.58197197, 0.03302128, 0.02979686, 0.02717095, 0.02489888])

In [62]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """
    Displays the top n terms in each topic
    """
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix + 1)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [70]:
display_topics(lsa, cv_2_en.get_feature_names(), 5)


Topic  1
almost deal, antibody test, amazing great, atkinson job, back announcement

Topic  2
asymptomatic spreading, away something, asymptomatic monitoring, arms yes, back move

Topic  3
asymptomatic spreading, alone eighteen, back gave, approved within, asked want

Topic  4
asymptomatic monitoring, almost deal, asking administration, almost miracle, asymptomatic piece

Topic  5
almost million, asking administration, away never, amounts blue, back quickly


In [66]:
nmf_model = NMF(2)
doc_topic = nmf_model.fit_transform(en_1)

In [69]:
display_topics(nmf_model, cv_1_en.get_feature_names(), 20)


Topic  1
going, know, people, think, want, said, like, lot, get, really, one, go, new, would, well, great, states, right, country, say

Topic  2
thank, people, going, want, get, us, testing, much, american, vice, working, know, states, think, also, need, well, governor, country, today
