In [3]:
import pandas as pd

# Data

In [8]:
raw_data = pd.read_csv("../raw_data/articles1.csv").sample(frac = 0.02)
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 18047 to 31129
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   1000 non-null   int64  
 1   id           1000 non-null   int64  
 2   title        1000 non-null   object 
 3   publication  1000 non-null   object 
 4   author       885 non-null    object 
 5   date         1000 non-null   object 
 6   year         1000 non-null   float64
 7   month        1000 non-null   float64
 8   url          0 non-null      float64
 9   content      1000 non-null   object 
dtypes: float64(3), int64(2), object(5)
memory usage: 85.9+ KB


In [11]:
raw_content = raw_data.content
raw_content.head()

18047    During the weeks before the final day of the c...
30432    Though Marco Rubio has consistently opposed th...
31380    Sunday in an appearance on Fox News Channel’s ...
4170     HOBOKEN, N. J.  —   Federal investigators have...
42689    Atlanta  (CNN) A   video from ISIS shows milit...
Name: content, dtype: object

# TF-IDF

In [94]:
from sklearn.feature_extraction.text import TfidfVectorizer

texts = raw_content

tf_idf_vectorizer = TfidfVectorizer()

X = tf_idf_vectorizer.fit_transform(texts)
X

<1000x29720 sparse matrix of type '<class 'numpy.float64'>'
	with 284689 stored elements in Compressed Sparse Row format>

In [19]:
from sklearn.cluster import KMeans

tf_idf_model = KMeans(n_clusters=20).fit(X)

In [28]:
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(tf_idf_model.cluster_centers_, tf_idf_vectorizer)

Topic 0:
[('tesla', 0.3961301801853234), ('the', 0.1706194914544641), ('musk', 0.16157716531489766), ('lithium', 0.12966018712048247), ('vehicles', 0.12006567328859741), ('cars', 0.10888838236925824), ('car', 0.10525605936718614), ('to', 0.09638655704411865), ('and', 0.09121401543684719), ('of', 0.09036275869688043)]
Topic 1:
[('sanders', 0.2715483345973047), ('the', 0.18832245572405673), ('bernie', 0.10954821251486507), ('clinton', 0.08970331219462485), ('of', 0.08892221904962866), ('to', 0.0870939499711534), ('in', 0.07762883070834521), ('delegates', 0.07760940283506548), ('democratic', 0.06426641275042232), ('and', 0.06126980259124733)]
Topic 2:
[('the', 0.2756052861946203), ('israeli', 0.2563009065937392), ('palestinian', 0.1881284861412586), ('israel', 0.14087949571239158), ('to', 0.09820778585697122), ('palestinians', 0.09544732914420685), ('dermer', 0.08363033670263317), ('of', 0.0819145526502959), ('and', 0.0685752593324952), ('in', 0.05876164299907806)]
Topic 3:
[('the', 0.241

In [33]:
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components=20).fit(X)

print_topics(lda_model.components_, tf_idf_vectorizer)

In [36]:
lda_model.transform(X[0])

array([[0.0034004 , 0.0034004 , 0.0034004 , 0.0034004 , 0.5353657 ,
        0.0034004 , 0.0034004 , 0.0034004 , 0.0034004 , 0.0034004 ,
        0.0034004 , 0.20329059, 0.0034004 , 0.0034004 , 0.07443649,
        0.0034004 , 0.13250081, 0.0034004 , 0.0034004 , 0.0034004 ]])

In [135]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

text_list = [text_to_word_sequence(sentence) for sentence in texts]

In [148]:
from gensim.models import Word2Vec

word2vec = Word2Vec(sentences=text_list, vector_size=10)

word2vec.wv["tesla"]

array([ 0.2100628 ,  0.1537489 ,  0.87903917, -0.5523931 , -0.07621907,
       -0.07617197,  0.4168468 ,  0.9264497 , -0.6768583 , -0.920836  ],
      dtype=float32)

In [151]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence.split(" "))
        embed.append(embedded_sentence)
        
    return embed

# Embed the training and test sentences
X_embed = embedding(word2vec, texts)


# Pad the training and test embedded sentences
shape_output = max([array.size for array in X_embed])
#X = pad_sequences(X_embed, dtype='float32', padding='post', maxlen=shape_output).reshape(1000,shape_output*10)

In [None]:
X.shape

In [143]:
from scipy.sparse import csr_matrix

X = csr_matrix(X)

In [149]:
word2vec.wv.most_similar("trump")

[('clinton', 0.8887618780136108),
 ('trump’s', 0.8878629207611084),
 ('election', 0.8647207617759705),
 ('cruz', 0.8619100451469421),
 ('obama', 0.8521750569343567),
 ('ryan', 0.8442927598953247),
 ('candidate', 0.8231729865074158),
 ('unimatrixzeroone', 0.8141018152236938),
 ('’a', 0.8074794411659241),
 ('marc', 0.807149350643158)]

In [120]:
tf_idf_model = KMeans(n_clusters=20).fit(X)

for idx, topic in enumerate(tf_idf_model.cluster_centers_):
    print("Topic %d:" % (idx))
    print([word2vec.wv.most_similar(positive=[topic[i]], topn=1)
                    for i in topic.argsort()[:-10 - 1:-1]])

Topic 0:


TypeError: 'numpy.float32' object is not iterable

In [162]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

X

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


<4x9 sparse matrix of type '<class 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>