In [46]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from IPython.core.interactiveshell import InteractiveShell
# Used to get multiple outputs per cell
InteractiveShell.ast_node_interactivity = "all"

### Main implementation
#### Load Datasets

In [83]:
tweets_dataset = pd.read_csv('tweets_info_test.csv') 
user_dataset = pd.read_csv('user_info.csv') 
tweets_dataset.sort_values(by='Likes count', ascending = False)
# user_dataset

Unnamed: 0,User,Date,Tweet,Binders,Permalink,Retweet count,Likes count,Tweet value
0,Judd Legum,11.03.20 13:25,"TRUMP TWO WEEKS AGO: ""You have 15 people [in t...",,https://www.twitter.com/user/status/1237731484...,5503,19357,950.17
1,CAPITÁN ADOBO,11.03.20 13:19,35 grados un 11 de marzo.\n\nLos sevillanos no...,,https://www.twitter.com/user/status/1237729790...,2117,4978,100.28


#### Clean Tweets (Remove punctuation and stop-words)

In [93]:
# first create the transform and stop words set for english
vectorizer = CountVectorizer(stop_words='english')

# The Tweet column is the list of documents
vectorizer.fit(tweets_dataset['Tweet'])

# encode document
vector = vectorizer.transform(tweets_dataset['Tweet'])
vector.toarray()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

array([[1, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 1, 1, 1, 1, 1, 0, 1],
       [0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
        1, 1, 1, 0, 0, 0, 0, 0, 1, 0]], dtype=int64)

In [103]:
# create the transform
vectorizer = TfidfVectorizer(stop_words='english')

# tokenize and build vocab
vectorizer.fit(tweets_dataset['Tweet'])
sorted(vectorizer.vocabulary_.items(), key = lambda k: k[1])
vectorizer.idf_

# encode document
vectorizer.transform(tweets_dataset['Tweet']).toarray()

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

[('039', 0),
 ('11', 1),
 ('15', 2),
 ('35', 3),
 ('acabar', 4),
 ('ago', 5),
 ('cases', 6),
 ('close', 7),
 ('confirmed', 8),
 ('coronavirus', 9),
 ('couple', 10),
 ('days', 11),
 ('el', 12),
 ('going', 13),
 ('grados', 14),
 ('los', 15),
 ('marzo', 16),
 ('más', 17),
 ('nuestra', 18),
 ('para', 19),
 ('parte', 20),
 ('people', 21),
 ('podemos', 22),
 ('poner', 23),
 ('sevillanos', 24),
 ('states', 25),
 ('today', 26),
 ('trump', 27),
 ('united', 28),
 ('weeks', 29),
 ('ya', 30),
 ('zero', 31)]

array([1.40546511, 1.40546511, 1.40546511, 1.40546511, 1.40546511,
       1.40546511, 1.40546511, 1.40546511, 1.40546511, 1.        ,
       1.40546511, 1.40546511, 1.40546511, 1.40546511, 1.40546511,
       1.40546511, 1.40546511, 1.40546511, 1.40546511, 1.40546511,
       1.40546511, 1.40546511, 1.40546511, 1.40546511, 1.40546511,
       1.40546511, 1.40546511, 1.40546511, 1.40546511, 1.40546511,
       1.40546511, 1.40546511])

array([[0.21808824, 0.        , 0.43617648, 0.        , 0.        ,
        0.21808824, 0.21808824, 0.21808824, 0.21808824, 0.31034316,
        0.21808824, 0.21808824, 0.        , 0.21808824, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.21808824, 0.        , 0.        , 0.        ,
        0.21808824, 0.21808824, 0.21808824, 0.21808824, 0.21808824,
        0.        , 0.21808824],
       [0.        , 0.25394911, 0.        , 0.25394911, 0.25394911,
        0.        , 0.        , 0.        , 0.        , 0.18068688,
        0.        , 0.        , 0.25394911, 0.        , 0.25394911,
        0.25394911, 0.25394911, 0.25394911, 0.25394911, 0.25394911,
        0.25394911, 0.        , 0.25394911, 0.25394911, 0.25394911,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.25394911, 0.        ]])