In [15]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the
# last one, so a single cell can show several outputs. Later cells rely on
# this to show intermediate results (e.g. a fitted vectorizer's repr followed
# by a DataFrame).
InteractiveShell.ast_node_interactivity = "all"

### Main implementation
#### Load Datasets

In [16]:
# Read the tweet sample and the user table from CSV files located in the
# notebook's working directory.
tweets_dataset = pd.read_csv('tweets_info_test.csv')
user_dataset = pd.read_csv('user_info.csv')

# Preview the tweets from most to least liked. The sorted frame is only
# displayed, never assigned, so tweets_dataset keeps its original row order.
tweets_dataset.sort_values('Likes count', ascending=False)

Unnamed: 0,User,Date,Tweet,Binders,Permalink,Retweet count,Likes count,Tweet value
0,Judd Legum,11.03.20 13:25,"TRUMP TWO WEEKS AGO: ""You have 35 people [in t...",,https://www.twitter.com/user/status/1237731484...,5503,19357,950.17
1,CAPITÁN ADOBO,11.03.20 13:19,35 grados un 11 de marzo.\n\nLos sevillanos no...,,https://www.twitter.com/user/status/1237729790...,2117,4978,100.28
2,œü ùìüùìªùì≤ùì∂ùì∂ùì≤ùì≤ ùìüùì∏ùìΩùìΩùìÆùìª ‚òç,11.03.20 00:07,‡∏Ñ‡∏∑‡∏≠‡∏£‡πâ‡∏≤‡∏ô‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡πÉ‡∏ô All Seasons ‡∏°‡∏µ‡∏Ñ‡∏ô‡πÄ‡∏õ‡πá‡∏ô #COVID19 ‡πÅ...,,https://www.twitter.com/user/status/1237530520...,1285,150,0.52


#### Clean and vectorize tweets (remove punctuation and English stop words)

In [31]:
# Build a bag-of-words model of the tweets.
# CountVectorizer lowercases the text, tokenizes it with its default pattern
# (runs of two or more word characters, so punctuation is dropped) and filters
# scikit-learn's built-in English stop-word list. Only English stop words are
# removed: tweets written in other languages keep all of their tokens.
vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary from the Tweet column (one document per tweet).
vectorizer.fit(tweets_dataset['Tweet'])

# Sparse document-term matrix: one row per tweet, one column per vocabulary term.
word_counter = vectorizer.transform(tweets_dataset['Tweet'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Use whichever this installation provides
# so the cell runs on both old and current releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    count_feature_names = vectorizer.get_feature_names_out()
else:
    count_feature_names = vectorizer.get_feature_names()

# Dense view for display only: toarray() materializes every cell of the
# matrix, which is fine for a small sample like this one.
pd.DataFrame(word_counter.toarray(), columns=count_feature_names)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

Unnamed: 0,039,11,15,2‡∏Å‡∏û,35,50‡∏Å‡∏ß,acabar,ago,cases,close,...,‡∏≤‡∏ô‡πÑ‡∏õ‡πÅ‡∏•,‡∏≤‡∏¢,‡∏≤‡∏ß‡πÄ‡∏á,‡πÄ‡∏Ñ,‡πÄ‡∏à,‡πÄ‡∏≠‡∏á,‡πÅ‡∏•,‡πÇ‡∏Ñ‡∏ß,‡πÉ‡∏Ñ‡∏£‡πÑ‡∏õ‡∏Å,‡πÉ‡∏ä
0,1,0,1,0,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,1,0,0,0,0,...,1,1,1,1,1,1,2,1,1,1


In [29]:
# Build a TF-IDF model of the tweets, filtering scikit-learn's built-in English
# stop-word list (tweets in other languages keep all of their tokens).
# NOTE(review): this rebinds `vectorizer`, which the count-vectorizer cell
# above also assigns, so the object it refers to depends on which cell ran
# last. The execution counts are out of order (In [29] here, In [31] above).
# NOTE(review): the output saved with this cell shows stop_words=None and
# stop-word columns such as 'all' and 'and', so it predates this setting;
# re-run the notebook top to bottom to refresh it.
vectorizer = TfidfVectorizer(stop_words='english')

# Tokenize the Tweet column and learn the vocabulary and per-term IDF weights.
vectorizer.fit(tweets_dataset['Tweet'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Use whichever this installation provides
# so the cell runs on both old and current releases. Looked up once and reused
# for both tables below.
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = vectorizer.get_feature_names()

# Learned inverse-document-frequency weight of each vocabulary term.
pd.DataFrame(vectorizer.idf_, index=tfidf_feature_names, columns=['IDF'])

# TF-IDF matrix: one row per tweet, one column per vocabulary term.
tf_idf = vectorizer.transform(tweets_dataset['Tweet'])

# Dense view for display only: toarray() materializes every cell of the matrix.
pd.DataFrame(tf_idf.toarray(), columns=tfidf_feature_names)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

Unnamed: 0,IDF
039,1.693147
11,1.693147
15,1.693147
2‡∏Å‡∏û,1.693147
35,1.287682
...,...
‡πÄ‡∏≠‡∏á,1.693147
‡πÅ‡∏•,1.693147
‡πÇ‡∏Ñ‡∏ß,1.693147
‡πÉ‡∏Ñ‡∏£‡πÑ‡∏õ‡∏Å,1.693147


Unnamed: 0,039,11,15,2‡∏Å‡∏û,35,50‡∏Å‡∏ß,acabar,ago,all,and,...,‡∏≤‡∏ô‡πÑ‡∏õ‡πÅ‡∏•,‡∏≤‡∏¢,‡∏≤‡∏ß‡πÄ‡∏á,‡πÄ‡∏Ñ,‡πÄ‡∏à,‡πÄ‡∏≠‡∏á,‡πÅ‡∏•,‡πÇ‡∏Ñ‡∏ß,‡πÉ‡∏Ñ‡∏£‡πÑ‡∏õ‡∏Å,‡πÉ‡∏ä
0,0.13376,0.0,0.13376,0.0,0.101728,0.0,0.0,0.13376,0.0,0.13376,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.212445,0.0,0.0,0.16157,0.0,0.212445,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.138675,0.0,0.138675,0.0,0.0,0.138675,0.0,...,0.138675,0.138675,0.138675,0.138675,0.138675,0.138675,0.27735,0.138675,0.138675,0.138675
