### Modules and libraries required

In [35]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

from IPython.core.interactiveshell import InteractiveShell
# Used to get multiple outputs per cell
InteractiveShell.ast_node_interactivity = "all"

## TASK 1
### Load Datasets

In [36]:
tweets_dataset = pd.read_csv('tweets_info_test.csv') 
user_dataset = pd.read_csv('user_info.csv') 
tweets_dataset.sort_values(by='Likes count', ascending = False)
# tweets_dataset.groupby('User').count().sort_values(['Tweet'], ascending=False)
user_dataset.sort_values(by='Followers', ascending=False)
# user_dataset.groupby('User').count().sort_values(['Impacts'], ascending=False)

Unnamed: 0,User,Date,Tweet,Binders,Permalink,Retweet count,Likes count,Tweet value
7,e,11.03.20 12:44,the coronavirus outbreak in Turkey https://t.c...,,https://www.twitter.com/user/status/1237721054...,53931,208546,12.04
2,The Spectator Index,10.03.20 22:39,BREAKING: British Health Minister tests positi...,,https://www.twitter.com/user/status/1237508289...,24043,51520,2598.05
8,Barack Obama,10.03.20 22:39,BREAKING: British Health Minister tests positi...,,https://www.twitter.com/user/status/1237508289...,24043,51520,2598.05
4,e,10.03.20 21:56,JUST IN: First coronavirus case in Turkey,,https://www.twitter.com/user/status/1237497648...,4702,20256,2598.05
0,Judd Legum,11.03.20 13:25,"TRUMP TWO WEEKS AGO: ""You have 35 people [in t...",,https://www.twitter.com/user/status/1237731484...,5503,19357,950.17
5,Lenilda Luna 🇨🇺🇵🇸🇧🇷 🚩 🆙️8️⃣0️⃣,11.03.20 13:09,Cuba anuncia vacina contra o Coronavírus \n\nh...,,https://www.twitter.com/user/status/1237727402...,3215,17909,22.2
6,Isabel Díaz Ayuso,11.03.20 13:04,🔴Importante:\n\nLa Comunidad de Madrid no ha v...,,https://www.twitter.com/user/status/1237725969...,6005,11628,306.82
3,The Spectator Index,10.03.20 22:37,BREAKING: British member of parliament tests p...,,https://www.twitter.com/user/status/1237507835...,2008,5610,2598.05
1,CAPITÁN ADOBO,11.03.20 13:19,35 grados un 11 de marzo.\n\nLos sevillanos no...,,https://www.twitter.com/user/status/1237729790...,2117,4978,100.28


Unnamed: 0,Name,User,Location,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value
0,@BarackObama,Barack Obama,"Washington, DC",13,0,0,12,5,1,1489476161,115296834,150464.37,1466166.07
1,@Cristiano,Cristiano Ronaldo,"Turim, Piemonte",1,0,1,1,0,0,83133379,83133379,111363.10,146999.29
2,@ladygaga,Lady Gaga,,2,0,0,2,0,0,162436360,81218183,109802.60,241565.72
3,@realDonaldTrump,Donald J. Trump,"Washington, DC",55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11
4,@selenagomez,Selena Gomez,Los Angeles,1,0,0,1,0,0,60528525,60528525,83860.78,92246.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14263,@gegyrigle,JewelRigle,,1,0,1,0,0,0,1,1,0.00,0.00
14264,@sadbadme,กูจะอยู่กูจะรอด,,1,0,1,0,0,0,1,1,0.00,0.00
14265,@pawat58720145,pawat,,1,0,1,0,0,0,1,1,0.00,0.00
14266,@DelsVeja,🐉Y A S U K E🤺,"Ile-de-France, France",1,0,0,0,0,1,0,0,0.00,0.00


### 1.1 Clean Data (Remove punctuation and stop-words)
#### Count words by document

In [37]:
# first create the transform and stop words set for english
vectorizer = CountVectorizer(stop_words='english')

# The Tweet column is the list of documents
vectorizer.fit(tweets_dataset['Tweet'])

# Counter words by tweet array
word_counter = vectorizer.transform(tweets_dataset['Tweet'])
print('Vector Representation')
pd.DataFrame(word_counter.toarray(), columns=vectorizer.get_feature_names())

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

Vector Representation


Unnamed: 0,039,11,15,35,acabar,ago,anuncia,atender,breaking,british,...,today,todos,trump,turkey,united,vacina,valorado,weeks,ya,zero
0,1,0,1,1,0,1,0,0,0,0,...,1,0,1,0,1,0,0,1,0,1
1,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF computation by document

In [38]:
# create the transform
vectorizer = TfidfVectorizer(stop_words='english')

# tokenize and build vocab
vectorizer.fit(tweets_dataset['Tweet'])
pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names(), columns=['IDF']).head(10)

# Tf-idf matrix by tweet
tf_idf = vectorizer.transform(tweets_dataset['Tweet'])
tf_idf = pd.DataFrame(tf_idf.toarray(), columns=vectorizer.get_feature_names())
# Add TF-IDF column to tweets_dataset
tweets_dataset = pd.concat([tweets_dataset, tf_idf], axis=1)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

Unnamed: 0,IDF
039,2.609438
11,2.609438
15,2.609438
35,2.203973
acabar,2.609438
ago,2.609438
anuncia,2.609438
atender,2.609438
breaking,1.916291
british,1.916291


## Feature set
### 1.2 Merge User and Tweets info datasets

**In this step is created the dataset both for task 1 as task 2, therefore is required to run this before [task_2.ipynb](task_2.ipynb)**

In [39]:
# Construct the feature set
custom_dataset = pd.merge(user_dataset, tweets_dataset, on=['User'], how='inner')
drop_columns = ['User','Date','Tweet','Binders','Permalink','Name','Location']
custom_dataset.drop(drop_columns, axis=1, inplace=True)
custom_dataset

Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,today,todos,trump,turkey,united,vacina,valorado,weeks,ya,zero
0,13,0,0,12,5,1,1489476161,115296834,150464.37,1466166.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,382,0,0,0,0,382,615260733,1665144,2722.29,1009497.14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,382,0,0,0,0,382,615260733,1665144,2722.29,1009497.14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,22,0,3,5,0,14,9061712,416833,960.4,21959.22,...,0.239518,0.0,0.239518,0.0,0.239518,0.0,0.0,0.239518,0.0,0.239518
4,14,0,3,3,0,8,2403686,196831,460.34,6090.03,...,0.0,0.182474,0.0,0.0,0.0,0.0,0.182474,0.0,0.0,0.0
5,2,0,0,0,0,2,90156,45078,100.28,200.56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.261657,0.0
6,1,0,0,1,0,0,8310,8310,20.18,22.2,...,0.0,0.0,0.0,0.0,0.0,0.385949,0.0,0.0,0.0,0.0
7,1,0,1,0,0,0,3645,3645,10.03,12.04,...,0.0,0.0,0.0,0.496591,0.0,0.0,0.0,0.0,0.0,0.0
8,1,0,1,0,0,0,3645,3645,10.03,12.04,...,0.0,0.0,0.0,0.444769,0.0,0.0,0.0,0.0,0.0,0.0
9,1,0,1,0,0,0,504,504,1.7,2.04,...,0.0,0.0,0.0,0.496591,0.0,0.0,0.0,0.0,0.0,0.0


### 1.3 Calculate the tweet popularity and append feature set

In [40]:
# Calculate tweet popularity
custom_dataset['Popular'] = np.where(custom_dataset['Likes count'] > 10000, True, False)
custom_dataset

Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,todos,trump,turkey,united,vacina,valorado,weeks,ya,zero,Popular
0,13,0,0,12,5,1,1489476161,115296834,150464.37,1466166.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,382,0,0,0,0,382,615260733,1665144,2722.29,1009497.14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,382,0,0,0,0,382,615260733,1665144,2722.29,1009497.14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
3,22,0,3,5,0,14,9061712,416833,960.4,21959.22,...,0.0,0.239518,0.0,0.239518,0.0,0.0,0.239518,0.0,0.239518,True
4,14,0,3,3,0,8,2403686,196831,460.34,6090.03,...,0.182474,0.0,0.0,0.0,0.0,0.182474,0.0,0.0,0.0,True
5,2,0,0,0,0,2,90156,45078,100.28,200.56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.261657,0.0,False
6,1,0,0,1,0,0,8310,8310,20.18,22.2,...,0.0,0.0,0.0,0.0,0.385949,0.0,0.0,0.0,0.0,True
7,1,0,1,0,0,0,3645,3645,10.03,12.04,...,0.0,0.0,0.496591,0.0,0.0,0.0,0.0,0.0,0.0,True
8,1,0,1,0,0,0,3645,3645,10.03,12.04,...,0.0,0.0,0.444769,0.0,0.0,0.0,0.0,0.0,0.0,True
9,1,0,1,0,0,0,504,504,1.7,2.04,...,0.0,0.0,0.496591,0.0,0.0,0.0,0.0,0.0,0.0,True


### 1.4 Divide the dataset into 80% training and 20% testing

In [41]:
# split the custom_dataset both in training and testing sets
train_feat, test_feat, train_labels, test_labels = train_test_split(custom_dataset.drop('Popular', axis= 1), custom_dataset['Popular'], test_size= 0.2)
print('Training Dataset\n')
train_feat
train_labels
print('Testing Dataset\n')
test_feat
test_labels

Training Dataset



Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,today,todos,trump,turkey,united,vacina,valorado,weeks,ya,zero
9,1,0,1,0,0,0,504,504,1.7,2.04,...,0.0,0.0,0.0,0.496591,0.0,0.0,0.0,0.0,0.0,0.0
2,382,0,0,0,0,382,615260733,1665144,2722.29,1009497.14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,13,0,0,12,5,1,1489476161,115296834,150464.37,1466166.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1,0,0,1,0,0,8310,8310,20.18,22.2,...,0.0,0.0,0.0,0.0,0.0,0.385949,0.0,0.0,0.0,0.0
4,14,0,3,3,0,8,2403686,196831,460.34,6090.03,...,0.0,0.182474,0.0,0.0,0.0,0.0,0.182474,0.0,0.0,0.0
5,2,0,0,0,0,2,90156,45078,100.28,200.56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.261657,0.0
10,1,0,1,0,0,0,504,504,1.7,2.04,...,0.0,0.0,0.0,0.444769,0.0,0.0,0.0,0.0,0.0,0.0
3,22,0,3,5,0,14,9061712,416833,960.4,21959.22,...,0.239518,0.0,0.239518,0.0,0.239518,0.0,0.0,0.239518,0.0,0.239518


9      True
2     False
0      True
6      True
4      True
5     False
10     True
3      True
Name: Popular, dtype: bool

Testing Dataset



Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,today,todos,trump,turkey,united,vacina,valorado,weeks,ya,zero
8,1,0,1,0,0,0,3645,3645,10.03,12.04,...,0.0,0.0,0.0,0.444769,0.0,0.0,0.0,0.0,0.0,0.0
1,382,0,0,0,0,382,615260733,1665144,2722.29,1009497.14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1,0,1,0,0,0,3645,3645,10.03,12.04,...,0.0,0.0,0.0,0.496591,0.0,0.0,0.0,0.0,0.0,0.0


8    True
1    True
7    True
Name: Popular, dtype: bool

### 1.5 Classifiers
#### Naive Bayes

In [42]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import MultinomialNB

#Create a Gaussian Classifier
model = MultinomialNB()

# Train the model using the training sets
model.fit(train_feat, train_labels)

score_train =  model.score(train_feat, train_labels)
print('Train set score: ', score_train)

#Predict the response for test dataset
prediction = model.predict(test_feat)

# Model Accuracy, how often is the classifier correct?
print("Prediction accuracy:",metrics.accuracy_score(test_labels, prediction))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Train set score:  0.875
Prediction accuracy: 0.6666666666666666


#### Nearest Neighbors 

In [43]:
#Import KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

#Create a KNeighbors Classifier
model = KNeighborsClassifier(n_neighbors=5)

# Train the model using the training sets
model.fit(train_feat,train_labels)

score_train =  model.score(train_feat, train_labels)
print('Train set score: ', score_train)

#Predict the response for test dataset
prediction = model.predict(test_feat)

# Model Accuracy, how often is the classifier correct?
print("Prediction accuracy:",metrics.accuracy_score(test_labels, prediction))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

Train set score:  0.75
Prediction accuracy: 1.0


## Task 2
### 2.1 Logistic Regression
#### Split custom dataset into 90% for training and 10% for testing

In [44]:
# split the custom_dataset both in training and testing sets
train_feat, test_feat, train_labels, test_labels = train_test_split(custom_dataset.drop('Popular', axis= 1), custom_dataset['Popular'], test_size= 0.1)
print('Training Dataset\n')
train_feat
train_labels
print('Testing Dataset\n')
test_feat
test_labels

Training Dataset



Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,today,todos,trump,turkey,united,vacina,valorado,weeks,ya,zero
0,13,0,0,12,5,1,1489476161,115296834,150464.37,1466166.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,22,0,3,5,0,14,9061712,416833,960.4,21959.22,...,0.239518,0.0,0.239518,0.0,0.239518,0.0,0.0,0.239518,0.0,0.239518
5,2,0,0,0,0,2,90156,45078,100.28,200.56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.261657,0.0
8,1,0,1,0,0,0,3645,3645,10.03,12.04,...,0.0,0.0,0.0,0.444769,0.0,0.0,0.0,0.0,0.0,0.0
6,1,0,0,1,0,0,8310,8310,20.18,22.2,...,0.0,0.0,0.0,0.0,0.0,0.385949,0.0,0.0,0.0,0.0
10,1,0,1,0,0,0,504,504,1.7,2.04,...,0.0,0.0,0.0,0.444769,0.0,0.0,0.0,0.0,0.0,0.0
2,382,0,0,0,0,382,615260733,1665144,2722.29,1009497.14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1,0,1,0,0,0,3645,3645,10.03,12.04,...,0.0,0.0,0.0,0.496591,0.0,0.0,0.0,0.0,0.0,0.0
9,1,0,1,0,0,0,504,504,1.7,2.04,...,0.0,0.0,0.0,0.496591,0.0,0.0,0.0,0.0,0.0,0.0


0      True
3      True
5     False
8      True
6      True
10     True
2     False
7      True
9      True
Name: Popular, dtype: bool

Testing Dataset



Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,today,todos,trump,turkey,united,vacina,valorado,weeks,ya,zero
1,382,0,0,0,0,382,615260733,1665144,2722.29,1009497.14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,14,0,3,3,0,8,2403686,196831,460.34,6090.03,...,0.0,0.182474,0.0,0.0,0.0,0.0,0.182474,0.0,0.0,0.0


1    True
4    True
Name: Popular, dtype: bool

#### Logistic Regression implementation

In [45]:
#Import Logistic Regression classifier
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(train_feat, train_labels)

score_train =  model.score(train_feat, train_labels)
print('Train set score: ', score_train)

#Predict the response for test dataset
prediction = model.predict(test_feat)

# Model Accuracy, how often is the classifier correct?
print("Prediction accuracy:",metrics.accuracy_score(test_labels, prediction))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Train set score:  0.7777777777777778
Prediction accuracy: 1.0


### 2.2 Root Mean Squared Root (RMSE)

In [46]:
#Import RMSE metric
from sklearn.metrics import mean_squared_error
# converting True to 1 and False to 0, to compute RMSE
test_labels = test_labels.astype(int)
prediction = prediction.astype(int)

rmse = mean_squared_error(test_labels, prediction)
# RMSE result
print("RMSE score:",rmse)

RMSE score: 0.0


### 2.3 No. of retweets Prediction
#### Load new dataset

In [47]:
new_tweets_dataset = pd.read_csv('new_tweets.csv') 
new_tweets_dataset

Unnamed: 0,User,Tweet
0,The White House,LIVE: Press Briefing with Coronavirus Task For...
1,isentoes2,Após governo americano dizer que China esconde...
2,Adam Schefter,Twitter CEO Jack Dorsey pledged $1 billion tow...
3,Kyle Griffin,"House Armed Services Chairman Adam Smith: ""The..."
4,Kamala Harris,Black communities disproportionately suffer fr...
...,...,...
14995,Juanita Broaddrick,Many questions emerging about Dr Fauci’s agend...
14996,Benny,Taiwan Says It Warned @WHO About Coronavirus I...
14997,The Spectator Index,SPAIN: Coronavirus death toll rises by 324 ove...
14998,Donald Trump Jr.,David Bossie: Americans uniting to fight coron...
