In [37]:
%config IPCompleter.greedy=True

### Modules and libraries required

In [38]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

from IPython.core.interactiveshell import InteractiveShell
# Used to get multiple outputs per cell
InteractiveShell.ast_node_interactivity = "all"

## TASK 1
### Load Datasets

In [39]:
tweets_dataset = pd.read_csv('tweets_info.csv') 
user_dataset = pd.read_csv('user_info.csv')
# Tweets information
tweets_dataset.sort_values(by='Likes count', ascending = False).head()
tweets_dataset.describe()
# User information
user_dataset.sort_values(by='Followers', ascending=False).head()
user_dataset.describe()

Unnamed: 0,User,Date,Tweet,Binders,Permalink,Retweet count,Likes count,Tweet value
2168,Elon Musk,06.03.20 20:42,The coronavirus panic is dumb,,https://www.twitter.com/user/status/1236029449...,350555,1729596,47385.34
1764,Bernie Sanders,08.03.20 20:20,"Once a vaccine for coronavirus is developed, i...",,https://www.twitter.com/user/status/1236748536...,125019,841283,18291.78
544,J Hooch,10.03.20 17:25,Coronavirus is everywhere. BOOK THAT FLIGHT. T...,,https://www.twitter.com/user/status/1237429345...,139366,727945,14.75
1833,Ally Carter,08.03.20 14:53,I see a lot of people being like “I would surv...,,https://www.twitter.com/user/status/1236666460...,157052,719464,118.95
1284,Taylor ☾,09.03.20 19:22,Coronavirus has crossed the line for Italians ...,,https://www.twitter.com/user/status/1237096480...,206430,688358,30.02


Unnamed: 0,Binders,Retweet count,Likes count,Tweet value
count,0.0,5531.0,5531.0,5531.0
mean,,6608.259266,20200.66,3091.604668
std,,11996.116601,51820.59,11511.663314
min,,964.0,90.0,0.0
25%,,2074.0,4073.0,37.565
50%,,3249.0,8308.0,296.29
75%,,6433.0,17761.5,1347.74
max,,350555.0,1729596.0,149065.45


Unnamed: 0,Name,User,Location,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value
0,@BarackObama,Barack Obama,"Washington, DC",13,0,0,12,5,1,1489476161,115296834,150464.37,1466166.07
1,@Cristiano,Cristiano Ronaldo,"Turim, Piemonte",1,0,1,1,0,0,83133379,83133379,111363.1,146999.29
2,@ladygaga,Lady Gaga,,2,0,0,2,0,0,162436360,81218183,109802.6,241565.72
3,@realDonaldTrump,Donald J. Trump,"Washington, DC",55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11
4,@selenagomez,Selena Gomez,Los Angeles,1,0,0,1,0,0,60528525,60528525,83860.78,92246.86


Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value
count,14268.0,14268.0,14268.0,14268.0,14268.0,14268.0,14268.0,14268.0,14268.0,14268.0
mean,2.490258,0.0,0.855341,0.92739,0.062237,0.840903,4682540.0,410663.5,719.216365,8272.347
std,7.573669,0.0,3.059919,4.333562,0.702195,4.392951,108132800.0,2595982.0,3761.433023,170700.7
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,4220.25,4047.75,11.43,12.63
50%,1.0,0.0,0.0,0.0,0.0,0.0,27253.0,23240.0,58.795,74.525
75%,2.0,0.0,1.0,1.0,0.0,1.0,207333.5,122058.0,281.82,514.215
max,382.0,0.0,120.0,186.0,51.0,382.0,8960405000.0,115296800.0,150464.37,14461000.0


### 1.1 Clean Data (Remove punctuation and stop-words)
#### Count words by document

In [40]:
# first create the transform and stop words set for english
vectorizer = CountVectorizer(stop_words='english', strip_accents='unicode', dtype=np.float32, min_df=2)

# The Tweet column is the list of documents
word_counter = vectorizer.fit_transform(tweets_dataset['Tweet'])
# vectorizer.get_feature_names()

print('Word Counter Array Representation: ', word_counter.shape)
pd.DataFrame(word_counter.toarray(), columns=vectorizer.get_feature_names()).head()

Word Counter Array Representation:  (5531, 8828)


Unnamed: 0,00,000,004,01,02,029,03,0sy7pwigtb,10,100,...,ไมก,ไมม,ไมเอาร,ไวร,ไหน,웨이션브이_당장_입국시켜,트와이스,新型コロナウイルス,新型肺炎,速報
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### TF-IDF computation by document

In [41]:
# create the transform
# vectorizer = TfidfVectorizer(stop_words='english')
vectorizer = TfidfVectorizer(strip_accents='unicode',stop_words= 'english',dtype=np.float32, min_df=2)

# tokenize and build vocab
tf_idf = vectorizer.fit_transform(tweets_dataset['Tweet'])

# Tf-idf matrix by tweet
tf_idf = pd.DataFrame(tf_idf.toarray(), columns=vectorizer.get_feature_names())
# Add TF-IDF column to tweets_dataset
tweets_tf_idf = pd.concat([tweets_dataset, tf_idf], axis=1)
print("Tweet dataset and TF-IDF matrix: ",tweets_tf_idf.shape)
tweets_tf_idf.head()

Tweet dataset and TF-IDF matrix:  (5531, 8836)


Unnamed: 0,User,Date,Tweet,Binders,Permalink,Retweet count,Likes count,Tweet value,00,000,...,ไมก,ไมม,ไมเอาร,ไวร,ไหน,웨이션브이_당장_입국시켜,트와이스,新型コロナウイルス,新型肺炎,速報
0,Judd Legum,11.03.20 13:25,"TRUMP TWO WEEKS AGO: ""You have 15 people [in t...",,https://www.twitter.com/user/status/1237731484...,5503,19357,950.17,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CAPITÁN ADOBO,11.03.20 13:19,35 grados un 11 de marzo.\n\nLos sevillanos no...,,https://www.twitter.com/user/status/1237729790...,2117,4978,100.28,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Diamond and Silk®,11.03.20 13:18,"It looks like the Democrats, along with the le...",,https://www.twitter.com/user/status/1237729627...,1603,6231,2729.58,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Sebastián Campos C.,11.03.20 13:17,"Gente, con todo esto del coronavirus, hay pasa...",,https://www.twitter.com/user/status/1237729292...,1510,5002,12.47,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,David Cay Johnston,11.03.20 13:16,"Brooklyn teacher, returning from Italy and fee...",,https://www.twitter.com/user/status/1237729172...,4924,6380,435.53,0.0,0.156879,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature set
### 1.2 Merge User and Tweets info datasets

**In this step is created the dataset both for task 1 as task 2**

In [42]:
# Construct the feature set
user_dataset.drop_duplicates('User', inplace=True)

custom_dataset = pd.merge(user_dataset, tweets_tf_idf, on=['User'], how='inner')
drop_columns = ['User','Date','Tweet','Binders','Permalink','Name','Location','Total tweets','Retweets','Images','Links','Chats','Text tweets','User value','Tweets value','Tweet value']
custom_dataset.drop(drop_columns, axis=1, inplace=True)

### 1.3 Calculate the tweet popularity and append feature set

In [43]:
# Calculate tweet popularity
custom_dataset['Popular'] = np.where(custom_dataset['Likes count'] > 10000, True, False)
print("Custom Dataset matrix: ",custom_dataset.shape)
custom_dataset.head()

Custom Dataset matrix:  (5428, 8833)


Unnamed: 0,Impacts,Followers,Retweet count,Likes count,00,000,004,01,02,029,...,ไมม,ไมเอาร,ไวร,ไหน,웨이션브이_당장_입국시켜,트와이스,新型コロナウイルス,新型肺炎,速報,Popular
0,1489476161,115296834,131697,614387,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,4100089350,76238523,13104,68358,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,4100089350,76238523,81356,289443,0.0,0.449514,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,4100089350,76238523,14595,73526,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4,4100089350,76238523,25424,90056,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True


### 1.4 Divide the dataset into 80% training and 20% testing

In [44]:
# split the custom_dataset both in training and testing sets
train_feat, test_feat, train_labels, test_labels = train_test_split(custom_dataset.drop('Popular', axis= 1), custom_dataset['Popular'], test_size= 0.2)
print('Training Dataset', train_feat.shape)
train_feat.head()
train_labels.head()
print('\nTesting Dataset: ', test_feat.shape)
test_feat.head()
test_labels.head()

Training Dataset (4342, 8832)


Unnamed: 0,Impacts,Followers,Retweet count,Likes count,00,000,004,01,02,029,...,ไมก,ไมม,ไมเอาร,ไวร,ไหน,웨이션브이_당장_입국시켜,트와이스,新型コロナウイルス,新型肺炎,速報
4544,4850,4850,1808,5546,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1614,7151769,453388,1656,8821,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2928,186493,93380,2533,10212,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1125,942067,942067,13803,74958,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5351,94,94,7125,15940,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


4544    False
1614    False
2928     True
1125     True
5351     True
Name: Popular, dtype: bool


Testing Dataset:  (1086, 8832)


Unnamed: 0,Impacts,Followers,Retweet count,Likes count,00,000,004,01,02,029,...,ไมก,ไมม,ไมเอาร,ไวร,ไหน,웨이션브이_당장_입국시켜,트와이스,新型コロナウイルス,新型肺炎,速報
1636,15283182,439048,6412,14282,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2556,631066,158704,2546,4418,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2502,515435,173893,2147,18328,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2139,8380294,282080,3071,9825,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4336,8158,8158,5172,10044,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


1636     True
2556    False
2502     True
2139    False
4336     True
Name: Popular, dtype: bool

### 1.5 Classifiers
#### Naive Bayes

In [45]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import MultinomialNB

#Create a Gaussian Classifier and train it
bayes_model = MultinomialNB().fit(train_feat, train_labels)
print('Train set score: ', bayes_model.score(train_feat, train_labels))

#Predict the response for test dataset
prediction = bayes_model.predict(test_feat)

# Model Accuracy, how often is the classifier correct?
print("Prediction accuracy:",metrics.accuracy_score(test_labels, prediction))

Train set score:  0.4776600644864118
Prediction accuracy: 0.4604051565377532


#### Nearest Neighbors 

In [46]:
#Import KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

def compute_accuracy(model):
    print(f'Train set score for k= {model.n_neighbors}: ',model.score(train_feat, train_labels))

    #Predict the response for test dataset
    prediction = model.predict(test_feat)

    # Model Accuracy, how often is the classifier correct?
    print(f"Prediction accuracy k= {model.n_neighbors}:",metrics.accuracy_score(test_labels, prediction))
    
#Create a KNeighbors Classifier and Train the model using the training sets
compute_accuracy(KNeighborsClassifier(n_neighbors=5).fit(train_feat,train_labels))
#Create a KNeighbors Classifier and Train the model using the training sets
compute_accuracy(KNeighborsClassifier(n_neighbors=15).fit(train_feat,train_labels))
#Create a KNeighbors Classifier and Train the model using the training sets
compute_accuracy(KNeighborsClassifier(n_neighbors=45).fit(train_feat,train_labels))

Train set score for k= 5:  0.9078765545831414
Prediction accuracy k= 5: 0.861878453038674
Train set score for k= 15:  0.839705204974666
Prediction accuracy k= 15: 0.8121546961325967
Train set score for k= 45:  0.7828189774297559
Prediction accuracy k= 45: 0.7799263351749539


## Question: Compute the accuracy for the classifiers and identify the one that performs better
🏁 **Answer:** <span style="background-color:yellow">The Nearest Neighbors gives in general a better accuracy than Naive Bayes method. Although when the neighborhood size increase the accuracy prediction decrease.</span>

## Task 2
### 2.1 Logistic Regression
#### Split custom dataset into 90% for training and 10% for testing

In [47]:
# split the custom_dataset both in training and testing sets
train_feat, test_feat, train_labels, test_labels = train_test_split(custom_dataset.drop('Popular', axis= 1), custom_dataset['Popular'], test_size= 0.1)
print('Training Dataset', train_feat.shape)
train_feat.head()
train_labels.head()
print('\nTesting Dataset: ', test_feat.shape)
test_feat.head()
test_labels.head()

Training Dataset (4885, 8832)


Unnamed: 0,Impacts,Followers,Retweet count,Likes count,00,000,004,01,02,029,...,ไมก,ไมม,ไมเอาร,ไวร,ไหน,웨이션브이_당장_입국시켜,트와이스,新型コロナウイルス,新型肺炎,速報
2981,83076,83076,1136,3825,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3335,1138465,50723,5767,6999,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
955,162902380,1277956,5971,13335,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4972,1151,1151,1888,33820,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66,6136412251,46081874,1987,2204,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


2981    False
3335    False
955      True
4972     True
66      False
Name: Popular, dtype: bool


Testing Dataset:  (543, 8832)


Unnamed: 0,Impacts,Followers,Retweet count,Likes count,00,000,004,01,02,029,...,ไมก,ไมม,ไมเอาร,ไวร,ไหน,웨이션브이_당장_입국시켜,트와이스,新型コロナウイルス,新型肺炎,速報
4266,9338,9338,1120,710,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3033,2211680,78246,3565,11110,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4320,8410,8410,6879,67079,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2055,1169580,292395,12689,816,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1862,42510984,372490,8949,14347,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


4266    False
3033     True
4320     True
2055    False
1862     True
Name: Popular, dtype: bool

#### Logistic Regression implementation

In [48]:
#Import Logistic Regression classifier
from sklearn.linear_model import LogisticRegression

model = LogisticRegression().fit(train_feat, train_labels)
print('Train set score: ',model.score(train_feat, train_labels))

#Predict the response for test dataset
prediction = model.predict(test_feat)

# Model Accuracy, how often is the classifier correct?
print("Prediction accuracy: ", metrics.accuracy_score(test_labels, prediction))


Train set score:  0.4691914022517912
Prediction accuracy:  0.46408839779005523


🏁 **Answer:** <span style="background-color:yellow">Logistic Regression Classifier.</span>

### 2.2 Root Mean Squared Root (RMSE)

In [49]:
#Import RMSE metric
from sklearn.metrics import mean_squared_error
# converting True to 1 and False to 0, to compute RMSE
test_labels = test_labels.astype(int)
prediction = prediction.astype(int)

rmse = mean_squared_error(test_labels, prediction)
# RMSE result
print("RMSE score:",rmse)

RMSE score: 0.5359116022099447


🏁 **Answer:** <span style="background-color:yellow">RMSE.</span>

### 2.3 No. of retweets Prediction

#### Split custom dataset in labels(Retweet count) and features

In [50]:
# split the custom_dataset both in training and testing sets
new_custom_dataset = custom_dataset.drop(['Impacts','Followers','Likes count','Popular'], axis=1)
new_train_feat, x, new_train_labels,y=train_test_split(new_custom_dataset.drop('Retweet count', axis= 1), new_custom_dataset['Retweet count'], test_size = 1)
# new_train_feat, new_train_labels= new_custom_dataset.drop('Retweet count', axis= 1), new_custom_dataset['Retweet count']
print('Training Dataset', new_train_feat.shape)
new_train_feat.head()
new_train_labels.head()

Training Dataset (5427, 8828)


Unnamed: 0,00,000,004,01,02,029,03,0sy7pwigtb,10,100,...,ไมก,ไมม,ไมเอาร,ไวร,ไหน,웨이션브이_당장_입국시켜,트와이스,新型コロナウイルス,新型肺炎,速報
4115,0.0,0.0,0.0,0.0,0.0,0.0,0.189559,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2060,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
422,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


4115    4327
877     1713
2060    1184
422     2475
484     4507
Name: Retweet count, dtype: int64

In [51]:
#Import Logistic Regression classifier
from sklearn.linear_model import LogisticRegression
new_model = LogisticRegression(max_iter=10000).fit(new_train_feat, new_train_labels)
score_train =  new_model.score(new_train_feat, new_train_labels)
print('Train set score: ', score_train)

Train set score:  0.19918923899023402


#### Load new dataset

In [52]:
new_tweets_dataset = pd.read_csv('new_tweets.csv') 
new_tweets_dataset

Unnamed: 0,User,Tweet
0,The White House,LIVE: Press Briefing with Coronavirus Task For...
1,isentoes2,Após governo americano dizer que China esconde...
2,Adam Schefter,Twitter CEO Jack Dorsey pledged $1 billion tow...
3,Kyle Griffin,"House Armed Services Chairman Adam Smith: ""The..."
4,Kamala Harris,Black communities disproportionately suffer fr...
...,...,...
14995,Juanita Broaddrick,Many questions emerging about Dr Fauci’s agend...
14996,Benny,Taiwan Says It Warned @WHO About Coronavirus I...
14997,The Spectator Index,SPAIN: Coronavirus death toll rises by 324 ove...
14998,Donald Trump Jr.,David Bossie: Americans uniting to fight coron...


In [53]:
mess = new_tweets_dataset['Tweet']
output = new_model.predict(vectorizer.transform(mess))
prediction_rt = []
for i ,m in enumerate(mess):
    prediction_rt.append(output[i])

new_tweets_dataset['Prediction'] = prediction_rt
new_tweets_dataset.sort_values('Prediction', ascending=False)

Unnamed: 0,User,Tweet,Prediction
9385,ABS-CBN News,JUST IN: DOH announces 71 new cases of #COVID1...,5252
3777,ABS-CBN News,JUST IN: DOH announces 322 new cases of #COVID...,5252
2970,ABS-CBN News,"#COVID19 cases in the Philippines tops 3,000 a...",5252
7240,ABS-CBN News,JUST IN: #COVID19 cases in the country surpass...,5252
9304,Pascal BeltrandelRio,Viendo cómo se agrava la propagación del coro...,5252
...,...,...,...
12170,Jano García,Hoy ha llegado un avión de Inditex a España co...,1349
13779,Arnaldo Otegi 🔻,¿Quién entiende que pasemos todo el fin de sem...,1349
14326,Marcos G Morin Aguirre,Ahora que no agarren de pretexto el coronaviru...,1349
246,Randeep Singh Surjewala,"प्रधान मंत्री जी,\n\nकोई भारत को धमका नही सकता...",1221


## Request: Predict the new labels(Retweet counts) Retraining the Logistic Regression model using the whole dataset
🏁 **Answer:** <span style="background-color:yellow">The previous table shows the message and the prediction obtained.</span>