### Modules and libraries required

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

from IPython.core.interactiveshell import InteractiveShell
# Used to get multiple outputs per cell
InteractiveShell.ast_node_interactivity = "all"

## TASK 1
### Load Datasets

In [2]:
tweets_dataset = pd.read_csv('tweets_info.csv') 
user_dataset = pd.read_csv('user_info.csv') 
tweets_dataset.sort_values(by='Likes count', ascending = False)
# tweets_dataset.groupby('User').count().sort_values(['Tweet'], ascending=False)
user_dataset.sort_values(by='Followers', ascending=False)
# user_dataset.groupby('User').count().sort_values(['Impacts'], ascending=False)

Unnamed: 0,User,Date,Tweet,Binders,Permalink,Retweet count,Likes count,Tweet value
2168,Elon Musk,06.03.20 20:42,The coronavirus panic is dumb,,https://www.twitter.com/user/status/1236029449...,350555,1729596,47385.34
1764,Bernie Sanders,08.03.20 20:20,"Once a vaccine for coronavirus is developed, i...",,https://www.twitter.com/user/status/1236748536...,125019,841283,18291.78
544,J Hooch,10.03.20 17:25,Coronavirus is everywhere. BOOK THAT FLIGHT. T...,,https://www.twitter.com/user/status/1237429345...,139366,727945,14.75
1833,Ally Carter,08.03.20 14:53,I see a lot of people being like “I would surv...,,https://www.twitter.com/user/status/1236666460...,157052,719464,118.95
1284,Taylor ☾,09.03.20 19:22,Coronavirus has crossed the line for Italians ...,,https://www.twitter.com/user/status/1237096480...,206430,688358,30.02
...,...,...,...,...,...,...,...,...
293,ϟ 𝓟𝓻𝓲𝓶𝓶𝓲𝓲 𝓟𝓸𝓽𝓽𝓮𝓻 ☍,11.03.20 00:07,คือร้านอาหารใน All Seasons มีคนเป็น #COVID19 แ...,,https://www.twitter.com/user/status/1237530520...,1285,150,0.52
807,Cel ☆ ‎ลาแล้วปีสี่,10.03.20 10:50,มาขอใช้พื้นที่ประชาสัมพันธ์นิดนึงค่ะ ตอนนี้รพ....,,https://www.twitter.com/user/status/1237330092...,1903,138,7.76
902,Ryn J.,10.03.20 05:13,สำนักข่าว Bloomberg รายงานว่าชายไทยอายุ 26 ปี ...,,https://www.twitter.com/user/status/1237245162...,2852,137,61.16
1011,พส,10.03.20 00:34,รัฐมนตรีว่าการกระทรวงวัฒนธรรมฝรั่งเศส ป่วย #CO...,,https://www.twitter.com/user/status/1237175016...,3306,100,107.36


Unnamed: 0,Name,User,Location,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value
0,@BarackObama,Barack Obama,"Washington, DC",13,0,0,12,5,1,1489476161,115296834,150464.37,1466166.07
1,@Cristiano,Cristiano Ronaldo,"Turim, Piemonte",1,0,1,1,0,0,83133379,83133379,111363.10,146999.29
2,@ladygaga,Lady Gaga,,2,0,0,2,0,0,162436360,81218183,109802.60,241565.72
3,@realDonaldTrump,Donald J. Trump,"Washington, DC",55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11
4,@selenagomez,Selena Gomez,Los Angeles,1,0,0,1,0,0,60528525,60528525,83860.78,92246.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14263,@gegyrigle,JewelRigle,,1,0,1,0,0,0,1,1,0.00,0.00
14264,@sadbadme,กูจะอยู่กูจะรอด,,1,0,1,0,0,0,1,1,0.00,0.00
14265,@pawat58720145,pawat,,1,0,1,0,0,0,1,1,0.00,0.00
14266,@DelsVeja,🐉Y A S U K E🤺,"Ile-de-France, France",1,0,0,0,0,1,0,0,0.00,0.00


### 1.1 Clean Data (Remove punctuation and stop-words)
#### Count words by document

In [3]:
# first create the transform and stop words set for english
vectorizer = CountVectorizer(stop_words='english')

# The Tweet column is the list of documents
vectorizer.fit(tweets_dataset['Tweet'])

# Counter words by tweet array
word_counter = vectorizer.transform(tweets_dataset['Tweet'])
print('Vector Representation')
pd.DataFrame(word_counter.toarray(), columns=vectorizer.get_feature_names())

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

Vector Representation


Unnamed: 0,00,000,00001,000morts,004,005,008uae3frs,01,015,016,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF computation by document

In [4]:
# create the transform
vectorizer = TfidfVectorizer(stop_words='english')

# tokenize and build vocab
vectorizer.fit(tweets_dataset['Tweet'])
pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names(), columns=['IDF']).head(10)

# Tf-idf matrix by tweet
tf_idf = vectorizer.transform(tweets_dataset['Tweet'])
tf_idf = pd.DataFrame(tf_idf.toarray(), columns=vectorizer.get_feature_names())
# Add TF-IDF column to tweets_dataset
tweets_dataset = pd.concat([tweets_dataset, tf_idf], axis=1)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

Unnamed: 0,IDF
00,7.672395
000,4.530708
00001,8.925158
000morts,8.925158
004,8.519692
005,8.925158
008uae3frs,8.925158
01,8.519692
015,8.925158
016,8.925158


## Feature set
### 1.2 Merge User and Tweets info datasets

**In this step is created the dataset both for task 1 as task 2, therefore is required to run this before [task_2.ipynb](task_2.ipynb)**

In [24]:
# Construct the feature set
custom_dataset = pd.merge(user_dataset, tweets_dataset, on=['User'], how='inner')
drop_columns = ['User','Date','Tweet','Binders','Permalink','Name','Location']
custom_dataset.drop(drop_columns, axis=1, inplace=True)
custom_dataset

Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
0,13,0,0,12,5,1,1489476161,115296834,150464.37,1466166.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5756,1,0,1,0,0,0,4,4,0.02,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5757,1,0,1,0,0,0,3,3,0.01,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5758,1,0,1,0,0,0,2,2,0.01,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5759,1,0,1,0,0,0,1,1,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 1.3 Calculate the tweet popularity and append feature set

In [25]:
# Calculate tweet popularity
custom_dataset['Popular'] = np.where(custom_dataset['Likes count'] > 10000, True, False)
custom_dataset

Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅,Popular
0,13,0,0,12,5,1,1489476161,115296834,150464.37,1466166.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5756,1,0,1,0,0,0,4,4,0.02,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
5757,1,0,1,0,0,0,3,3,0.01,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
5758,1,0,1,0,0,0,2,2,0.01,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
5759,1,0,1,0,0,0,1,1,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


### 1.4 Divide the dataset into 80% training and 20% testing

In [17]:
# split the custom_dataset both in training and testing sets
train_feat, test_feat, train_labels, test_labels = train_test_split(custom_dataset.drop('Popular', axis= 1), custom_dataset['Popular'], test_size= 0.2)
print('Training Dataset\n')
train_feat
train_labels
print('Testing Dataset\n')
test_feat
test_labels

Training Dataset



Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
5661,1,0,1,0,1,0,124,124,0.45,0.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4983,1,0,0,0,0,1,3429,3429,11.35,11.35,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2607,4,0,1,2,0,1,799400,199850,353.51,1555.44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4294,1,0,0,0,0,1,16309,16309,39.93,39.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5653,1,0,0,0,0,1,144,144,0.47,0.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4613,2,0,0,2,0,0,17270,8777,30.89,67.10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1654,5,0,0,0,0,5,2684385,541070,1181.70,5921.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
947,32,0,1,26,0,5,50259973,1611077,3118.35,106087.24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4699,1,0,1,0,0,0,7316,7316,19.42,23.30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


5661    False
4983    False
2607     True
4294    False
5653     True
        ...  
4613    False
1654    False
947     False
4699    False
1296    False
Name: Popular, Length: 4608, dtype: bool

Testing Dataset



Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
591,35,0,7,29,2,1,81865185,2495232,4588.70,162362.63,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1022,129,0,4,134,1,0,162902380,1277956,2552.10,357075.98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1690,206,0,119,15,3,72,78434881,536506,1135.01,189277.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1749,14,0,4,0,3,8,6743360,500597,1152.11,13630.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5095,1,0,1,0,0,0,2474,2474,6.22,7.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4059,2,0,1,0,0,1,34838,17420,43.66,96.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4757,2,0,0,1,1,0,12604,6302,16.65,21.65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5531,1,0,0,1,0,0,370,370,1.18,1.30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4372,1,0,1,0,0,0,14023,14023,34.67,41.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


591      True
1022     True
1690    False
1749    False
5095    False
        ...  
4059    False
4757    False
5531     True
4372    False
1502     True
Name: Popular, Length: 1153, dtype: bool

### 1.5 Classifiers
#### Naive Bayes

In [8]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import MultinomialNB

#Create a Gaussian Classifier
model = MultinomialNB()

# Train the model using the training sets
model.fit(train_feat, train_labels)

score_train =  model.score(train_feat, train_labels)
print('Train set score: ', score_train)

#Predict the response for test dataset
prediction = model.predict(test_feat)

# Model Accuracy, how often is the classifier correct?
print("Prediction accuracy:",metrics.accuracy_score(test_labels, prediction))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Train set score:  0.4811197916666667
Prediction accuracy: 0.48829141370338247


#### Nearest Neighbors 

In [9]:
#Import KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

#Create a KNeighbors Classifier
model = KNeighborsClassifier(n_neighbors=5)

# Train the model using the training sets
model.fit(train_feat,train_labels)

score_train =  model.score(train_feat, train_labels)
print('Train set score: ', score_train)

#Predict the response for test dataset
prediction = model.predict(test_feat)

# Model Accuracy, how often is the classifier correct?
print("Prediction accuracy:",metrics.accuracy_score(test_labels, prediction))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

Train set score:  0.9060329861111112
Prediction accuracy: 0.8438855160450998


## Task 2
### 2.1 Logistic Regression
#### Split custom dataset into 90% for training and 10% for testing

In [10]:
# split the custom_dataset both in training and testing sets
train_feat, test_feat, train_labels, test_labels = train_test_split(custom_dataset.drop('Popular', axis= 1), custom_dataset['Popular'], test_size= 0.1)
print('Training Dataset\n')
train_feat
train_labels
print('Testing Dataset\n')
test_feat
test_labels

Training Dataset



Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
5118,1,0,0,0,0,1,2298,2298,6.63,6.63,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5642,1,0,1,0,0,0,174,174,0.56,0.67,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2870,4,0,1,1,0,2,559423,142444,336.25,1459.32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1094,3,0,0,1,0,2,3808144,1274354,2546.22,7866.14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5623,1,0,1,0,0,0,204,204,0.56,0.67,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5659,2,0,2,0,0,0,250,125,0.40,0.96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1241,53,0,43,1,2,7,47317605,929284,1875.41,108782.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2345,137,0,21,113,3,5,36355851,282601,654.62,92432.65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3856,10,0,4,5,0,1,322126,32619,73.48,833.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


5118    False
5642     True
2870     True
1094     True
5623    False
        ...  
5659    False
1241    False
2345    False
3856    False
4473     True
Name: Popular, Length: 5184, dtype: bool

Testing Dataset



Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
4297,1,0,1,0,0,0,16218,16218,41.55,49.86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2479,3,0,2,2,0,0,684442,229660,544.14,1957.73,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2398,8,0,7,0,0,1,2058007,259147,490.34,4598.19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1969,1,0,1,0,0,0,385866,385866,842.59,1011.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3633,1,0,0,0,0,1,47637,47637,100.46,100.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2222,3,0,0,0,0,3,895307,298436,680.31,2040.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
506,9,0,9,0,0,0,30481387,3406130,6170.01,66545.89,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2332,137,0,21,113,3,5,36355851,282601,654.62,92432.65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,8,0,8,8,0,0,12856768,1618859,3219.81,34097.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


4297     True
2479    False
2398     True
1969    False
3633    False
        ...  
2222    False
506     False
2332    False
940     False
602     False
Name: Popular, Length: 577, dtype: bool

#### Logistic Regression implementation

In [11]:
#Import Logistic Regression classifier
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(train_feat, train_labels)

score_train =  model.score(train_feat, train_labels)
print('Train set score: ', score_train)

#Predict the response for test dataset
prediction = model.predict(test_feat)

# Model Accuracy, how often is the classifier correct?
print("Prediction accuracy:",metrics.accuracy_score(test_labels, prediction))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Train set score:  0.6587577160493827
Prediction accuracy: 0.6984402079722704


### 2.2 Root Mean Squared Root (RMSE)

In [12]:
#Import RMSE metric
from sklearn.metrics import mean_squared_error
# converting True to 1 and False to 0, to compute RMSE
test_labels = test_labels.astype(int)
prediction = prediction.astype(int)

rmse = mean_squared_error(test_labels, prediction)
# RMSE result
print("RMSE score:",rmse)

RMSE score: 0.30155979202772965


### 2.3 No. of retweets Prediction
#### Load new dataset

In [13]:
new_tweets_dataset = pd.read_csv('new_tweets.csv') 
new_tweets_dataset

Unnamed: 0,User,Tweet
0,The White House,LIVE: Press Briefing with Coronavirus Task For...
1,isentoes2,Após governo americano dizer que China esconde...
2,Adam Schefter,Twitter CEO Jack Dorsey pledged $1 billion tow...
3,Kyle Griffin,"House Armed Services Chairman Adam Smith: ""The..."
4,Kamala Harris,Black communities disproportionately suffer fr...
...,...,...
14995,Juanita Broaddrick,Many questions emerging about Dr Fauci’s agend...
14996,Benny,Taiwan Says It Warned @WHO About Coronavirus I...
14997,The Spectator Index,SPAIN: Coronavirus death toll rises by 324 ove...
14998,Donald Trump Jr.,David Bossie: Americans uniting to fight coron...


In [14]:
# create the transform
vectorizer = TfidfVectorizer(stop_words='english')

# tokenize and build vocab
vectorizer.fit(new_tweets_dataset['Tweet'])

# Tf-idf matrix by tweet
tf_idf = vectorizer.transform(new_tweets_dataset['Tweet'])
tf_idf = pd.DataFrame(tf_idf.toarray(), columns=vectorizer.get_feature_names())
tf_idf
# Add TF-IDF column to tweets_dataset
# tweets_dataset = pd.concat([new_tweets_dataset, tf_idf], axis=1)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

Unnamed: 0,00,000,0002,000citizens,000l,000m,000s,000th,000人以上と伝える,002,...,𝗙𝗲𝗯𝗿𝘂𝗮𝗿𝘆,𝗚𝗟𝗢𝗕𝗔𝗟,𝗝𝗮𝗻𝘂𝗮𝗿𝘆,𝗠𝗮𝗿𝗰𝗵,𝗿𝗲𝘀𝗽𝗼𝗻𝘀𝗲,𝘁𝗼,𝘢𝘤𝘤𝘶𝘳𝘢𝘵𝘦,𝘤𝘶𝘳𝘳𝘦𝘯𝘵,𝟭𝟳,𝟮𝟰
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# split the custom_dataset both in training and testing sets
new_custom_dataset = custom_dataset.drop(['Likes count','Tweet value','Popular'], axis=1)
train_feat, train_labels= new_custom_dataset.drop('Retweet count', axis= 1), new_custom_dataset['Retweet count']
train_feat
train_labels

Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
0,13,0,0,12,5,1,1489476161,115296834,150464.37,1466166.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5756,1,0,1,0,0,0,4,4,0.02,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5757,1,0,1,0,0,0,3,3,0.01,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5758,1,0,1,0,0,0,2,2,0.01,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5759,1,0,1,0,0,0,1,1,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0       131697
1        13104
2        81356
3        14595
4        25424
         ...  
5756     11880
5757      2723
5758      1590
5759      6907
5760     15214
Name: Retweet count, Length: 5761, dtype: int64