### Modules and libraries required

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

from IPython.core.interactiveshell import InteractiveShell
# Used to get multiple outputs per cell
InteractiveShell.ast_node_interactivity = "all"

## TASK 1
### Load Datasets

In [3]:
tweets_dataset = pd.read_csv('tweets_info.csv') 
user_dataset = pd.read_csv('user_info.csv') 
tweets_dataset.sort_values(by='Likes count', ascending = False)
# tweets_dataset.groupby('User').count().sort_values(['Tweet'], ascending=False)
user_dataset.sort_values(by='Followers', ascending=False)
# user_dataset.groupby('User').count().sort_values(['Impacts'], ascending=False)

Unnamed: 0,User,Date,Tweet,Binders,Permalink,Retweet count,Likes count,Tweet value
2168,Elon Musk,06.03.20 20:42,The coronavirus panic is dumb,,https://www.twitter.com/user/status/1236029449...,350555,1729596,47385.34
1764,Bernie Sanders,08.03.20 20:20,"Once a vaccine for coronavirus is developed, i...",,https://www.twitter.com/user/status/1236748536...,125019,841283,18291.78
544,J Hooch,10.03.20 17:25,Coronavirus is everywhere. BOOK THAT FLIGHT. T...,,https://www.twitter.com/user/status/1237429345...,139366,727945,14.75
1833,Ally Carter,08.03.20 14:53,I see a lot of people being like “I would surv...,,https://www.twitter.com/user/status/1236666460...,157052,719464,118.95
1284,Taylor ☾,09.03.20 19:22,Coronavirus has crossed the line for Italians ...,,https://www.twitter.com/user/status/1237096480...,206430,688358,30.02
...,...,...,...,...,...,...,...,...
293,ϟ 𝓟𝓻𝓲𝓶𝓶𝓲𝓲 𝓟𝓸𝓽𝓽𝓮𝓻 ☍,11.03.20 00:07,คือร้านอาหารใน All Seasons มีคนเป็น #COVID19 แ...,,https://www.twitter.com/user/status/1237530520...,1285,150,0.52
807,Cel ☆ ‎ลาแล้วปีสี่,10.03.20 10:50,มาขอใช้พื้นที่ประชาสัมพันธ์นิดนึงค่ะ ตอนนี้รพ....,,https://www.twitter.com/user/status/1237330092...,1903,138,7.76
902,Ryn J.,10.03.20 05:13,สำนักข่าว Bloomberg รายงานว่าชายไทยอายุ 26 ปี ...,,https://www.twitter.com/user/status/1237245162...,2852,137,61.16
1011,พส,10.03.20 00:34,รัฐมนตรีว่าการกระทรวงวัฒนธรรมฝรั่งเศส ป่วย #CO...,,https://www.twitter.com/user/status/1237175016...,3306,100,107.36


Unnamed: 0,Name,User,Location,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value
0,@BarackObama,Barack Obama,"Washington, DC",13,0,0,12,5,1,1489476161,115296834,150464.37,1466166.07
1,@Cristiano,Cristiano Ronaldo,"Turim, Piemonte",1,0,1,1,0,0,83133379,83133379,111363.10,146999.29
2,@ladygaga,Lady Gaga,,2,0,0,2,0,0,162436360,81218183,109802.60,241565.72
3,@realDonaldTrump,Donald J. Trump,"Washington, DC",55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11
4,@selenagomez,Selena Gomez,Los Angeles,1,0,0,1,0,0,60528525,60528525,83860.78,92246.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14263,@gegyrigle,JewelRigle,,1,0,1,0,0,0,1,1,0.00,0.00
14264,@sadbadme,กูจะอยู่กูจะรอด,,1,0,1,0,0,0,1,1,0.00,0.00
14265,@pawat58720145,pawat,,1,0,1,0,0,0,1,1,0.00,0.00
14266,@DelsVeja,🐉Y A S U K E🤺,"Ile-de-France, France",1,0,0,0,0,1,0,0,0.00,0.00


### 1.1 Clean Data (Remove punctuation and stop-words)
#### Count words by document

In [4]:
# first create the transform and stop words set for english
vectorizer = CountVectorizer(stop_words='english')

# The Tweet column is the list of documents
vectorizer.fit(tweets_dataset['Tweet'])

# Counter words by tweet array
word_counter = vectorizer.transform(tweets_dataset['Tweet'])
print('Vector Representation')
pd.DataFrame(word_counter.toarray(), columns=vectorizer.get_feature_names())

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

Vector Representation


Unnamed: 0,00,000,00001,000morts,004,005,008uae3frs,01,015,016,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF computation by document

In [5]:
# create the transform
vectorizer = TfidfVectorizer(stop_words='english')

# tokenize and build vocab
vectorizer.fit(tweets_dataset['Tweet'])
pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names(), columns=['IDF']).head(10)

# Tf-idf matrix by tweet
tf_idf = vectorizer.transform(tweets_dataset['Tweet'])
tf_idf = pd.DataFrame(tf_idf.toarray(), columns=vectorizer.get_feature_names())
# Add TF-IDF column to tweets_dataset
tweets_dataset = pd.concat([tweets_dataset, tf_idf], axis=1)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

Unnamed: 0,IDF
00,7.672395
000,4.530708
00001,8.925158
000morts,8.925158
004,8.519692
005,8.925158
008uae3frs,8.925158
01,8.519692
015,8.925158
016,8.925158


## Feature set
### 1.2 Merge User and Tweets info datasets

**In this step is created the dataset both for task 1 as task 2, therefore is required to run this before [task_2.ipynb](task_2.ipynb)**

In [6]:
# Construct the feature set
custom_dataset = pd.merge(user_dataset, tweets_dataset, on=['User'], how='inner')
drop_columns = ['User','Date','Tweet','Binders','Permalink','Name','Location']
custom_dataset.drop(drop_columns, axis=1, inplace=True)
custom_dataset

Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
0,13,0,0,12,5,1,1489476161,115296834,150464.37,1466166.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5756,1,0,1,0,0,0,4,4,0.02,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5757,1,0,1,0,0,0,3,3,0.01,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5758,1,0,1,0,0,0,2,2,0.01,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5759,1,0,1,0,0,0,1,1,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 1.3 Calculate the tweet popularity and append feature set

In [7]:
# Calculate tweet popularity
custom_dataset['Popular'] = np.where(custom_dataset['Likes count'] > 10000, True, False)
custom_dataset

Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅,Popular
0,13,0,0,12,5,1,1489476161,115296834,150464.37,1466166.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4,55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5756,1,0,1,0,0,0,4,4,0.02,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
5757,1,0,1,0,0,0,3,3,0.01,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
5758,1,0,1,0,0,0,2,2,0.01,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
5759,1,0,1,0,0,0,1,1,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


### 1.4 Divide the dataset into 80% training and 20% testing

In [8]:
# split the custom_dataset both in training and testing sets
train_feat, test_feat, train_labels, test_labels = train_test_split(custom_dataset.drop('Popular', axis= 1), custom_dataset['Popular'], test_size= 0.2)
print('Training Dataset\n')
train_feat
train_labels
print('Testing Dataset\n')
test_feat
test_labels

Training Dataset



Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
2378,17,0,1,13,0,3,4426349,268917,498.49,9135.69,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4004,1,0,1,1,0,0,14662,14662,35.20,46.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1820,12,0,4,4,0,4,5359956,451856,1010.38,13206.08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1511,16,0,3,7,0,6,10510192,660932,1402.81,24133.18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5638,1,0,1,0,0,0,179,179,0.64,0.77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5730,1,0,1,0,0,0,31,31,0.12,0.14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2524,1,0,0,0,0,1,221690,221690,511.93,511.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2803,26,0,10,17,0,0,3546247,155905,397.74,9436.71,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
453,28,0,7,5,0,16,120891870,4347795,7654.53,227443.77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


2378    False
4004    False
1820    False
1511     True
5638    False
        ...  
5730    False
2524    False
2803    False
453      True
5410     True
Name: Popular, Length: 4608, dtype: bool

Testing Dataset



Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
1776,15,0,1,9,0,5,6972664,475601,1042.57,16442.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2142,6,0,6,0,0,0,1972471,329084,610.58,4397.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4315,1,0,0,0,0,1,15791,15791,46.47,46.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3579,2,0,1,0,0,1,101530,50798,110.86,243.74,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4658,2,0,2,0,0,0,15808,7904,20.44,49.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4386,1,0,0,1,0,0,13650,13650,34.73,38.20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2676,11,0,5,3,0,6,1489711,186267,440.42,3971.44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
595,35,0,16,35,6,0,85425882,2484384,4635.50,163766.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1862,25,0,0,1,0,24,10485807,425420,955.44,23677.41,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


1776     True
2142    False
4315     True
3579     True
4658    False
        ...  
4386     True
2676    False
595     False
1862     True
1934     True
Name: Popular, Length: 1153, dtype: bool

### 1.5 Classifiers
#### Naive Bayes

In [9]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import MultinomialNB

#Create a Gaussian Classifier
model = MultinomialNB()

# Train the model using the training sets
model.fit(train_feat, train_labels)

score_train =  model.score(train_feat, train_labels)
print('Train set score: ', score_train)

#Predict the response for test dataset
prediction = model.predict(test_feat)

# Model Accuracy, how often is the classifier correct?
print("Prediction accuracy:",metrics.accuracy_score(test_labels, prediction))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Train set score:  0.4822048611111111
Prediction accuracy: 0.4856895056374675


#### Nearest Neighbors 

In [10]:
#Import KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

#Create a KNeighbors Classifier
model = KNeighborsClassifier(n_neighbors=5)

# Train the model using the training sets
model.fit(train_feat,train_labels)

score_train =  model.score(train_feat, train_labels)
print('Train set score: ', score_train)

#Predict the response for test dataset
prediction = model.predict(test_feat)

# Model Accuracy, how often is the classifier correct?
print("Prediction accuracy:",metrics.accuracy_score(test_labels, prediction))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

Train set score:  0.9086371527777778
Prediction accuracy: 0.8447528187337381


## Task 2
### 2.1 Logistic Regression
#### Split custom dataset into 90% for training and 10% for testing

In [11]:
# split the custom_dataset both in training and testing sets
train_feat, test_feat, train_labels, test_labels = train_test_split(custom_dataset.drop('Popular', axis= 1), custom_dataset['Popular'], test_size= 0.1)
print('Training Dataset\n')
train_feat
train_labels
print('Testing Dataset\n')
test_feat
test_labels

Training Dataset



Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
4666,1,0,0,0,0,1,7796,7796,23.61,23.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3774,1,0,1,0,0,0,38573,38573,90.68,108.82,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4153,1,0,0,0,0,1,19887,19887,48.88,48.88,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
757,44,0,5,7,0,33,75610388,1741615,3294.98,148832.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1280,25,0,0,1,0,24,20977608,847475,1663.30,41363.65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1397,2,0,0,0,0,2,8628,4314,11.74,23.48,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2634,22,0,8,6,1,8,4110417,193074,397.93,9050.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4601,1,0,1,0,0,0,9014,9014,23.35,28.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2734,1,0,0,1,0,0,171424,171424,406.18,446.80,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


4666     True
3774    False
4153    False
757      True
1280     True
        ...  
1397     True
2634     True
4601    False
2734     True
773     False
Name: Popular, Length: 5184, dtype: bool

Testing Dataset



Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
2280,33,0,5,21,1,7,9278657,287660,538.78,18581.92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2587,18,0,1,0,1,16,3614789,204342,398.27,6822.52,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2170,33,0,1,1,2,29,10409222,317898,684.50,21545.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4024,1,0,1,0,0,0,958,958,2.66,3.19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1589,7,0,0,3,0,4,4228811,604133,1288.21,9403.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2064,115,0,55,102,0,15,42510984,372490,686.90,93457.67,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
960,41,0,41,0,0,0,64001481,1582264,3093.17,150365.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3591,23,0,22,2,0,0,996539,49463,108.80,2664.71,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5114,1,0,0,1,0,0,2347,2347,6.46,7.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


2280    False
2587    False
2170    False
4024    False
1589     True
        ...  
2064     True
960      True
3591    False
5114    False
1645    False
Name: Popular, Length: 577, dtype: bool

#### Logistic Regression implementation

In [12]:
#Import Logistic Regression classifier
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(train_feat, train_labels)

score_train =  model.score(train_feat, train_labels)
print('Train set score: ', score_train)

#Predict the response for test dataset
prediction = model.predict(test_feat)

# Model Accuracy, how often is the classifier correct?
print("Prediction accuracy:",metrics.accuracy_score(test_labels, prediction))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Train set score:  0.6730324074074074
Prediction accuracy: 0.6221837088388215


### 2.2 Root Mean Squared Root (RMSE)

In [13]:
#Import RMSE metric
from sklearn.metrics import mean_squared_error
# converting True to 1 and False to 0, to compute RMSE
test_labels = test_labels.astype(int)
prediction = prediction.astype(int)

rmse = mean_squared_error(test_labels, prediction)
# RMSE result
print("RMSE score:",rmse)

RMSE score: 0.3778162911611785


### 2.3 No. of retweets Prediction
#### Load new dataset

In [14]:
new_tweets_dataset = pd.read_csv('new_tweets.csv') 
new_tweets_dataset

Unnamed: 0,User,Tweet
0,The White House,LIVE: Press Briefing with Coronavirus Task For...
1,isentoes2,Após governo americano dizer que China esconde...
2,Adam Schefter,Twitter CEO Jack Dorsey pledged $1 billion tow...
3,Kyle Griffin,"House Armed Services Chairman Adam Smith: ""The..."
4,Kamala Harris,Black communities disproportionately suffer fr...
...,...,...
14995,Juanita Broaddrick,Many questions emerging about Dr Fauci’s agend...
14996,Benny,Taiwan Says It Warned @WHO About Coronavirus I...
14997,The Spectator Index,SPAIN: Coronavirus death toll rises by 324 ove...
14998,Donald Trump Jr.,David Bossie: Americans uniting to fight coron...


#### Calculate the tf-idf of new tweets

In [15]:
vectorizer = TfidfVectorizer(stop_words='english')

# tokenize and build vocab
vectorizer.fit(new_tweets_dataset['Tweet'])

# Tf-idf matrix by tweet
tf_idf = vectorizer.transform(new_tweets_dataset['Tweet'])
tf_idf = pd.DataFrame(tf_idf.toarray(), columns=vectorizer.get_feature_names())
# Add TF-IDF column to tweets_dataset
new_tweets_dataset = pd.concat([new_tweets_dataset, tf_idf], axis=1)
new_tweets_dataset

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

Unnamed: 0,User,Tweet,00,000,0002,000citizens,000l,000m,000s,000th,...,𝗙𝗲𝗯𝗿𝘂𝗮𝗿𝘆,𝗚𝗟𝗢𝗕𝗔𝗟,𝗝𝗮𝗻𝘂𝗮𝗿𝘆,𝗠𝗮𝗿𝗰𝗵,𝗿𝗲𝘀𝗽𝗼𝗻𝘀𝗲,𝘁𝗼,𝘢𝘤𝘤𝘶𝘳𝘢𝘵𝘦,𝘤𝘶𝘳𝘳𝘦𝘯𝘵,𝟭𝟳,𝟮𝟰
0,The White House,LIVE: Press Briefing with Coronavirus Task For...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,isentoes2,Após governo americano dizer que China esconde...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Adam Schefter,Twitter CEO Jack Dorsey pledged $1 billion tow...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Kyle Griffin,"House Armed Services Chairman Adam Smith: ""The...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Kamala Harris,Black communities disproportionately suffer fr...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,Juanita Broaddrick,Many questions emerging about Dr Fauci’s agend...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14996,Benny,Taiwan Says It Warned @WHO About Coronavirus I...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14997,The Spectator Index,SPAIN: Coronavirus death toll rises by 324 ove...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14998,Donald Trump Jr.,David Bossie: Americans uniting to fight coron...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Split custom dataset in labels(Retweet count) and features

In [16]:
# split the custom_dataset both in training and testing sets
new_custom_dataset = custom_dataset.drop(['Likes count','Tweet value','Popular','Total tweets','Retweets','Images','Links','Chats','Text tweets','Impacts','Followers','User value','Tweets value'], axis=1)
new_train_feat, new_train_labels= new_custom_dataset.drop('Retweet count', axis= 1), new_custom_dataset['Retweet count']
new_train_feat
new_train_labels

Unnamed: 0,00,000,00001,000morts,004,005,008uae3frs,01,015,016,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.431112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5756,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5757,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5758,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5759,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0       131697
1        13104
2        81356
3        14595
4        25424
         ...  
5756     11880
5757      2723
5758      1590
5759      6907
5760     15214
Name: Retweet count, Length: 5761, dtype: int64

In [17]:
model = LogisticRegression()
model.fit(new_train_feat, new_train_labels)

score_train =  model.score(new_train_feat, new_train_labels)
print('Train set score: ', score_train)

MemoryError: Unable to allocate 823. MiB for an array with shape (107875017,) and data type float64