### Modules and libraries required

In [84]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

from IPython.core.interactiveshell import InteractiveShell
# Used to get multiple outputs per cell
InteractiveShell.ast_node_interactivity = "all"

### TASK 1: Main implementation
### Load Datasets

In [85]:
tweets_dataset = pd.read_csv('tweets_info.csv') 
user_dataset = pd.read_csv('user_info.csv') 
tweets_dataset.sort_values(by='Likes count', ascending = False)
# tweets_dataset.groupby('User').count().sort_values(['Tweet'], ascending=False)
user_dataset.sort_values(by='Followers', ascending=False)
# user_dataset.groupby('User').count().sort_values(['Impacts'], ascending=False)

Unnamed: 0,User,Date,Tweet,Binders,Permalink,Retweet count,Likes count,Tweet value
2168,Elon Musk,06.03.20 20:42,The coronavirus panic is dumb,,https://www.twitter.com/user/status/1236029449...,350555,1729596,47385.34
1764,Bernie Sanders,08.03.20 20:20,"Once a vaccine for coronavirus is developed, i...",,https://www.twitter.com/user/status/1236748536...,125019,841283,18291.78
544,J Hooch,10.03.20 17:25,Coronavirus is everywhere. BOOK THAT FLIGHT. T...,,https://www.twitter.com/user/status/1237429345...,139366,727945,14.75
1833,Ally Carter,08.03.20 14:53,I see a lot of people being like “I would surv...,,https://www.twitter.com/user/status/1236666460...,157052,719464,118.95
1284,Taylor ☾,09.03.20 19:22,Coronavirus has crossed the line for Italians ...,,https://www.twitter.com/user/status/1237096480...,206430,688358,30.02
...,...,...,...,...,...,...,...,...
293,ϟ 𝓟𝓻𝓲𝓶𝓶𝓲𝓲 𝓟𝓸𝓽𝓽𝓮𝓻 ☍,11.03.20 00:07,คือร้านอาหารใน All Seasons มีคนเป็น #COVID19 แ...,,https://www.twitter.com/user/status/1237530520...,1285,150,0.52
807,Cel ☆ ‎ลาแล้วปีสี่,10.03.20 10:50,มาขอใช้พื้นที่ประชาสัมพันธ์นิดนึงค่ะ ตอนนี้รพ....,,https://www.twitter.com/user/status/1237330092...,1903,138,7.76
902,Ryn J.,10.03.20 05:13,สำนักข่าว Bloomberg รายงานว่าชายไทยอายุ 26 ปี ...,,https://www.twitter.com/user/status/1237245162...,2852,137,61.16
1011,พส,10.03.20 00:34,รัฐมนตรีว่าการกระทรวงวัฒนธรรมฝรั่งเศส ป่วย #CO...,,https://www.twitter.com/user/status/1237175016...,3306,100,107.36


Unnamed: 0,Name,User,Location,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value
0,@BarackObama,Barack Obama,"Washington, DC",13,0,0,12,5,1,1489476161,115296834,150464.37,1466166.07
1,@Cristiano,Cristiano Ronaldo,"Turim, Piemonte",1,0,1,1,0,0,83133379,83133379,111363.10,146999.29
2,@ladygaga,Lady Gaga,,2,0,0,2,0,0,162436360,81218183,109802.60,241565.72
3,@realDonaldTrump,Donald J. Trump,"Washington, DC",55,0,3,11,1,42,4100089350,76238523,103166.11,5644215.11
4,@selenagomez,Selena Gomez,Los Angeles,1,0,0,1,0,0,60528525,60528525,83860.78,92246.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14263,@gegyrigle,JewelRigle,,1,0,1,0,0,0,1,1,0.00,0.00
14264,@sadbadme,กูจะอยู่กูจะรอด,,1,0,1,0,0,0,1,1,0.00,0.00
14265,@pawat58720145,pawat,,1,0,1,0,0,0,1,1,0.00,0.00
14266,@DelsVeja,🐉Y A S U K E🤺,"Ile-de-France, France",1,0,0,0,0,1,0,0,0.00,0.00


### 1.1 Clean Data (Remove punctuation and stop-words)
#### Count words by document

In [86]:
# first create the transform and stop words set for english
vectorizer = CountVectorizer(stop_words='english')

# The Tweet column is the list of documents
vectorizer.fit(tweets_dataset['Tweet'])

# Counter words by tweet array
word_counter = vectorizer.transform(tweets_dataset['Tweet'])
print('Vector Representation')
pd.DataFrame(word_counter.toarray(), columns=vectorizer.get_feature_names())

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

Vector Representation


Unnamed: 0,00,000,00001,000morts,004,005,008uae3frs,01,015,016,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF computation by document

In [87]:
# create the transform
vectorizer = TfidfVectorizer(stop_words='english')

# tokenize and build vocab
vectorizer.fit(tweets_dataset['Tweet'])
pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names(), columns=['IDF']).head(10)

# Tf-idf matrix by tweet
tf_idf = vectorizer.transform(tweets_dataset['Tweet'])
print('TF-IDF')
pd.DataFrame(tf_idf.toarray(), columns=vectorizer.get_feature_names())

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

Unnamed: 0,IDF
00,7.672395
000,4.530708
00001,8.925158
000morts,8.925158
004,8.519692
005,8.925158
008uae3frs,8.925158
01,8.519692
015,8.925158
016,8.925158


TF-IDF


Unnamed: 0,00,000,00001,000morts,004,005,008uae3frs,01,015,016,...,대구,수호,엑소,엔시티드림_당장_입국시켜,웨이션브이_당장_입국시켜,윤기야,좋은_날이_앞으로_많기를,찬열,트와이스,화이팅
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.133443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5526,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5527,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5528,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5529,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Feature set
#### 1.2 Merge User and Tweets info datasets

In [88]:
# Construct the feature set
custom_dataset = pd.merge(tweets_dataset, user_dataset, on=['User'], how='inner', indicator=True)
columns = ['Total tweets','Retweets','Images','Links','Chats','Text tweets','Impacts', 'Followers', 'User value','Tweets value', 'Retweet count', 'Likes count','Tweet value']
custom_dataset = custom_dataset[columns]

#### 1.3 Calculate the tweet popularity and append feature set

In [89]:
# Calculate tweet popularity
custom_dataset['Popular'] = np.where(custom_dataset['Likes count'] > 10000, True, False)
custom_dataset

Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,Retweet count,Likes count,Tweet value,Popular
0,22,0,3,5,0,14,9061712,416833,960.40,21959.22,5503,19357,950.17,True
1,22,0,3,5,0,14,9061712,416833,960.40,21959.22,1520,3253,1140.20,False
2,22,0,3,5,0,14,9061712,416833,960.40,21959.22,1278,2567,1140.20,False
3,22,0,3,5,0,14,9061712,416833,960.40,21959.22,1827,4246,950.17,False
4,22,0,3,5,0,14,9061712,416833,960.40,21959.22,3770,9834,1045.19,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5756,2,0,2,1,0,0,52386,26285,63.42,156.26,4398,4607,76.10,False
5757,1,0,1,0,0,0,387,387,1.17,1.40,4522,8830,1.40,False
5758,1,0,0,1,0,0,24010,24010,76.18,83.80,1891,2105,83.80,False
5759,2,0,2,0,1,0,6330,3292,8.50,13.03,2336,2884,2.04,False


#### 1.4 Divide the dataset into 80% training and 20% testing

In [90]:
# split the custom_dataset both in training and testing sets
train_feat, test_feat, train_labels, test_labels = train_test_split(custom_dataset.drop('Popular', axis= 1), custom_dataset['Popular'], test_size= 0.2)
print('Training Dataset\n')
train_feat
train_labels
print('Testing Dataset\n')
test_feat
test_labels

Training Dataset



Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,Retweet count,Likes count,Tweet value
1369,4,0,2,2,0,0,177287,66521,83.02,487.12,1983,3788,91.32
4494,1,0,0,0,0,1,11877,11877,36.77,36.77,2320,4270,36.77
1638,26,0,10,17,0,0,3546247,155905,397.74,9436.71,2070,4483,296.29
4819,1,0,0,0,0,1,768,768,2.20,2.20,2592,3528,2.20
5163,5,0,0,0,0,5,860734,174098,400.22,2014.94,3252,12670,400.22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5268,6,0,6,0,0,0,701850,117917,242.05,1730.13,2062,7161,287.46
944,115,0,55,102,0,15,42510984,372490,686.90,93457.67,2618,7162,681.76
1582,20,0,2,2,3,14,3639162,224986,521.83,7947.06,2481,5253,356.00
4901,5,0,1,5,0,0,1478913,296816,684.18,3901.79,2259,2449,748.70


1369    False
4494    False
1638    False
4819    False
5163     True
        ...  
5268    False
944     False
1582    False
4901    False
2024    False
Name: Popular, Length: 4608, dtype: bool

Testing Dataset



Unnamed: 0,Total tweets,Retweets,Images,Links,Chats,Text tweets,Impacts,Followers,User value,Tweets value,Retweet count,Likes count,Tweet value
5709,1,0,1,0,0,0,750,750,2.58,3.10,5213,15259,3.10
2484,11,0,1,1,0,9,820707,76829,151.49,1638.28,3832,31347,140.87
2770,50,0,0,27,2,25,117545980,2399451,4604.92,229539.39,2694,6942,4483.29
3730,1,0,1,0,0,0,3266,3266,8.92,10.70,4246,11360,10.70
3178,3,0,0,3,0,0,278874,93377,198.90,657.67,3644,8985,218.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,8,0,8,8,0,0,12856768,1618859,3219.81,34097.90,1626,7310,4250.15
4367,2,0,2,1,0,0,2422,1211,3.73,9.40,2064,2726,4.48
614,2,0,2,0,0,0,27613,13940,29.93,71.39,3644,5307,2.30
4046,5,0,5,0,1,0,23640,4728,12.88,64.93,6092,657,3.09


5709     True
2484     True
2770    False
3730     True
3178    False
        ...  
235     False
4367    False
614     False
4046    False
3234     True
Name: Popular, Length: 1153, dtype: bool

### 1.5 Classifiers
#### Naive Bayes

In [93]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets
model.fit(train_feat, train_labels)

score_train =  model.score(train_feat, train_labels)
print('Train set score: ', score_train)

#Predict the response for test dataset
prediction = model.predict(test_feat)

# Model Accuracy, how often is the classifier correct?
print("Prediction accuracy:",metrics.accuracy_score(test_labels, prediction))

GaussianNB(priors=None, var_smoothing=1e-09)

Train set score:  0.6473524305555556
Prediction accuracy: 0.6556808326105811
