In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math 

In [13]:
# Load the tweet dataset from "data.csv" (path is relative to the notebook's
# working directory) into a DataFrame used by every cell below.
tweetsData=pd.read_csv("data.csv")

In [18]:
# Check the size of the data: (number of observations, number of columns).
tweetsData.shape

(25694, 34)

So we have 25694 observations across 34 distinct parameters (columns).

In [17]:
# List the attributes (column names) we are dealing with.
tweetsData.columns

Index(['tweet_id', 'tweet_created_at', 'tweet_created_on_holiday_bool',
       'tweet_created_on_weekend_bool', 'tweet_created_at_noon_bool',
       'tweet_created_at_eve_bool', 'user_id', 'user_screen_name',
       'user_screen_name_length', 'user_no_of_tweets', 'user_no_of_followers',
       'user_no_of_followings', 'user_account_age', 'user_no_of_favourites',
       'user_average_tweets', 'user_average_favourites',
       'user_account_location', 'tweet_text', 'tweet_text_length',
       'tweet_text_optimal_length', 'tweet_text_no_of_hashtags',
       'tweet_text_contains_hashtags', 'tweet_text_contains_url',
       'tweet_text_no_of_user_mentions', 'tweet_text_contains_user_mentions',
       'tweet_text_sentiment', 'tweet_text_contains_media',
       'tweet_text_contains_number', 'tweet_text_contains_upper_words',
       'tweet_text_contains_lower_words', 'tweet_text_contains_excl',
       'tweet_text_contains_retweet_suggestion', 'retweeted', 'retweets'],
      dtype='object')

In [15]:
# Take a first look at these columns by displaying the first five rows.
tweetsData.head()

Unnamed: 0,tweet_id,tweet_created_at,tweet_created_on_holiday_bool,tweet_created_on_weekend_bool,tweet_created_at_noon_bool,tweet_created_at_eve_bool,user_id,user_screen_name,user_screen_name_length,user_no_of_tweets,...,tweet_text_contains_user_mentions,tweet_text_sentiment,tweet_text_contains_media,tweet_text_contains_number,tweet_text_contains_upper_words,tweet_text_contains_lower_words,tweet_text_contains_excl,tweet_text_contains_retweet_suggestion,retweeted,retweets
0,~1357587318119407616~,Fri Feb 05 07:10:14 +0000 2021,0,0,0,0,~327890669~,mavirise,8,4355,...,0,nue,0,1,0,1,1,0,False,0
1,~1357587287681323010~,Fri Feb 05 07:10:06 +0000 2021,0,0,0,0,~327890669~,mavirise,8,4355,...,0,nue,0,1,0,1,1,0,False,0
2,~1357587268689485825~,Fri Feb 05 07:10:02 +0000 2021,0,0,0,0,~1216545386040418304~,aman89086818,12,48,...,0,nue,0,0,0,1,0,0,False,0
3,~1357587262469525507~,Fri Feb 05 07:10:00 +0000 2021,0,0,0,0,~1731144182~,jagbirsingh111,14,196,...,1,neg,0,0,0,1,0,0,False,0
4,~1357587255699775488~,Fri Feb 05 07:09:59 +0000 2021,0,0,0,0,~327890669~,mavirise,8,4355,...,0,nue,0,1,0,1,1,0,False,0


In [21]:
# Count the missing (NaN) values in each column.
tweetsData.isna().sum()

tweet_id                                      0
tweet_created_at                              0
tweet_created_on_holiday_bool                 0
tweet_created_on_weekend_bool                 0
tweet_created_at_noon_bool                    0
tweet_created_at_eve_bool                     0
user_id                                       0
user_screen_name                              0
user_screen_name_length                       0
user_no_of_tweets                             0
user_no_of_followers                          0
user_no_of_followings                         0
user_account_age                              0
user_no_of_favourites                         0
user_average_tweets                           0
user_average_favourites                       0
user_account_location                     11436
tweet_text                                    0
tweet_text_length                             0
tweet_text_optimal_length                     0
tweet_text_no_of_hashtags               

In [56]:
# Number of tweets that were never retweeted (retweets == 0).
# Summing the boolean mask counts the True entries without building a
# filtered copy of the frame.
int((tweetsData["retweets"] == 0).sum())

15827

So we can see that the only column with missing values is user_account_location: 11436 of 25694 rows (approximately 44.5%) are missing, presumably because those users have not given Twitter permission to know their account location. However, this will not matter much here. Also, the retweets column has no missing values.

In [19]:
# Inspect the data type of each column and its number of non-null entries.
tweetsData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25694 entries, 0 to 25693
Data columns (total 34 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   tweet_id                                25694 non-null  object 
 1   tweet_created_at                        25694 non-null  object 
 2   tweet_created_on_holiday_bool           25694 non-null  int64  
 3   tweet_created_on_weekend_bool           25694 non-null  int64  
 4   tweet_created_at_noon_bool              25694 non-null  int64  
 5   tweet_created_at_eve_bool               25694 non-null  int64  
 6   user_id                                 25694 non-null  object 
 7   user_screen_name                        25694 non-null  object 
 8   user_screen_name_length                 25694 non-null  int64  
 9   user_no_of_tweets                       25694 non-null  int64  
 10  user_no_of_followers                    25694 non-null  in

In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) for the NUMERIC
# columns of the Twitter data; non-numeric columns are left out by default.
tweetsData.describe()

Unnamed: 0,tweet_created_on_holiday_bool,tweet_created_on_weekend_bool,tweet_created_at_noon_bool,tweet_created_at_eve_bool,user_screen_name_length,user_no_of_tweets,user_no_of_followers,user_no_of_followings,user_account_age,user_no_of_favourites,...,tweet_text_contains_url,tweet_text_no_of_user_mentions,tweet_text_contains_user_mentions,tweet_text_contains_media,tweet_text_contains_number,tweet_text_contains_upper_words,tweet_text_contains_lower_words,tweet_text_contains_excl,tweet_text_contains_retweet_suggestion,retweets
count,25694.0,25694.0,25694.0,25694.0,25694.0,25694.0,25694.0,25694.0,25694.0,25694.0,...,25694.0,25694.0,25694.0,25694.0,25694.0,25694.0,25694.0,25694.0,25694.0,25694.0
mean,0.0,0.015256,0.096404,0.080213,11.995135,5521.590255,2410.177,402.654705,1270.251382,5792.574103,...,0.619717,0.657975,0.385615,0.119327,0.663813,0.214992,0.892387,0.093329,0.23414,9.884954
std,0.0,0.122574,0.29515,0.271628,2.567414,20504.915268,81797.65,1219.261763,1406.53454,18283.982312,...,0.485466,1.153367,0.48675,0.32418,0.472413,0.410825,0.309897,0.290899,0.423469,415.07924
min,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,10.0,95.0,8.0,38.0,67.0,97.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,12.0,651.0,49.0,118.0,487.0,666.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,14.0,4019.0,250.75,345.75,2391.75,3854.5,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
max,0.0,1.0,1.0,1.0,15.0,829961.0,5483319.0,79636.0,5044.0,451377.0,...,1.0,11.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,62666.0


In [50]:
# Retweet-count bin edges and the label for each interval between them.
# Four labels for five edges: (0, 1000], (1000, 5000], (5000, 10000],
# (10000, 100000] under pd.cut's default right-closed intervals.
bins = [0, 1_000, 5_000, 10_000, 100_000]
labs = [
    "Ordinary",
    "Getting Attention",
    "Popular",
    "Trending",
]

In [51]:
# Bucket each tweet's retweet count into an ordered attention category.
# NOTE: pd.cut uses right-closed intervals and does not include the lowest
# edge by default, so a retweet count of exactly 0 falls outside (0, 1000]
# and is labelled NaN rather than "Ordinary".
tweetsData["Attention_gathered"] = pd.cut(
    x=tweetsData["retweets"],
    bins=bins,
    labels=labs,
)
# Show the distinct categories that actually occur (NaN included).
tweetsData["Attention_gathered"].unique()

[NaN, Ordinary, Getting Attention, Popular, Trending]
Categories (4, object): [Ordinary < Getting Attention < Popular < Trending]

In [52]:
# Same distinct-category check as at the end of the previous cell, written
# with bracket access instead of attribute access.
tweetsData['Attention_gathered'].unique()

[NaN, Ordinary, Getting Attention, Popular, Trending]
Categories (4, object): [Ordinary < Getting Attention < Popular < Trending]

In [53]:
# Split the data into one sub-frame per attention category, keyed by the
# category label (e.g. df["Trending"]). Rows whose category is NaN belong to
# no group and are therefore not present in this mapping.
df = {
    category: group
    for category, group in tweetsData.groupby('Attention_gathered')
}

In [59]:
# Show the tweets in the "Trending" category. Greta Thunberg's tweets seem to
# be getting the attention of a lot of people.
df['Trending']

Unnamed: 0,tweet_id,tweet_created_at,tweet_created_on_holiday_bool,tweet_created_on_weekend_bool,tweet_created_at_noon_bool,tweet_created_at_eve_bool,user_id,user_screen_name,user_screen_name_length,user_no_of_tweets,...,tweet_text_sentiment,tweet_text_contains_media,tweet_text_contains_number,tweet_text_contains_upper_words,tweet_text_contains_lower_words,tweet_text_contains_excl,tweet_text_contains_retweet_suggestion,retweeted,retweets,Attention_gathered
7444,~1357282507616645122~,Thu Feb 04 10:59:01 +0000 2021,0,0,0,0,~1006419421244678144~,GretaThunberg,13,8876,...,neg,0,1,1,1,0,1,False,62666,Trending
10095,~1357054451769606147~,Wed Feb 03 19:52:48 +0000 2021,0,0,0,1,~1006419421244678144~,GretaThunberg,13,8876,...,neg,0,1,0,1,0,0,False,13986,Trending


Since Greta is getting a lot of attention, let's see what her tweets and hashtags are about.

In [60]:
# Sentiment label of each trending tweet.
df['Trending']['tweet_text_sentiment']

7444     neg
10095    neg
Name: tweet_text_sentiment, dtype: object

In [61]:
# Text of each trending tweet.
df['Trending']['tweet_text']

7444     I still #StandWithFarmers and support their pe...
10095    Here’s an updated toolkit by people on the gro...
Name: tweet_text, dtype: object

So her tweets are about the farmers' protest going on in India.

In [63]:
# Flag (1/0) telling whether each trending tweet contains a URL.
df['Trending']['tweet_text_contains_url']

7444     1
10095    1
Name: tweet_text_contains_url, dtype: int64

In [34]:
# Show the tweets in the "Popular" category.
df["Popular"]

Unnamed: 0,tweet_id,tweet_created_at,tweet_created_on_holiday_bool,tweet_created_on_weekend_bool,tweet_created_at_noon_bool,tweet_created_at_eve_bool,user_id,user_screen_name,user_screen_name_length,user_no_of_tweets,...,tweet_text_sentiment,tweet_text_contains_media,tweet_text_contains_number,tweet_text_contains_upper_words,tweet_text_contains_lower_words,tweet_text_contains_excl,tweet_text_contains_retweet_suggestion,retweeted,retweets,Attention_gathered
1757,~1357531309745209349~,Fri Feb 05 03:27:40 +0000 2021,0,0,0,0,~2314959096~,boxervijender,13,7528,...,neg,0,0,0,1,0,0,False,6388,Popular
4282,~1357373229044625409~,Thu Feb 04 16:59:31 +0000 2021,0,0,0,0,~574795929~,meenaharris,11,9754,...,neg,0,1,1,1,0,1,False,8280,Popular
6448,~1357301622637006849~,Thu Feb 04 12:14:58 +0000 2021,0,0,1,0,~1134059457191776257~,vanessa_vash,12,12799,...,neg,0,0,1,1,0,1,False,5598,Popular
10914,~1356997985662558213~,Wed Feb 03 16:08:26 +0000 2021,0,0,0,0,~810585618031984640~,Jamie_Margolin,14,21112,...,nue,0,1,0,1,1,0,False,7277,Popular


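So the tweets that are trending and popular are the ones about the farmers' protest. (The reverse does not hold: most tweets in the data are neither trending nor popular.)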

In [41]:
# Number of tweets in each attention category (NaN rows are not counted by
# value_counts by default).
tweetsData['Attention_gathered'].value_counts()

Ordinary             9848
Getting Attention      13
Popular                 4
Trending                2
Name: Attention_gathered, dtype: int64

In [57]:
# Number of tweets that received no attention category (NaN), i.e. whose
# retweet count fell outside every bin.
tweetsData['Attention_gathered'].isna().sum()

15827

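So basically 15827 tweets were not retweeted: a retweet count of 0 lies outside the first bin (0, 1000], so those rows get NaN in Attention_gathered, and this count matches the number of rows with retweets == 0 found earlier. Those accounts may have fewer followers.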