In [1]:
## required imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('ggplot')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the main Twitter export (path is relative to the notebook's working directory).
df_1 = pd.read_csv(filepath_or_buffer='Twitter_5_year.csv')

In [3]:
# Load the supplementary export for the additional artists.
df_other_artists = pd.read_csv(filepath_or_buffer='SZA_Chainsmoker_Twitter.csv')

In [4]:
# Stack the two exports row-wise. Each CSV is read with its own 0..n index, so
# ignore_index=True rebuilds a unique 0..N-1 index instead of keeping duplicate labels.
df = pd.concat([df_1, df_other_artists], ignore_index=True)

In [5]:
# Shape of the combined frame as (rows, columns).
df.shape 

(316205, 15)

In [6]:
# List the column labels of the combined frame.
df.columns

Index(['Account', 'User Name', 'Followers at Posting', 'Created', 'Type',
       'Likes', 'Retweets', 'URL', 'Message', 'Screen Name', 'Link 1',
       'Final Link 1', 'Link 2', 'Final Link 2', 'Score'],
      dtype='object')

In [7]:
# Count the distinct values in each column.
df.nunique()

Account                    130
User Name                  130
Followers at Posting     88200
Created                 314625
Type                         6
Likes                    24863
Retweets                 13524
URL                     316205
Message                 311869
Screen Name                144
Link 1                  232905
Final Link 1            230680
Link 2                   32549
Final Link 2             32166
Score                     7156
dtype: int64

In [8]:
# Frequency of each post type in the combined frame.
df['Type'].value_counts()

Link            150041
Tweet            65643
Photo            64786
Native Video     24933
Video            10745
Vine                57
Name: Type, dtype: int64

In [9]:
# Non-null count per column; a count below the row total means the column has missing values.
df.count()

Account                 316205
User Name               316205
Followers at Posting    305308
Created                 316205
Type                    316205
Likes                   316205
Retweets                316205
URL                     316205
Message                 316205
Screen Name             310605
Link 1                  250582
Final Link 1            250582
Link 2                   40136
Final Link 2             40136
Score                   316205
dtype: int64

In [10]:
# Number of rows with no follower count. The vectorised .sum() avoids iterating
# the boolean Series element by element with Python's built-in sum().
df['Followers at Posting'].isna().sum()

10897

In [11]:
# Keep only the columns that feed the feature engineering below.
df_selected = df.loc[:, [
    'User Name',
    'Followers at Posting',
    'Created',
    'Type',
    'Likes',
    'Retweets',
    'Message',
    'Score',
]]
df_selected.head()

Unnamed: 0,User Name,Followers at Posting,Created,Type,Likes,Retweets,Message,Score
0,therealjuicyj,2645413.0,2019-06-13 23:49:25 EDT,Tweet,864,190,Whomever made bets made good money tonight,-3.59
1,lukecombs,336828.0,2019-06-13 23:18:23 EDT,Tweet,3859,102,.@nicohocking is bae,1.14
2,adamlambert,2477885.0,2019-06-13 23:03:07 EDT,Link,832,343,Korea! Tickets are on sale NOW for Queen + Ada...,-1.33
3,russdiemon,2043124.0,2019-06-13 22:56:16 EDT,Tweet,986,31,What questions y’all want @KidSuper to ask me?...,-9.22
4,gerardoortiznet,3082321.0,2019-06-13 22:43:58 EDT,Photo,727,76,🚘😎😜 https://t.co/4yeXAIbwA7,1.89


In [12]:
# Non-null counts for the selected columns, before incomplete rows are dropped.
df_selected.count()

User Name               316205
Followers at Posting    305308
Created                 316205
Type                    316205
Likes                   316205
Retweets                316205
Message                 316205
Score                   316205
dtype: int64

In [13]:
# Discard rows with any missing value, then renumber the index from zero.
df_selected = df_selected.dropna().reset_index(drop=True)

In [14]:
# Re-check non-null counts after dropping rows with missing values.
df_selected.count()

User Name               305308
Followers at Posting    305308
Created                 305308
Type                    305308
Likes                   305308
Retweets                305308
Message                 305308
Score                   305308
dtype: int64

In [15]:
# Parse the "Created" strings into pandas datetimes.
# NOTE(review): sample rows carry a trailing zone abbreviation (e.g. "EDT");
# confirm how the parser treats it.
df_selected['Created'] = df_selected['Created'].pipe(pd.to_datetime)

In [16]:
# Custom feature: character count of each message.
df_selected['message_length'] = df_selected['Message'].map(len)

In [17]:
# Custom feature: number of '#' characters in each message.
# Uses the vectorised string accessor on the single column instead of a row-wise
# DataFrame.apply, which builds a Series object for every row. The pattern is
# treated as a regular expression; '#' has no special meaning there.
df_selected['hashtag_count'] = df_selected['Message'].str.count('#')

In [18]:
# Inspect post-type frequencies before the video-like categories are combined.
df_selected['Type'].value_counts()

Link            145209
Photo            63159
Tweet            62374
Native Video     24481
Video            10041
Vine                44
Name: Type, dtype: int64

In [19]:
# Relabel the Vine and Native Video categories as the generic 'Video' type.
df_selected.loc[df_selected['Type'].isin(['Vine', 'Native Video']), 'Type'] = 'Video'

In [20]:
# Confirm the post-type frequencies after Vine and Native Video were relabelled as Video.
df_selected['Type'].value_counts()

Link     145209
Photo     63159
Tweet     62374
Video     34566
Name: Type, dtype: int64

In [21]:
# One-hot encode the post type and append the indicator columns (prefixed 'type_').
df_selected = pd.concat(
    objs=[
        df_selected,
        pd.get_dummies(data=df_selected['Type'], prefix='type'),
    ],
    axis='columns',
)

In [22]:
def time_of_day_creation(x):
    '''
    Map an hour of the day to one of four six-hour time-slot labels.

    Parameters
    ----------
    x : int
        Hour of the day. Whole-number values 0-23 are recognised (a float such
        as 5.0 is treated like the integer 5).

    Returns
    -------
    str
        One of '12am-6am', '6am-12pm', '12pm-6pm', '6pm-12am', or the literal
        string 'NaN' when x is not a whole-number hour in the range 0-23
        (e.g. 24, -1, 5.5, float('nan') or None).
    '''

    slots_out = ['12am-6am', '6am-12pm', '12pm-6pm', '6pm-12am']

    # Plain integer arithmetic replaces building four numpy arrays on every
    # call; this function is applied once per row of the frame.
    try:
        hour = int(x)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinities and non-numeric values have no slot.
        return 'NaN'

    # Reject fractional hours (int() truncated them) and out-of-range hours.
    if hour != x or not 0 <= hour < 24:
        return 'NaN'

    # Each slot spans six hours, so floor division gives the slot index.
    return slots_out[hour // 6]

In [23]:
# Bucket each post's creation hour into a six-hour slot
# (12am-6am, 6am-12pm, 12pm-6pm, 6pm-12am).
df_selected['time_of_day'] = df_selected['Created'].map(
    lambda created_at: time_of_day_creation(created_at.hour)
)

In [24]:
# One-hot encode the time-of-day bucket and append the indicator columns (prefixed 'tod_').
df_selected = pd.concat(
    objs=[
        df_selected,
        pd.get_dummies(data=df_selected['time_of_day'], prefix='tod'),
    ],
    axis='columns',
)

In [25]:
# Order posts chronologically within each account; the per-user diff below relies on this order.
df_selected = df_selected.sort_values(by=['User Name', 'Created'])

In [26]:
# Elapsed time since the same account's previous post (NaT for an account's first post).
df_selected['time_since_last_post'] = (
    df_selected['Created']
    .groupby(df_selected['User Name'])
    .diff()
)

In [27]:
# Placeholder flag initialised to 0 for every row; no cell shown in this notebook fills it in.
df_selected = df_selected.assign(within_week_release=0)

In [28]:
# Placeholder flag initialised to 0 for every row; no cell shown in this notebook fills it in.
df_selected = df_selected.assign(within_month_release=0)

In [29]:
# Engagement per follower: (likes + retweets) / followers at posting time.
# Computed column-wise rather than with a row-wise apply, which calls a Python
# lambda once per row.
# NOTE: with this form a follower count of 0 produces inf/NaN, not an exception.
df_selected['social_engagement_score'] = (
    (df_selected['Likes'] + df_selected['Retweets'])
    / df_selected['Followers at Posting']
)

In [30]:
# Summary statistics of the engagement score.
df_selected['social_engagement_score'].describe()

count    305308.000000
mean          0.002229
std           0.009328
min           0.000000
25%           0.000067
50%           0.000275
75%           0.001302
max           1.488100
Name: social_engagement_score, dtype: float64

In [31]:
# Remove the raw columns that have been replaced by derived features.
df_selected = df_selected.drop(
    columns=['Type', 'Likes', 'Retweets', 'time_of_day']
)

In [32]:
# Renumber rows from zero so the index follows the sorted order.
df_selected = df_selected.reset_index(drop=True)

In [33]:
# Preview the first rows of the engineered feature frame.
df_selected.head()

Unnamed: 0,User Name,Followers at Posting,Created,Message,Score,message_length,hashtag_count,type_Link,type_Photo,type_Tweet,type_Video,tod_12am-6am,tod_12pm-6pm,tod_6am-12pm,tod_6pm-12am,time_since_last_post,within_week_release,within_month_release,social_engagement_score
0,1future,1668274.0,2015-09-11 19:24:44,Download 'March Madness' on iTunes NOW!!!! htt...,-3.3,70,1,1,0,0,0,0,0,0,1,NaT,0,0,0.000225
1,1future,1668274.0,2015-09-11 21:42:00,Naw frfr... https://t.co/Dmav5zl5N4,-3.88,35,0,1,0,0,0,0,0,0,1,02:17:16,0,0,0.000191
2,1future,1668274.0,2015-09-11 22:33:50,Stream #DS2 on @Spotify NOW!!!!! http://t.co/V...,-3.14,69,2,1,0,0,0,0,0,0,1,00:51:50,0,0,0.000236
3,1future,1668274.0,2015-09-12 01:14:14,R/p @theshootrr https://t.co/emi7dj17hH,-7.37,39,0,1,0,0,0,1,0,0,0,02:40:24,0,0,0.000101
4,1future,1668274.0,2015-09-12 03:29:25,Took the stage in Toronto & shit will never be...,1.17,124,1,0,0,1,0,1,0,0,0,02:15:11,0,0,0.003585


In [34]:
# Column labels prior to renaming.
df_selected.columns

Index(['User Name', 'Followers at Posting', 'Created', 'Message', 'Score',
       'message_length', 'hashtag_count', 'type_Link', 'type_Photo',
       'type_Tweet', 'type_Video', 'tod_12am-6am', 'tod_12pm-6pm',
       'tod_6am-12pm', 'tod_6pm-12am', 'time_since_last_post',
       'within_week_release', 'within_month_release',
       'social_engagement_score'],
      dtype='object')

In [35]:
# Rename columns; labels not listed here are left unchanged.
df_selected = df_selected.rename(
    columns={
        # account and post metadata
        "User Name": "Twitter_Handle",
        "Followers at Posting": "count_of_followers",
        "Created": "created",
        "Message": "description",
        "message_length": "description_length",
        "Score": "crowdtangle_score",
        # post-type indicators
        "type_Link": "type_link",
        "type_Photo": "type_photo",
        "type_Tweet": "type_text",
        "type_Video": "type_video",
        # time-of-day indicators
        "tod_12am-6am": "created_12am-6am",
        "tod_6am-12pm": "created_6am-12pm",
        "tod_12pm-6pm": "created_12pm-6pm",
        "tod_6pm-12am": "created_6pm-12am",
    }
)

In [36]:
# Preview the frame after the columns were renamed.
df_selected.head()

Unnamed: 0,Twitter_Handle,count_of_followers,created,description,crowdtangle_score,description_length,hashtag_count,type_link,type_photo,type_text,type_video,created_12am-6am,created_12pm-6pm,created_6am-12pm,created_6pm-12am,time_since_last_post,within_week_release,within_month_release,social_engagement_score
0,1future,1668274.0,2015-09-11 19:24:44,Download 'March Madness' on iTunes NOW!!!! htt...,-3.3,70,1,1,0,0,0,0,0,0,1,NaT,0,0,0.000225
1,1future,1668274.0,2015-09-11 21:42:00,Naw frfr... https://t.co/Dmav5zl5N4,-3.88,35,0,1,0,0,0,0,0,0,1,02:17:16,0,0,0.000191
2,1future,1668274.0,2015-09-11 22:33:50,Stream #DS2 on @Spotify NOW!!!!! http://t.co/V...,-3.14,69,2,1,0,0,0,0,0,0,1,00:51:50,0,0,0.000236
3,1future,1668274.0,2015-09-12 01:14:14,R/p @theshootrr https://t.co/emi7dj17hH,-7.37,39,0,1,0,0,0,1,0,0,0,02:40:24,0,0,0.000101
4,1future,1668274.0,2015-09-12 03:29:25,Took the stage in Toronto & shit will never be...,1.17,124,1,0,0,1,0,1,0,0,0,02:15:11,0,0,0.003585


In [37]:
# Placeholder count columns initialised to 0; no cell shown in this notebook fills them in.
df_selected = df_selected.assign(posts_last_7days=0, posts_last_30days=0)

In [38]:
# List the frame's column labels before export.
# NOTE(review): this cell was previously labelled "Missing dataframe columns";
# the intent of that label is unclear — confirm.
df_selected.columns

Index(['Twitter_Handle', 'count_of_followers', 'created', 'description',
       'crowdtangle_score', 'description_length', 'hashtag_count', 'type_link',
       'type_photo', 'type_text', 'type_video', 'created_12am-6am',
       'created_12pm-6pm', 'created_6am-12pm', 'created_6pm-12am',
       'time_since_last_post', 'within_week_release', 'within_month_release',
       'social_engagement_score', 'posts_last_7days', 'posts_last_30days'],
      dtype='object')

In [39]:
# Write the feature frame to CSV; the row index is written as the first (unnamed) column.
df_selected.to_csv('twitter_features.csv', index=True)