In [1]:
## required imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('ggplot')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw CSV export into a DataFrame; the path is relative to the working directory.
df = pd.read_csv('Twitter_5_year.csv')

In [3]:
# Dimensions of the raw frame as a (rows, columns) tuple
df.shape 

(299998, 15)

In [5]:
# Column labels of the raw frame
df.columns

Index(['Account', 'User Name', 'Followers at Posting', 'Created', 'Type',
       'Likes', 'Retweets', 'URL', 'Message', 'Screen Name', 'Link 1',
       'Final Link 1', 'Link 2', 'Final Link 2', 'Score'],
      dtype='object')

In [6]:
# Number of distinct values in each column
df.nunique()

Account                    128
User Name                  128
Followers at Posting     86577
Created                 298457
Type                         6
Likes                    24456
Retweets                 13291
URL                     299998
Message                 295701
Screen Name                142
Link 1                  225630
Final Link 1            223468
Link 2                   32123
Final Link 2             31749
Score                     6753
dtype: int64

In [7]:
# How often each value of the 'Type' column occurs
df['Type'].value_counts()

Link            146122
Photo            61678
Tweet            56976
Native Video     24707
Video            10486
Vine                29
Name: Type, dtype: int64

In [8]:
# Non-null entries per column; a count below the row total means the column has missing values
df.count()

Account                 299998
User Name               299998
Followers at Posting    290677
Created                 299998
Type                    299998
Likes                   299998
Retweets                299998
URL                     299998
Message                 299998
Screen Name             294460
Link 1                  243040
Final Link 1            243040
Link 2                   39680
Final Link 2             39680
Score                   299998
dtype: int64

In [9]:
# Rows whose follower count is missing (vectorised reduction, shown as a plain int)
int(df['Followers at Posting'].isna().sum())

9321

In [10]:
# Keep only the columns used for feature engineering, then preview the result.
df_selected = df.loc[:, [
    'User Name', 'Followers at Posting', 'Created', 'Type',
    'Likes', 'Retweets', 'Message', 'Score',
]]
df_selected.head()

Unnamed: 0,User Name,Followers at Posting,Created,Type,Likes,Retweets,Message,Score
0,therealjuicyj,2645413.0,2019-06-13 23:49:25 EDT,Tweet,864,190,Whomever made bets made good money tonight,-3.59
1,lukecombs,336828.0,2019-06-13 23:18:23 EDT,Tweet,3859,102,.@nicohocking is bae,1.14
2,adamlambert,2477885.0,2019-06-13 23:03:07 EDT,Link,832,343,Korea! Tickets are on sale NOW for Queen + Ada...,-1.33
3,russdiemon,2043124.0,2019-06-13 22:56:16 EDT,Tweet,986,31,What questions y’all want @KidSuper to ask me?...,-9.22
4,gerardoortiznet,3082321.0,2019-06-13 22:43:58 EDT,Photo,727,76,🚘😎😜 https://t.co/4yeXAIbwA7,1.89


In [11]:
# Non-null entries per selected column
df_selected.count()

User Name               299998
Followers at Posting    290677
Created                 299998
Type                    299998
Likes                   299998
Retweets                299998
Message                 299998
Score                   299998
dtype: int64

In [12]:
# Discard every row that has a missing value, then renumber the index from 0.
df_selected = df_selected.dropna().reset_index(drop=True)

In [13]:
# Re-check the non-null counts per column
df_selected.count()

User Name               290677
Followers at Posting    290677
Created                 290677
Type                    290677
Likes                   290677
Retweets                290677
Message                 290677
Score                   290677
dtype: int64

In [14]:
# Parse the "Created" strings into pandas datetimes.
# NOTE(review): the sample rows printed earlier end in a timezone abbreviation
# (e.g. "EDT"), while the parsed values displayed later are timezone-naive.
# Confirm that discarding the zone is intended.
df_selected['Created'] = pd.to_datetime(df_selected['Created'])

In [16]:
# Custom feature: length of each message as reported by len()
df_selected['message_length'] = df_selected['Message'].map(len)

In [17]:
# Custom feature: number of '#' characters in each message.
# Uses the vectorised string accessor on the one column needed instead of a
# row-wise DataFrame.apply, which builds a Series for every row.
df_selected['hashtag_count'] = df_selected['Message'].str.count('#')

In [18]:
# One-hot encode 'Type' and append the resulting type_* columns to the frame.
df_selected = pd.concat(
    objs=[df_selected, pd.get_dummies(df_selected['Type'], prefix='type')],
    axis='columns',
)

In [20]:
def time_of_day_creation(x):
    '''
    Map an hour of the day to one of four six-hour time-slot labels.

    Parameters
    ----------
    x : number
        Hour of the day. Whole-number values from 0 through 23 are
        recognised (integral floats such as 5.0 count as whole).

    Returns
    -------
    str
        '12am-6am' for 0-5, '6am-12pm' for 6-11, '12pm-6pm' for 12-17,
        '6pm-12am' for 18-23. The literal string 'NaN' is returned for
        anything else (out of range, fractional, NaN or non-numeric).
    '''
    slots_out = ['12am-6am', '6am-12pm', '12pm-6pm', '6pm-12am']

    try:
        # Range test comes first so NaN/inf short-circuit before int() is attempted.
        is_whole_hour = bool(0 <= x < 24 and x == int(x))
    except (TypeError, ValueError):
        # Values that cannot be compared or converted are not hours.
        is_whole_hour = False

    if not is_whole_hour:
        return 'NaN'

    # Each slot spans six consecutive hours, so integer division selects the slot
    # without allocating lookup arrays on every call.
    return slots_out[int(x) // 6]

In [21]:
# Bucket each post's creation hour into a six-hour slot label
# ('12am-6am', '6am-12pm', '12pm-6pm', '6pm-12am').
df_selected['time_of_day'] = df_selected['Created'].dt.hour.map(time_of_day_creation)

In [22]:
# One-hot encode 'time_of_day' and append the resulting tod_* columns to the frame.
df_selected = pd.concat(
    objs=[df_selected, pd.get_dummies(df_selected['time_of_day'], prefix='tod')],
    axis='columns',
)

In [24]:
# Order rows by 'User Name' and then by 'Created' so each user's posts are chronological.
df_selected = df_selected.sort_values(by=['User Name', 'Created'])

In [25]:
# Gap between each post and the preceding row of the same 'User Name'
# (the first row of every user has no predecessor and gets NaT).
df_selected['time_since_last_post'] = (
    df_selected['Created'].groupby(df_selected['User Name']).diff()
)

In [26]:
# Placeholder column: every row is set to 0 and nothing in this section fills it in.
# NOTE(review): presumably meant to flag posts made within a week of a release — TODO confirm and populate.
df_selected['within_week_release'] = 0

In [27]:
# Placeholder column: every row is set to 0 and nothing in this section fills it in.
# NOTE(review): presumably meant to flag posts made within a month of a release — TODO confirm and populate.
df_selected['within_month_release'] = 0

In [28]:
# Engagement per follower: (likes + retweets) divided by the follower count at posting time.
# Computed with column arithmetic rather than a row-wise apply, which builds a
# Series for every row. A follower count of 0 yields inf (or NaN for 0/0) here.
df_selected['social_engagement_score'] = (
    (df_selected['Likes'] + df_selected['Retweets'])
    / df_selected['Followers at Posting']
)

In [30]:
# Summary statistics of the engagement score
df_selected['social_engagement_score'].describe()

count    290677.000000
mean          0.002099
std           0.008438
min           0.000000
25%           0.000063
50%           0.000245
75%           0.001200
max           1.488100
Name: social_engagement_score, dtype: float64

In [31]:
# Remove the raw inputs that have been folded into derived features.
df_selected = df_selected.drop(
    columns=['Followers at Posting', 'Likes', 'Retweets', 'time_of_day'],
)

In [32]:
# Renumber the index from 0, discarding the old labels.
df_selected = df_selected.reset_index(drop=True)

In [33]:
# Preview the first rows of the engineered feature table
df_selected.head()

Unnamed: 0,User Name,Created,Type,Message,Score,message_length,hashtag_count,type_Link,type_Native Video,type_Photo,...,type_Video,type_Vine,tod_12am-6am,tod_12pm-6pm,tod_6am-12pm,tod_6pm-12am,time_since_last_post,within_week_release,within_month_release,social_engagement_score
0,1future,2015-09-11 15:24:44,Link,Download 'March Madness' on iTunes NOW!!!! htt...,-3.3,70,1,1,0,0,...,0,0,0,1,0,0,NaT,0,0,0.000225
1,1future,2015-09-11 17:42:00,Link,Naw frfr... https://t.co/Dmav5zl5N4,-3.88,35,0,1,0,0,...,0,0,0,1,0,0,02:17:16,0,0,0.000191
2,1future,2015-09-11 18:33:50,Link,Stream #DS2 on @Spotify NOW!!!!! http://t.co/V...,-3.14,69,2,1,0,0,...,0,0,0,0,0,1,00:51:50,0,0,0.000236
3,1future,2015-09-11 21:14:14,Link,R/p @theshootrr https://t.co/emi7dj17hH,-7.37,39,0,1,0,0,...,0,0,0,0,0,1,02:40:24,0,0,0.000101
4,1future,2015-09-11 23:29:25,Tweet,Took the stage in Toronto & shit will never be...,1.17,124,1,0,0,0,...,0,0,0,0,0,1,02:15:11,0,0,0.003585


In [35]:
# Write the feature table to CSV in the working directory.
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# pass index=False if downstream readers do not expect it — confirm before changing.
df_selected.to_csv('twitter_5yrs_features.csv')