In [1]:
import pandas as pd

In [3]:
# The dump stores one JSON record per line, hence lines=True.
tweets_df = pd.read_json(
    'data/tweets_DM.json',
    lines=True,
)

In [4]:
# Summarise the columns, dtypes and memory usage of the raw dump.
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1867535 entries, 0 to 1867534
Data columns (total 5 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   _score      int64 
 1   _index      object
 2   _source     object
 3   _crawldate  object
 4   _type       object
dtypes: int64(1), object(4)
memory usage: 71.2+ MB


In [5]:
# Preview the first raw records; the tweet payload is nested inside `_source`.
tweets_df.head()

Unnamed: 0,_score,_index,_source,_crawldate,_type
0,391,hashtag_tweets,"{'tweet': {'hashtags': ['Snapchat'], 'tweet_id...",2015-05-23 11:42:47,tweets
1,433,hashtag_tweets,"{'tweet': {'hashtags': ['freepress', 'TrumpLeg...",2016-01-28 04:52:09,tweets
2,232,hashtag_tweets,"{'tweet': {'hashtags': ['bibleverse'], 'tweet_...",2017-12-25 04:39:20,tweets
3,376,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...",2016-01-24 23:53:05,tweets
4,989,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2de2...",2016-01-08 17:18:59,tweets


In [6]:
# Look at one nested payload to see which keys need to be flattened.
tweets_df['_source'][0]

{'tweet': {'hashtags': ['Snapchat'],
  'tweet_id': '0x376b20',
  'text': 'People who post "add me on #Snapchat" must be dehydrated. Cuz man.... that\'s <LH>'}}

In [7]:
def convert_to_tweets_df(source_col):
    """Flatten nested tweet payloads into a three-column DataFrame.

    Every element of ``source_col`` must be a mapping with a ``'tweet'`` entry
    that itself holds ``'hashtags'``, ``'tweet_id'`` and ``'text'``; a missing
    key raises ``KeyError``.

    Returns:
        A DataFrame with the columns ``hashtag``, ``tweet_id`` and ``text``,
        one row per element of ``source_col``, in input order.
    """
    # Unwrap the 'tweet' level first, then pick the three fields per record.
    rows = [
        (tweet['hashtags'], tweet['tweet_id'], tweet['text'])
        for tweet in (entry['tweet'] for entry in source_col)
    ]
    return pd.DataFrame(rows, columns=['hashtag', 'tweet_id', 'text'])

In [8]:
# Flatten the nested `_source` payload into hashtag / tweet_id / text columns.
# The name `tweets_df` is rebound from the raw dump to the flattened frame, so
# the guard keeps a re-run of this cell from failing once `_source` is gone.
if '_source' in tweets_df.columns:
    tweets_df = convert_to_tweets_df(tweets_df['_source'])

In [9]:
# Confirm the flattened frame exposes the hashtag / tweet_id / text columns.
tweets_df.head()

Unnamed: 0,hashtag,tweet_id,text
0,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ..."
1,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #..."
2,[bibleverse],0x28b412,"Confident of your obedience, I write to you, k..."
3,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>
4,[],0x2de201,"""Trust is not the same as faith. A friend is s..."


In [10]:
# separate train and test data

In [7]:
# Table that assigns each tweet_id to a split (see the `identification` column).
identity_df = pd.read_csv(
    'data/data_identification.csv',
)

In [12]:
# Preview the tweet_id / identification pairs.
identity_df.head()

Unnamed: 0,tweet_id,identification
0,0x28cc61,test
1,0x29e452,train
2,0x2b3819,train
3,0x2db41f,test
4,0x2a2acc,train


In [4]:
def create_identity_dict(id_col, class_col):
    """Build a lookup that maps each id to the value at the same position.

    The two columns are paired by position, not by index label, so pandas
    Series with any index (for example the result of a filter) are handled
    correctly.

    Args:
        id_col: Sized iterable of hashable keys, e.g. a Series of tweet ids.
        class_col: Sized iterable of values, paired with ``id_col`` by
            position. Entries beyond ``len(id_col)`` are ignored.

    Returns:
        dict: ``{id: value}``. When an id occurs more than once, the value
        from its last occurrence is kept.

    Raises:
        ValueError: If ``class_col`` has fewer entries than ``id_col``.
    """
    if len(class_col) < len(id_col):
        raise ValueError(
            'class_col has %d entries but id_col has %d'
            % (len(class_col), len(id_col))
        )
    # zip iterates the values directly, which avoids label-based `col[i]`
    # lookups and the per-element indexing overhead on large Series.
    return dict(zip(id_col, class_col))

In [8]:
# Lookup table: tweet_id -> value of the `identification` column.
identity_dict = create_identity_dict(
    identity_df['tweet_id'],
    identity_df['identification'],
)

In [None]:
# Look up the split flag of every tweet, in row order; a tweet_id missing
# from identity_dict raises KeyError.
new_train_test_id_col = [identity_dict[tid] for tid in tweets_df['tweet_id']]

In [16]:
# Attach the flags positionally: the list was built by walking
# tweets_df.tweet_id in order, so entry i belongs to row i.
tweets_df['train_or_test'] = new_train_test_id_col

In [17]:
# Check that the train_or_test column was attached.
tweets_df.head()

Unnamed: 0,hashtag,tweet_id,text,train_or_test
0,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ...",train
1,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train
2,[bibleverse],0x28b412,"Confident of your obedience, I write to you, k...",test
3,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train
4,[],0x2de201,"""Trust is not the same as faith. A friend is s...",test


In [18]:
# Add Emotion information

In [19]:
# Keep only the rows flagged as training data. `.copy()` makes train_df an
# independent frame, so adding the `class` column later is not treated as a
# write into a slice of tweets_df (SettingWithCopyWarning on pandas versions
# without copy-on-write).
train_df = tweets_df.loc[tweets_df.train_or_test == 'train'].copy()

In [20]:
# Number of rows in the training split.
len(train_df)

1455563

In [2]:
# Emotion labels keyed by tweet_id (columns used below: tweet_id, emotion).
emotion_df = pd.read_csv(
    'data/emotion.csv',
)

In [5]:
# Lookup table: tweet_id -> emotion label (the generic pairing helper is reused).
emotion_dict = create_identity_dict(
    emotion_df['tweet_id'],
    emotion_df['emotion'],
)

In [17]:
# Spot-check the label stored for a single tweet id.
emotion_dict['0x2d5350']

'sadness'

In [19]:
# Collect the emotion label of every training tweet, in row order; a tweet_id
# missing from emotion_dict raises KeyError.
new_emotion_col = [emotion_dict[tid] for tid in train_df['tweet_id']]

In [20]:
# Peek at the first three collected labels.
new_emotion_col[:3]

['anticipation', 'sadness', 'fear']

In [11]:
# NOTE(review): this replaces the train_df filtered from tweets_df earlier in
# the notebook with a previously saved copy, and it needs
# data/tweets_train.pkl to exist already. That file is written by the
# to_pickle cell at the end of this notebook, so a fresh top-to-bottom run may
# fail here. Confirm whether this cell should stay.
train_df = pd.read_pickle('data/tweets_train.pkl')

In [12]:
# Inspect the reloaded training frame, including its stored `class` column.
train_df.head()

Unnamed: 0,hashtag,tweet_id,text,train_or_test,class
0,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ...",train,sadness
1,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train,disgust
3,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train,anticipation
5,"[authentic, LaughOutLoud]",0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,train,joy
6,[],0x2c91a8,Still waiting on those supplies Liscus. <LH>,train,anticipation


In [21]:
# Attach the emotion labels positionally: new_emotion_col must hold exactly
# one label per row of train_df, in the same row order.
train_df['class'] = new_emotion_col

In [22]:
# Verify the labels now stored in the `class` column.
train_df.head()

Unnamed: 0,hashtag,tweet_id,text,train_or_test,class
0,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ...",train,anticipation
1,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train,sadness
3,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train,fear
5,"[authentic, LaughOutLoud]",0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,train,joy
6,[],0x2c91a8,Still waiting on those supplies Liscus. <LH>,train,anticipation


In [26]:
# Re-check the combined frame before carving out the test split.
tweets_df.head()

Unnamed: 0,hashtag,tweet_id,text,train_or_test
0,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ...",train
1,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train
2,[bibleverse],0x28b412,"Confident of your obedience, I write to you, k...",test
3,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train
4,[],0x2de201,"""Trust is not the same as faith. A friend is s...",test


In [27]:
# Select the rows whose train_or_test flag is 'test'.
is_test = tweets_df['train_or_test'] == 'test'
test_df = tweets_df.loc[is_test]

In [23]:
# Persist the training split, the test split and the combined frame, in that order.
for frame, pickle_path in (
    (train_df, 'data/tweets_train.pkl'),
    (test_df, 'data/tweets_test.pkl'),
    (tweets_df, 'data/all_tweets.pkl'),
):
    frame.to_pickle(pickle_path)