In [1]:
import json
import pandas as pd
import numpy as np
import itertools as it

#### Users Extract

In [2]:
# load data
users = pd.read_csv("../data/stocktwits_users.csv")

In [3]:
# check out number of records
len(users)

5000

In [4]:
# exampine first few rows
users.head()

Unnamed: 0,id,username,experience,approach,holding_period,assets_traded
0,1,trustingPonie5,Novice,Growth,Position Trader,Equities
1,2,dreadfulZebra2,Novice,Technical,Position Trader,"Bonds,Futures"
2,3,annoyedOwl0,Intermediate,Fundamental,Position Trader,"Forex,Bonds,Equities"
3,4,abjectVenison8,Experienced,Growth,Position Trader,"Futures,Equities,Options"
4,5,decimalGatorade0,Intermediate,Technical,Position Trader,Private Companies


In [5]:
# check data types
users.dtypes

id                 int64
username          object
experience        object
approach          object
holding_period    object
assets_traded     object
dtype: object

In [6]:
# check if any columns have null values
np.sum(users.isnull(), axis=0)

id                0
username          0
experience        0
approach          0
holding_period    0
assets_traded     0
dtype: int64

In [7]:
# check distinct experience values
users.experience.unique()

array(['Novice', 'Intermediate', 'Experienced'], dtype=object)

In [8]:
# check distinct approach values
users.approach.unique()

array(['Growth', 'Technical', 'Fundamental', 'Value', 'Global Macro',
       'Momentum'], dtype=object)

In [9]:
# check dinstinct holding_period values
users.holding_period.unique()

array(['Position Trader', 'Long Term Investor', 'Day Trader',
       'Swing Trader'], dtype=object)

In [10]:
# check unique asset_traded values
users.assets_traded.unique()

array(['Equities', 'Bonds,Futures', 'Forex,Bonds,Equities',
       'Futures,Equities,Options', 'Private Companies',
       'Futures,Bonds,Options', 'Bonds,Private Companies,Options',
       'Futures', 'Futures,Private Companies,Equities', 'Forex,Bonds',
       'Forex,Futures', 'Private Companies,Bonds,Futures',
       'Forex,Private Companies,Bonds', 'Forex', 'Equities,Futures,Forex',
       'Equities,Forex,Private Companies',
       'Private Companies,Bonds,Equities', 'Futures,Options', 'Bonds',
       'Futures,Bonds,Equities', 'Bonds,Forex,Futures', 'Forex,Options',
       'Equities,Futures,Options', 'Equities,Private Companies',
       'Bonds,Futures,Forex', 'Options,Equities', 'Options',
       'Private Companies,Bonds', 'Private Companies,Equities',
       'Options,Forex,Futures', 'Equities,Options',
       'Forex,Private Companies', 'Private Companies,Options',
       'Forex,Equities,Bonds', 'Equities,Options,Bonds',
       'Options,Equities,Futures', 'Equities,Bonds',
       'Op

In [11]:
# We will probably want to split these out into unique indicator columns
set([asset for asset in it.chain.from_iterable([assets.split(",") for assets in users.assets_traded.unique()])])

{'Bonds', 'Equities', 'Forex', 'Futures', 'Options', 'Private Companies'}

#### Rooms Extract

In [12]:
# load data
rooms = pd.read_csv("../data/stocktwits_rooms.csv")

In [13]:
# check out number of rooms
len(rooms)

1000

In [14]:
# exampine first few rows
rooms.head()

Unnamed: 0,id,slug,topics
0,1,Help,Non-Market Talk
1,2,Stocktwits,Long Term Investing
2,3,AskJustin,Non-Market Talk
3,4,MarketConspiracies,Education
4,5,quantnews,Quant


In [15]:
# check data types
users.dtypes

id                 int64
username          object
experience        object
approach          object
holding_period    object
assets_traded     object
dtype: object

In [16]:
# check if any columns have null values
np.sum(rooms.isnull(), axis=0)

id        0
slug      0
topics    0
dtype: int64

In [17]:
# check distinct topic values
rooms.topics.unique()

array(['Non-Market Talk', 'Long Term Investing', 'Education', 'Quant',
       'Long Term Investing,Non-Market Talk,Green Energy',
       'Technology,Fundamentals', 'Education,Quant,Non-Market Talk',
       'Day Trading,Swing Trading,Education',
       'Swing Trading,Momentum Trading',
       'Day Trading,Swing Trading,Technicals', 'Cryptocurrencies',
       'Swing Trading,Biotechnology,Technicals',
       'Day Trading,Momentum Trading,Education',
       'Futures,Cryptocurrencies,Day Trading',
       'Long Term Investing,Fundamentals,Personal Finance', 'Technology',
       'Education,Technicals', 'ETFs',
       'Day Trading,Swing Trading,Momentum Trading',
       'Options,Swing Trading,Technicals', 'Swing Trading,Education',
       'Cryptocurrencies,Technology,Technicals', 'Swing Trading',
       'Day Trading,Momentum Trading,Non-Market Talk',
       'Day Trading,Long Term Investing,Technology', 'Day Trading',
       'Futures,Options,Technicals',
       'Long Term Investing,Momentum Tra

In [18]:
# We will probably want to split these out into unique indicator columns
set([topic for topic in it.chain.from_iterable([topics.split(",") for topics in rooms.topics.unique()])])

{'Biotechnology',
 'Cryptocurrencies',
 'Day Trading',
 'ETFs',
 'Education',
 'Energy',
 'Equities',
 'Forex',
 'Fundamentals',
 'Futures',
 'Global Macro',
 'Green Energy',
 'Long Term Investing',
 'Momentum Trading',
 'News',
 'Non-Market Talk',
 'Options',
 'Personal Finance',
 'Precious Metals',
 'Quant',
 'Swing Trading',
 'Technicals',
 'Technology'}

#### Activity Data

In [19]:
# check out number of records
i = 0
with open("../data/activity.json", "r") as f:
    for line in f.readlines():
        i += 1
        
print(i)

1209379


In [20]:
## Print first 50 records from activity file
i = 0
with open("../data/activity.json", "r") as f:
    for line in f.readlines():
        i += 1
        print(json.loads(line))
        if i > 50:
            break

{'action': 'like', 'user_id': 4768, 'message_id': 37304}
{'action': 'like', 'user_id': 2970, 'message_id': 19257}
{'action': 'subscribe', 'user_id': 4751, 'room_id': 487}
{'action': 'like', 'user_id': 1285, 'message_id': 47238}
{'action': 'like', 'user_id': 4699, 'message_id': 22290}
{'action': 'like', 'user_id': 2984, 'message_id': 14710}
{'action': 'like', 'user_id': 3621, 'message_id': 28280}
{'action': 'like', 'user_id': 1114, 'message_id': 40779}
{'action': 'message', 'message_id': 42235, 'user_id': 386, 'room_id': 833, 'mention_ids': ''}
{'action': 'follow', 'user_id': 1531, 'following_user_id': 2744}
{'action': 'follow', 'user_id': 1447, 'following_user_id': 3656}
{'action': 'follow', 'user_id': 4051, 'following_user_id': 2529}
{'action': 'like', 'user_id': 1427, 'message_id': 32042}
{'action': 'follow', 'user_id': 1338, 'following_user_id': 3670}
{'action': 'subscribe', 'user_id': 2990, 'room_id': 828}
{'action': 'follow', 'user_id': 448, 'following_user_id': 2859}
{'action': '

In [None]:
# all activities have an action indicated a follow, subscribe, message or like.
# all activites contain the user_id field with the user id of the user who performed the action
# the rest of the schema depends on the type of action:
# follow has a following_user_id with the id of the user followed
# subscribe has a room_id with the id of the room subscribed to
# message has a room_id with the room posted in and the mention_ids of a comma separated list of users mentioned
# like has a message_id with the id of the message that was liked