In [14]:
# -*- coding: utf-8 -*-
"""
data_collection.ipynb
Automatically generated by Colaboratory.
Original file is located at:
    https://colab.research.google.com/drive/1auiMh_kYPUusGn0avpANgkqYQD3mD2Xf
"""

import os
import json

def dw_config(filename):
    '''
    Load the data.world authentication token from a JSON config file and
    export it as the `DW_AUTH_TOKEN` environment variable.

    Args:
        filename: path to the JSON file containing the authentication token

    Return:
        None

    Raises:
        KeyError: if the file contains neither an `auth_token` nor a `token`
            entry.

    Before running this cell, make sure that `filename` has been uploaded into
    Colab, or is at least in the current working directory of this notebook.
    The file's contents should have one of the following formats:

    `
    {"auth_token": <YOUR_TOKEN>}
    {"token": <YOUR_TOKEN>}
    `

    Both key names are accepted; `auth_token` takes precedence when both are
    present. `<YOUR_TOKEN>` is the authentication token string for the Python
    integration with data.world. The token is stored in the `DW_AUTH_TOKEN`
    environment variable, which is the variable the docs for `datadotworld`
    specify for supplying the token to the library.

    For information on obtaining this token and how it is being used in this
    method, see the links below. You will need to create an account and
    generate your own authentication token to run this code. The first link is
    to the integration page. Once you have enabled the integration, go to the
    Manage tab and you will see your authentication token. The second link
    helps explain how the library is configured and how datasets are imported.

    https://data.world/integrations/python
    https://help.data.world/hc/en-us/articles/360039429733-Python-SDK
    '''
    with open(filename) as f:
        config = json.load(f)

    # The file is closed at this point; only the parsed content is needed.
    # Look for the historical key first so existing config files keep working,
    # then fall back to the key name that the documentation advertises.
    for key in ('auth_token', 'token'):
        if key in config:
            os.environ['DW_AUTH_TOKEN'] = config[key]
            return

    raise KeyError(
        "no 'auth_token' or 'token' entry found in config file: "
        + str(filename)
    )

import datadotworld as dw

def get_twitter_dataframes():
    '''
    Download the two Tweet datasets used in this project from data.world and
    return the single dataframe held by each of them.

    Args:
        None

    Return:
        till2017_df: the dataframe for the dataset containing Tweets till 2017
        from2017_df: the dataframe for the dataset containing Tweets from 2017
                     onwards

    The datasets are fetched with `dw.load_dataset()`:

    https://data.world/adamhelsinger/elon-musk-tweets-until-4-6-17 (2010-2017)
    https://data.world/barbaramaseda/elon-musk-tweets (2017 onwards)

    Each loaded dataset exposes its tables through a `dataframes` mapping.
    Because there is only one dataframe in each dataset, the first value of
    that mapping is taken directly, which avoids having to know the key the
    dataframe is stored under.
    '''
    dataset_keys = (
        'adamhelsinger/elon-musk-tweets-until-4-6-17',
        'barbaramaseda/elon-musk-tweets',
    )

    # Fetch both datasets first, then pull the lone dataframe out of each one.
    loaded = [dw.load_dataset(key) for key in dataset_keys]
    till2017_df, from2017_df = [
        list(dataset.dataframes.values())[0] for dataset in loaded
    ]
    return till2017_df, from2017_df

"""
From here, the next step is cleaning the data. `from2017_df` contains the full Tweet URL, and the Tweet ID is at the end of the URL.
`till2017_df` uses these same IDs, so it is possible to extract IDs from the URLs in `from2017_df`.
Once these IDs are extracted, they can be used to index Tweets.
This makes it easier to identify and remove duplicate Tweets when combining the two dataframes into one.
We can also remove Retweets by finding all Tweets with the prefix "RT".
This helps identify which Tweets are not actually authored by Musk specifically.

Other next steps may involve removing formatting characters, retweets, and URLs from the Tweets as they appear.
Mentions of other Twitter users should be left in the message with the "@" character removed so that the sentence still makes sense, and so that those words can be identified as referring to other users.
After this is complete, columns relating to TSLA stock valuations can be added to this merged dataframe.

"""

import numpy as np
import pandas as pd

# Lift pandas' display limits: show every row and never truncate the text of a
# cell when a dataframe is printed or displayed.
for _display_option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

def clean_data(till2017_df_orig, from2017_df_orig, csv_path='tweets.csv'):
    '''
    Merge the two raw Tweet dataframes into one cleaned dataframe.

    Args:
        till2017_df_orig: the original dataframe containing data till 2017;
            the columns `id`, `created_at` and `text` are used
        from2017_df_orig: the original dataframe containing data from 2017;
            the columns `linktotweet`, `createdat` and `text` are used
        csv_path: where the cleaned dataframe is written as CSV. Defaults to
            'tweets.csv'; pass None to skip the export.

    Return:
        tweets_df: the cleaned dataframe, indexed by `date`, with the columns
            `time` and `text`, in chronological order

    Clean the dataframes obtained from `get_twitter_dataframes()`. The inputs
    are copied and never modified, so the originals can be reused if the
    cleaning needs to be redone. The cleaning process is:

      1. Remove the bytes-literal wrapper (a leading b followed by a quoted
         string) from the text of the older dataset. Only the wrapper itself
         is removed, so Tweets that genuinely start or end with the letter b
         or with a quote character keep those characters.
      2. Extract the numeric Tweet ID from the end of each Tweet URL in the
         newer dataset and store it as a 64-bit integer.
      3. Concatenate both datasets and drop Tweets whose ID was already seen
         (the first occurrence, i.e. the one from the older dataset, wins),
         then drop rows that are exact duplicates in timestamp and text.
      4. Split the timestamp into `date` and `time`, order the rows
         chronologically and index them by `date`.
      5. Optionally export the result to CSV, print it and return it.
    '''
    # --- Older dataset: unwrap bytes-literal text, normalise column names ---
    till2017_df = till2017_df_orig.copy()
    # str.strip() takes a *set of characters*, which would also eat real
    # leading/trailing b's and quotes; match the wrapper as a whole instead.
    till2017_df['text'] = till2017_df['text'].str.replace(
        r'(?s)^b(["\'])(.*)\1$', r'\2', regex=True
    )
    till2017_df = till2017_df.rename(columns={'created_at': 'timestamp'})
    till2017_df = till2017_df.reindex(columns=['id', 'timestamp', 'text'])
    till2017_df = till2017_df.set_index('id')

    # --- Newer dataset: pull the ID out of the URL, normalise column names ---
    from2017_df = from2017_df_orig.copy()
    # The Tweet ID is the run of digits at the end of the URL. An explicit
    # 64-bit dtype is used because Tweet IDs do not fit into 32 bits and the
    # width of plain `int` depends on the platform.
    from2017_df['id'] = (
        from2017_df['linktotweet']
        .str.extract(r'(\d+)/?\s*$', expand=False)
        .astype('int64')
    )
    from2017_df = from2017_df.rename(columns={'createdat': 'timestamp'})
    # reindex keeps only the three columns of interest, which also discards
    # `username`, `tweetembedcode` and the original `linktotweet` column.
    from2017_df = from2017_df.reindex(columns=['id', 'timestamp', 'text'])
    from2017_df = from2017_df.set_index('id')

    # --- Combine and de-duplicate ---
    tweets_df = pd.concat([till2017_df, from2017_df])
    # drop_duplicates() only compares columns and ignores the index, so
    # Tweets present in both datasets have to be removed via the ID index.
    tweets_df = tweets_df[~tweets_df.index.duplicated(keep='first')]
    tweets_df = tweets_df.drop_duplicates()

    # --- Separate timestamp into date and time, so stock data can be found ---
    timestamps = pd.to_datetime(tweets_df['timestamp'])
    tweets_df = tweets_df.assign(
        date=timestamps.dt.date,
        time=timestamps.dt.time,
    )
    # Order by the full timestamp (stable sort) so that Tweets posted on the
    # same day stay in chronological order. IDs are unique at this point, so
    # selecting by label is unambiguous.
    tweets_df = tweets_df.loc[timestamps.sort_values(kind='mergesort').index]

    # --- Drop id and timestamp, index by date ---
    tweets_df = tweets_df.reset_index(drop=True)
    tweets_df = tweets_df.reindex(columns=['date', 'time', 'text'])
    tweets_df = tweets_df.set_index('date')

    # TODO: add further text processing on the `text` column, for example:
    #   - drop retweets:
    #       tweets_df = tweets_df[~tweets_df['text'].str.startswith('RT')]
    #   - handle mentions of other users (note that the filter below removes
    #     whole Tweets containing "@" rather than just the mention):
    #       tweets_df = tweets_df[~tweets_df['text'].str.contains('@', na=False)]

    # Export tweets to CSV
    if csv_path is not None:
        tweets_df.to_csv(csv_path)
    print(tweets_df)
    return tweets_df

In [11]:
# Install datadotworld using the interpreter that runs this kernel
# (sys.executable), so that the package is importable from this notebook.
# NOTE(review): the cell containing `import datadotworld as dw` appears earlier
# in the notebook, so on a fresh kernel this install has to be run before it.
import sys
!{sys.executable} -m pip install datadotworld

Collecting datadotworld
  Downloading datadotworld-1.7.0-py2.py3-none-any.whl (158 kB)
[K     |████████████████████████████████| 158 kB 1.2 MB/s eta 0:00:01
[?25hCollecting configparser<4.0a,>=3.5.0
  Downloading configparser-3.8.1-py2.py3-none-any.whl (22 kB)
Collecting tabulator>=1.22.0
  Downloading tabulator-1.52.5-py2.py3-none-any.whl (71 kB)
[K     |████████████████████████████████| 71 kB 946 kB/s eta 0:00:01
Collecting datapackage<2.0a,>=1.6.2
  Downloading datapackage-1.15.1-py2.py3-none-any.whl (85 kB)
[K     |████████████████████████████████| 85 kB 753 kB/s eta 0:00:01:01
Collecting tableschema<2.0a,>=1.5.2
  Downloading tableschema-1.20.0-py2.py3-none-any.whl (68 kB)
[K     |████████████████████████████████| 68 kB 1.1 MB/s eta 0:00:01
Collecting boto3>=1.9
  Downloading boto3-1.16.10-py2.py3-none-any.whl (129 kB)
[K     |████████████████████████████████| 129 kB 959 kB/s eta 0:00:01
Collecting linear-tsv>=1.0
  Downloading linear-tsv-1.1.0.tar.gz (9.6 kB)
Collecting ijs

In [15]:
# NOTE(review): datadotworld needs an authentication token before this call.
# The RuntimeError in this cell's output ("Configuration file not found ...")
# presumably means that neither `dw configure` nor `dw_config(<config file>)`
# had been run in this kernel -- confirm, and call dw_config() first.
get_twitter_dataframes()

RuntimeError: Configuration file not found at /Users/anshuduggal/.dw/config.To fix this issue, run dw configure