## Notes for next week:
* filter down json file
  * flatten the data
  * remove the retweets
  * save in format usable by nltk or scikit learn tools
* Scikit for text analysis 
* Answering the questions for exploration of data
* Decide an actual question to answer
  * Media and text in combination


1. how many are geo-tagged in syria
2. how many have pictures/videos?
3. counts of distinct words in the text
4. Explore the hashtags used (can maybe confirm how they were scraped in the first place and/or find other tags)
5. Identify the most influential users/posts

In [1]:
import pandas
import json
import ijson
import pprint
from itertools import islice

In [15]:
# set file path for tweets dataset
tweet_path = '/Users/adamstueckrath/Desktop/SyriaProjectNotes/data/tweets/tweets.json'

# read n number of json objects from the tweets dataset
def read_n_from_file(json_path, n_lines):
    data = []
    with open(json_path) as f:
        for line in islice(f, n_lines):
            data.append(json.loads(line))
        return(data)  

# get sample json objects 
sample_json_objects = read_n_from_file(tweet_path, 1)

# print sample
pprint.pprint(sample_json_objects)


[{'_id': {'$oid': '595e82d713bbf01307babbba'},
  'contributors': None,
  'coordinates': None,
  'created_at': {'$date': '2017-07-06T18:34:37.000Z'},
  'entities': {'hashtags': [{'indices': [79, 88], 'text': 'Basirhat'}],
               'symbols': [],
               'urls': [{'expanded_url': None,
                         'indices': [134, 134],
                         'url': ''}],
               'user_mentions': [{'id': 147994804,
                                  'id_str': '147994804',
                                  'indices': [3, 15],
                                  'name': 'Rishi Bagree 🇮🇳',
                                  'screen_name': 'rishibagree'},
                                 {'id': 37034483,
                                  'id_str': '37034483',
                                  'indices': [117, 122],
                                  'name': 'NDTV',
                                  'screen_name': 'ndtv'}]},
  'favorite_count': 0,
  'favorited': False,
  'filter_

In [113]:
# %%timeit
# get the total number of json objects in file 
# json objects must be stored per line, not in an arrary 
# file contains separate JSON object on each line.
def count_json_objects(json_path):
    count = 0
    with open(json_path, 'r') as file:
        for line in file: 
            count+=1
        return count
    
# print total number of json objects
# count_json_objects(file_path)



In [17]:
# how to profile a specific function
# %lprun -f count_json_objects count_json_objects(json_path)

In [18]:
# %%timeit
def count_json_objects_chunk(json_path):
    count = 0
    with open(json_path) as f:
        while True:
            next_n_lines = list(islice(f, 20))
            if not next_n_lines:
                break
            for line in next_n_lines: 
                count += 1
        return count

# json_objects_total = count_json_objects_v2(file_path)
# print(json_objects_total)


In [19]:
# get column names and types
def get_columns_types(dataframe):
    column_details = {}
    columns = dataframe.columns.values.tolist()
    for column in columns: 
        column_details[column] = type(dataframe[column].iat[0])
    return column_details


In [77]:
from pandas.io.json import json_normalize

# new line json streamer
def nljson_generator(json_path):
    with open(json_path) as file:
        for line in file: 
            yield json.loads(line)
        
# load json to dataframe
def json_to_dataframe(json_object):
    dataframe = json_normalize(json_object)
    return dataframe

# source code: https://towardsdatascience.com/flattening-json-objects-in-python-f5343c794b10
# flattens nested json
def flatten_json(json_object):
    out = {}
    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x
    flatten(json_object)
    return out


In [122]:
from pandas import ExcelWriter
# create sample excel file with n number of tweets 
# read from json file, normalize json, and create dataframe
# transpose dataframe and save each tweet into seperate excel tab
def json_tweets_xlsx_sample(json_path, excel_path, 
                         tweet_start=0, tweet_limit='all', 
                         normalize=False):
    tweet_list = []
    json_stream = nljson_generator(json_path)
    
    if tweet_limit == 'all':
        tweets = json_stream
    else:
        if isinstance(tweet_limit, int): 
            tweets = islice(json_stream, tweet_start, tweet_start+tweet_limit)
    
    for tweet in tweets: 
        if normalize:
            tweet_normalize = json_to_dataframe(tweet)
            tweet_object = tweet_normalize.transpose()
        else:
            tweet_object = pandas.Series(tweet)
        tweet_list.append(tweet_object)
            
    print("writing tweets")
    writer = ExcelWriter(excel_path)
    for n, tweet in enumerate(tweet_list):
        tweet.to_excel(writer,'tweet%s' % n)
    writer.save()


# output_path = '/Users/adamstueckrath/Desktop/adam_testing3.xlsx'
# json_tweets_xlsx_sample(tweet_path, output_path, tweet_limit=10, normalize=True)
# json_tweets_xlsx_sample(tweet_path, output_path, tweet_start=100000, tweet_limit=15)


In [132]:
def remove_retweets(json_path, output_json_path):
    json_stream = nljson_generator(json_path)
    with open(output_json_path, 'w') as output:
        for tweet in json_stream:
            text = tweet['text']
            if text.startswith('RT'):
                continue
            else:
                json.dump(tweet, output)
                #output.write("\n")
                
tweets_no_retweets_path = '/Users/adamstueckrath/Desktop/tweets_no_retweetsv2.json'
remove_retweets(tweet_path, tweets_no_retweets_path)

In [118]:
count_tweets_no_retweets = count_json_objects(tweets_no_retweets_path)
print(count_tweets_no_retweets)

1160088


In [123]:
tweets_no_retweets_xlsx_path = '/Users/adamstueckrath/Desktop/tweets_no_retweets.xlsx'
json_tweets_xlsx_sample(tweets_no_retweets_path, tweets_no_retweets_xlsx_path, tweet_start=0, tweet_limit=10, normalize=False)


writing tweets


In [148]:


# def convert_nljson_to_json(nljson_path, output_json_path):
#     with open(output_json_path, 'w') as output: 
help(json.load)
# with open(tweets_no_retweets_path, 'r') as f:
#      data = json.loads(json_normalize(f))
        
        
# jdata = json.load(json_normalize(tweets_no_retweets_path))
#df = pandas.DataFrame(jdata)
df = pandas.DataFrame.from_dict(json_normalize(tweets_no_retweets_path, orient='records')






Help on function load in module json:

load(fp, *, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, **kw)
    Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
    a JSON document) to a Python object.
    
    ``object_hook`` is an optional function that will be called with the
    result of any object literal decode (a ``dict``). The return value of
    ``object_hook`` will be used instead of the ``dict``. This feature
    can be used to implement custom decoders (e.g. JSON-RPC class hinting).
    
    ``object_pairs_hook`` is an optional function that will be called with the
    result of any object literal decoded with an ordered list of pairs.  The
    return value of ``object_pairs_hook`` will be used instead of the ``dict``.
    This feature can be used to implement custom decoders that rely on the
    order that the key and value pairs are decoded (for example,
    collections.OrderedDict will remem

ValueError: DataFrame constructor not properly called!

In [174]:
from collections import Counter

def run():
    
    json_stream = nljson_generator(file_path)
    tweet_limit = 0
    c = Counter()
    
    for tweet in json_stream:
        tweet_limit += 1        
        flatten_tweet = flatten_json(tweet)
        
        if tweet_limit == 1000000:
            return c 
            break
        
        for col, item in flatten_tweet.items():
            if col == 'text' and "Syria" in item:
                c['Syria'] += 1


hello
Counter({'Syria': 579466})
248.3317940235138
