In [1]:
import json

In [2]:
with open("twitter_data.json") as fh:
    print(fh.read()[:10])

[{"created


In [4]:
with open("twitter_data.json") as fh:
    content = fh.read()  # All file handles have .read() that produces a str
    data = json.loads(content)  # load*s* receives a *s*tring
    # data = json.loads(fh.read())

len(data), type(data)

(9, list)

In [15]:
with open("twitter_data.json") as fh:
    data = json.load(fh)  # load receives a file handle

len(data), type(data)

(9, list)

In [18]:
# data

In [19]:
t0 = data[0]

In [20]:
t0["coordinates"]  # GeoJSON

{'type': 'Point', 'coordinates': [-117.894444, 33.606389]}

In [24]:
t0["coordinates"]

{'type': 'Point', 'coordinates': [-117.894444, 33.606389]}

In [25]:
t0["coordinates"]["coordinates"]

[-117.894444, 33.606389]

In [23]:
t0["coordinates"]["coordinates"][0]

-117.894444

In [26]:
[tweet["coordinates"]["coordinates"][0] for tweet in data]

TypeError: 'NoneType' object is not subscriptable

In [32]:
longitudes = []
for tweet in data:
    if tweet["coordinates"] is not None:
        longitudes.append(tweet["coordinates"]["coordinates"][0])
longitudes

[-117.894444, -122.26923]

In [38]:
longitudes = [
    tweet["coordinates"]["coordinates"][0]
    for tweet in data
    if tweet["coordinates"] is not None
]
longitudes

[-117.894444, -122.26923]

In [44]:
# while True:
#     divisor = input("Write a number")
#     print(1 / float(divisor))

In [42]:
try:
    print("About to divide by zero...")
    1 / 0
    print("This will never happen")
except ZeroDivisionError:
    print("Error with the division!")

About to divide by zero...
Error with the division!


In [46]:
# open?

In [56]:
longitudes = []
for tweet in data:
    try:
        longitudes.append(tweet["coordinates"]["coordinates"][0])
    except TypeError:
        # tweet["coordinates"] = None
        # pass  # pass doesn't do anything
        # print("Tweet has no coordinates, skipping")
longitudes

[-117.894444, -122.26923]

In [58]:
%%timeit 
longitudes = []
for tweet in data:
    try:
        longitudes.append(tweet["coordinates"]["coordinates"][0])
    except TypeError:
        pass
longitudes

3.63 µs ± 280 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [59]:
%%timeit
longitudes = []
for tweet in data:
    if tweet["coordinates"] is not None:
        longitudes.append(tweet["coordinates"]["coordinates"][0])
longitudes

820 ns ± 164 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [60]:
%%timeit
longitudes = [
    tweet["coordinates"]["coordinates"][0]
    for tweet in data
    if tweet["coordinates"] is not None
]
longitudes

854 ns ± 156 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [57]:
[
    tweet["coordinates"] for tweet in data
]

[{'type': 'Point', 'coordinates': [-117.894444, 33.606389]},
 {'type': 'Point', 'coordinates': [-122.26923, 37.80805]},
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [33]:
for tweet in data:
    if tweet["coordinates"] is not None:
        print(tweet["id"])

1252840795737997317
1256259052461592580


## Data

Use `twitter_data.json` from previous sessions.

### Expanded urls

Assemble a list of expanded URLs of users that have them in their profile.

### Most popular user

What's the tweet made by the user with the largest number of followers? And what if we exclude those without hashtags?

Try to write the solution in two ways:

- Using `for` loops and normal `if` conditionals
- Using `lambda` functions and Python built-ins (`min`, `max`, `filter`)

---

## Exercises

### 1. Cities of tweets

Build a set of the cities in which the tweets written. How many distinct cities are there?

In [62]:
{tweet["place"]["name"] for tweet in data}  # Set comprehension

{'Dixon',
 'Los Angeles',
 'Modesto',
 'Newport Beach',
 'Oakland',
 'San Francisco',
 'West Hollywood'}

In [63]:
len(_62)

7

### 2. How many tweets in each city

Build a dictionary, where the keys are cities and the values are the number of tweets in each city.

In [74]:
[tweet["place"]["name"] for tweet in data]

['Newport Beach',
 'Oakland',
 'Los Angeles',
 'San Francisco',
 'Dixon',
 'Modesto',
 'Los Angeles',
 'West Hollywood',
 'Los Angeles']

In [5]:
city_counts = {}
for tweet in data:
    city_name = tweet["place"]["name"]
    if city_name not in city_counts:
        city_counts[city_name] = 1
    else:
        city_counts[city_name] += 1
city_counts

{'Newport Beach': 1,
 'Oakland': 1,
 'Los Angeles': 3,
 'San Francisco': 1,
 'Dixon': 1,
 'Modesto': 1,
 'West Hollywood': 1}

In [6]:
all_cities = [tweet["place"]["name"] for tweet in data]

city_counts = {}
for city_name in all_cities:
    city_counts[city_name] = all_cities.count(city_name)

city_counts

{'Newport Beach': 1,
 'Oakland': 1,
 'Los Angeles': 3,
 'San Francisco': 1,
 'Dixon': 1,
 'Modesto': 1,
 'West Hollywood': 1}

### 3. How many unique hashtags

Build a set of unique, **lowercase** hashtag texts.

In [77]:
[hashtag["text"] for hashtag in t0["entities"]["hashtags"]]

['balboaisland',
 'newportbeach',
 'tennis',
 'covid_19',
 'coronavirus',
 'orangecounty',
 'california']

In [81]:
hashtags = set()
for tweet in data:
    for hashtag in tweet["entities"]["hashtags"]:
        hashtags.add(hashtag["text"].lower())
hashtags

{'activities',
 'balboaisland',
 'california',
 'coronavirus',
 'covid19',
 'covid_19',
 'la',
 'lockdown',
 'losangeles',
 'newportbeach',
 'orangecounty',
 'protesters',
 'repost',
 'tennis',
 'thisweekend'}

### 4. How many tweets mentioning each hashtag

Build a dictionary, where the keys are the lowercase hashtag texts and the values the number of tweets containing each hashtag. Is there any hashtag that appears more than once?

In [94]:
[1, 2, 3]["a"]

  [1, 2, 3]["a"]


TypeError: list indices must be integers or slices, not str

In [93]:
hashtag_counts = {}

for tweet in data:
    for hashtag in tweet["entities"]["hashtags"]:
        ht_text = hashtag["text"].lower()
        if ht_text not in hashtag_counts:  # If it's the first time that I see it
            hashtag_counts[ht_text] = 1
        else:
            hashtag_counts[ht_text] += 1

hashtag_counts

{'balboaisland': 1,
 'newportbeach': 1,
 'tennis': 1,
 'covid_19': 1,
 'coronavirus': 1,
 'orangecounty': 1,
 'california': 1,
 'repost': 1,
 'thisweekend': 1,
 'la': 1,
 'losangeles': 1,
 'activities': 1,
 'lockdown': 1,
 'covid19': 1,
 'protesters': 1}

In [83]:
hashtag_counts = {}

for tweet in data:
    for hashtag in tweet["entities"]["hashtags"]:
        ht_text = hashtag["text"].lower()
        if ht_text not in hashtag_counts:
            hashtag_counts[ht_text] = 1
        else:
            hashtag_counts[ht_text] += 1

hashtag_counts

{'balboaisland': 1,
 'newportbeach': 1,
 'tennis': 1,
 'covid_19': 1,
 'coronavirus': 1,
 'orangecounty': 1,
 'california': 1,
 'repost': 1,
 'thisweekend': 1,
 'la': 1,
 'losangeles': 1,
 'activities': 1,
 'lockdown': 1,
 'covid19': 1,
 'protesters': 1}