# Import libraries

In [45]:
from datetime import datetime
import pandas as pd
import cProfile
import emoji

from utils.extract_json import extract_json
from utils.find_top10 import find_top10
from utils.df_to_list import df_to_list
from utils.dict_to_df import dict_to_df
from utils.groupby_and_count import groupby_and_count

from q1_memory import q1_memory
from q2_memory import q2_memory
from q3_memory import q3_memory

from q1_time import q1_time
from q2_time import q2_time
from q3_time import q3_time

---

# Open file

Taking a quick look at the data file, we can see that it consists of a collection of JSON objects, as the image below shows.

![img](../img/data_file.png "screenshot data file")

The amount of data sums up to 117407 lines of json objects.

## Get data

To work with the data we have to extract it from the JSON file. To do that, we'll run the following lines of code.

In [2]:
# Path to the raw tweets dump, relative to the notebook's working directory.
file_path = "farmers-protest-tweets-2021-2-4.json"

# lines=True makes pandas parse the file as one JSON object per line.
df = pd.read_json(file_path, lines=True)
df.head(10)

Unnamed: 0,url,date,content,renderedContent,id,user,outlinks,tcooutlinks,replyCount,retweetCount,...,quoteCount,conversationId,lang,source,sourceUrl,sourceLabel,media,retweetedTweet,quotedTweet,mentionedUsers
0,https://twitter.com/ArjunSinghPanam/status/136...,2021-02-24 09:23:35+00:00,The world progresses while the Indian police a...,The world progresses while the Indian police a...,1364506249291784198,"{'username': 'ArjunSinghPanam', 'displayname':...",[https://twitter.com/ravisinghka/status/136415...,[https://t.co/es3kn0IQAF],0,0,...,0,1364506249291784198,en,"<a href=""http://twitter.com/download/iphone"" r...",http://twitter.com/download/iphone,Twitter for iPhone,,,{'url': 'https://twitter.com/RaviSinghKA/statu...,"[{'username': 'narendramodi', 'displayname': '..."
1,https://twitter.com/PrdeepNain/status/13645062...,2021-02-24 09:23:32+00:00,#FarmersProtest \n#ModiIgnoringFarmersDeaths \...,#FarmersProtest \n#ModiIgnoringFarmersDeaths \...,1364506237451313155,"{'username': 'PrdeepNain', 'displayname': 'Pra...",[],[],0,0,...,0,1364506237451313155,en,"<a href=""http://twitter.com/download/android"" ...",http://twitter.com/download/android,Twitter for Android,[{'thumbnailUrl': 'https://pbs.twimg.com/ext_t...,,,"[{'username': 'Kisanektamorcha', 'displayname'..."
2,https://twitter.com/parmarmaninder/status/1364...,2021-02-24 09:23:22+00:00,ਪੈਟਰੋਲ ਦੀਆਂ ਕੀਮਤਾਂ ਨੂੰ ਮੱਦੇਨਜ਼ਰ ਰੱਖਦੇ ਹੋਏ \nਮੇ...,ਪੈਟਰੋਲ ਦੀਆਂ ਕੀਮਤਾਂ ਨੂੰ ਮੱਦੇਨਜ਼ਰ ਰੱਖਦੇ ਹੋਏ \nਮੇ...,1364506195453767680,"{'username': 'parmarmaninder', 'displayname': ...",[],[],0,0,...,0,1364506195453767680,pa,"<a href=""http://twitter.com/download/android"" ...",http://twitter.com/download/android,Twitter for Android,,,,
3,https://twitter.com/anmoldhaliwal/status/13645...,2021-02-24 09:23:16+00:00,@ReallySwara @rohini_sgh watch full video here...,@ReallySwara @rohini_sgh watch full video here...,1364506167226032128,"{'username': 'anmoldhaliwal', 'displayname': '...",[https://youtu.be/-bUKumwq-J8],[https://t.co/wBPNdJdB0n],0,0,...,0,1364350947099484160,en,"<a href=""https://mobile.twitter.com"" rel=""nofo...",https://mobile.twitter.com,Twitter Web App,[{'thumbnailUrl': 'https://pbs.twimg.com/ext_t...,,,"[{'username': 'ReallySwara', 'displayname': 'S..."
4,https://twitter.com/KotiaPreet/status/13645061...,2021-02-24 09:23:10+00:00,#KisanEktaMorcha #FarmersProtest #NoFarmersNoF...,#KisanEktaMorcha #FarmersProtest #NoFarmersNoF...,1364506144002088963,"{'username': 'KotiaPreet', 'displayname': 'Pre...",[],[],0,0,...,0,1364506144002088963,und,"<a href=""http://twitter.com/download/iphone"" r...",http://twitter.com/download/iphone,Twitter for iPhone,[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,
5,https://twitter.com/babli_708/status/136450612...,2021-02-24 09:23:05+00:00,Jai jwaan jai kissan #FarmersProtest #ModiIgno...,Jai jwaan jai kissan #FarmersProtest #ModiIgno...,1364506120497360896,"{'username': 'babli_708', 'displayname': 'Babl...",[https://twitter.com/rajeshpunia15/status/1364...,[https://t.co/LXi7d92wwf],0,0,...,0,1364506120497360896,hi,"<a href=""http://twitter.com/download/iphone"" r...",http://twitter.com/download/iphone,Twitter for iPhone,,,{'url': 'https://twitter.com/RajeshPunia15/sta...,
6,https://twitter.com/Varinde17354019/status/136...,2021-02-24 09:22:54+00:00,#FarmersProtest,#FarmersProtest,1364506076272496640,"{'username': 'Varinde17354019', 'displayname':...",[],[],0,0,...,0,1364506076272496640,und,"<a href=""http://twitter.com/download/android"" ...",http://twitter.com/download/android,Twitter for Android,,,,
7,https://twitter.com/BitnamSingh/status/1364505...,2021-02-24 09:22:35+00:00,#ModiDontSellFarmers\n#FarmersProtest https://...,#ModiDontSellFarmers\n#FarmersProtest twitter....,1364505995859423234,"{'username': 'BitnamSingh', 'displayname': 'Bi...",[https://twitter.com/jagjitvaheguru/status/136...,[https://t.co/uGQb1O5Jg9],0,0,...,0,1364505995859423234,und,"<a href=""https://mobile.twitter.com"" rel=""nofo...",https://mobile.twitter.com,Twitter Web App,,,{'url': 'https://twitter.com/jagjitvaheguru/st...,
8,https://twitter.com/anmoldhaliwal/status/13645...,2021-02-24 09:22:34+00:00,@mandeeppunia1 watch full video here https://t...,@mandeeppunia1 watch full video here youtu.be/...,1364505991887347714,"{'username': 'anmoldhaliwal', 'displayname': '...",[https://youtu.be/-bUKumwq-J8],[https://t.co/wBPNdJdB0n],0,0,...,0,1364428985074032646,en,"<a href=""https://mobile.twitter.com"" rel=""nofo...",https://mobile.twitter.com,Twitter Web App,[{'thumbnailUrl': 'https://pbs.twimg.com/ext_t...,,,"[{'username': 'mandeeppunia1', 'displayname': ..."
9,https://twitter.com/SatThiara/status/136450589...,2021-02-24 09:22:11+00:00,#FarmersProtest https://t.co/ehd5FBSZGx,#FarmersProtest twitter.com/borisjohnson/s…,1364505896576053248,"{'username': 'SatThiara', 'displayname': 'Sat ...",[https://twitter.com/borisjohnson/status/13642...,[https://t.co/ehd5FBSZGx],0,0,...,0,1364505896576053248,und,"<a href=""http://twitter.com/download/iphone"" r...",http://twitter.com/download/iphone,Twitter for iPhone,,,{'url': 'https://twitter.com/BorisJohnson/stat...,


In [4]:
df.tail(10)

Unnamed: 0,url,date,content,renderedContent,id,user,outlinks,tcooutlinks,replyCount,retweetCount,...,quoteCount,conversationId,lang,source,sourceUrl,sourceLabel,media,retweetedTweet,quotedTweet,mentionedUsers
117397,https://twitter.com/rupindr79/status/136004022...,2021-02-12 01:37:13+00:00,Now Farmers Agitation is no longer confined to...,Now Farmers Agitation is no longer confined to...,1360040229265022979,"{'username': 'rupindr79', 'displayname': 'ਰੁ ਪ...",[],[],0,31,...,4,1360040229265022979,en,"<a href=""http://twitter.com/download/android"" ...",http://twitter.com/download/android,Twitter for Android,[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,
117398,https://twitter.com/bali_mandeep/status/136004...,2021-02-12 01:37:12+00:00,Kisan Ekta Zindabaad ✊\n#FarmersProtest,Kisan Ekta Zindabaad ✊\n#FarmersProtest,1360040222986178563,"{'username': 'bali_mandeep', 'displayname': 'M...",[],[],0,0,...,0,1360040222986178563,hi,"<a href=""http://twitter.com/download/android"" ...",http://twitter.com/download/android,Twitter for Android,,,,
117399,https://twitter.com/Satveer22950341/status/136...,2021-02-12 01:37:06+00:00,#FarmersProtest \n\n#MahapanchayatRevolution,#FarmersProtest \n\n#MahapanchayatRevolution,1360040199493799936,"{'username': 'Satveer22950341', 'displayname':...",[],[],0,0,...,0,1360040199493799936,und,"<a href=""https://mobile.twitter.com"" rel=""nofo...",https://mobile.twitter.com,Twitter Web App,,,,
117400,https://twitter.com/PushpSamra/status/13600401...,2021-02-12 01:37:05+00:00,The first Mahapanchayat of Punjab. The revolut...,The first Mahapanchayat of Punjab. The revolut...,1360040195786067969,"{'username': 'PushpSamra', 'displayname': 'Pus...",[],[],0,43,...,3,1360040195786067969,en,"<a href=""http://twitter.com/download/android"" ...",http://twitter.com/download/android,Twitter for Android,[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,
117401,https://twitter.com/lovehazran1/status/1360040...,2021-02-12 01:37:04+00:00,#BJPGovtDictatingTwitter #MahapanchayatRevolut...,#BJPGovtDictatingTwitter #MahapanchayatRevolut...,1360040192090857475,"{'username': 'lovehazran1', 'displayname': '#F...",[],[],0,1,...,0,1360040192090857475,pa,"<a href=""http://twitter.com/download/iphone"" r...",http://twitter.com/download/iphone,Twitter for iPhone,,,,
117402,https://twitter.com/rickyrickstir/status/13600...,2021-02-12 01:37:02+00:00,#FarmersProtest #KisanAndolan #KisaanMajdoorEk...,#FarmersProtest #KisanAndolan #KisaanMajdoorEk...,1360040182771163138,"{'username': 'rickyrickstir', 'displayname': '...",[],[],0,0,...,0,1360040182771163138,und,"<a href=""http://twitter.com/download/iphone"" r...",http://twitter.com/download/iphone,Twitter for iPhone,,,,
117403,https://twitter.com/PunjabTak/status/136004014...,2021-02-12 01:36:53+00:00,PM मोदी की अपील के बीच संयुक्त किसान मोर्चा का...,PM मोदी की अपील के बीच संयुक्त किसान मोर्चा का...,1360040146402373637,"{'username': 'PunjabTak', 'displayname': 'Punj...",[https://youtu.be/aG3qHGwoYag],[https://t.co/AzZNOGI8BX],0,0,...,0,1360040146402373637,hi,"<a href=""https://mobile.twitter.com"" rel=""nofo...",https://mobile.twitter.com,Twitter Web App,[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,
117404,https://twitter.com/ish_kayy/status/1360040134...,2021-02-12 01:36:50+00:00,United we stand.\nDivided we fall\n#Mahapancha...,United we stand.\nDivided we fall\n#Mahapancha...,1360040134230556678,"{'username': 'ish_kayy', 'displayname': 'ishy'...",[],[],0,65,...,6,1360040134230556678,en,"<a href=""https://mobile.twitter.com"" rel=""nofo...",https://mobile.twitter.com,Twitter Web App,[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,
117405,https://twitter.com/TV9Bharatvarsh/status/1360...,2021-02-12 01:36:49+00:00,"सिंघु बॉर्डर पर लंबी लड़ाई की तैयारी, किसानों ...","सिंघु बॉर्डर पर लंबी लड़ाई की तैयारी, किसानों ...",1360040127679000577,"{'username': 'TV9Bharatvarsh', 'displayname': ...",[https://www.tv9hindi.com/india/farmers-protes...,[https://t.co/bkjh7WXc0w],0,1,...,1,1360040127679000577,hi,"<a href=""https://mobile.twitter.com"" rel=""nofo...",https://mobile.twitter.com,Twitter Web App,,,,
117406,https://twitter.com/SikhVibes/status/136004012...,2021-02-12 01:36:49+00:00,"@Kisanektamorcha We are with you, keep the mor...","@Kisanektamorcha We are with you, keep the mor...",1360040127146430470,"{'username': 'SikhVibes', 'displayname': 'Sikh...",[],[],2,19,...,2,1360038291471388672,en,"<a href=""http://twitter.com/download/iphone"" r...",http://twitter.com/download/iphone,Twitter for iPhone,,,,"[{'username': 'Kisanektamorcha', 'displayname'..."


In [5]:
# Report the dataframe's dimensions together with its column labels.
shape = df.shape
row_count, column_count = shape
print("Rows: ", row_count)
print("Columns: ", df.columns)
print("Columns count: ", column_count)

Rows:  117407
Columns:  Index(['url', 'date', 'content', 'renderedContent', 'id', 'user', 'outlinks',
       'tcooutlinks', 'replyCount', 'retweetCount', 'likeCount', 'quoteCount',
       'conversationId', 'lang', 'source', 'sourceUrl', 'sourceLabel', 'media',
       'retweetedTweet', 'quotedTweet', 'mentionedUsers'],
      dtype='object')
Columns count:  21


The dataframe has 117407 rows and 21 columns.

In [6]:
df.isna().sum()

url                     0
date                    0
content                 0
renderedContent         0
id                      0
user                    0
outlinks                0
tcooutlinks             0
replyCount              0
retweetCount            0
likeCount               0
quoteCount              0
conversationId          0
lang                    0
source                  0
sourceUrl             912
sourceLabel           912
media               89298
retweetedTweet     117407
quotedTweet         75971
mentionedUsers      79373
dtype: int64

We can see that `media`, `retweetedTweet`, `quotedTweet` and `mentionedUsers` have a significant number of missing values (`retweetedTweet` is empty in every row). As we answer the questions, we'll evaluate whether these missing values affect us or can simply be ignored.

---

In [7]:
df.dtypes

url                             object
date               datetime64[ns, UTC]
content                         object
renderedContent                 object
id                               int64
user                            object
outlinks                        object
tcooutlinks                     object
replyCount                       int64
retweetCount                     int64
likeCount                        int64
quoteCount                       int64
conversationId                   int64
lang                            object
source                          object
sourceUrl                       object
sourceLabel                     object
media                           object
retweetedTweet                 float64
quotedTweet                     object
mentionedUsers                  object
dtype: object

# Question 1

**Las top 10 fechas donde hay más tweets. Mencionar el usuario (username) que más publicaciones tiene por cada uno de esos días. Debe incluir las siguientes funciones:**

```python
def q1_time(file_path: str) -> List[Tuple[datetime.date, str]]:
```
```python
def q1_memory(file_path: str) -> List[Tuple[datetime.date, str]]:
```
```python
Returns: 
[(datetime.date(1999, 11, 15), "LATAM321"), (datetime.date(1999, 7, 15), "LATAM_CHI"), ...]
```

Let's take the columns we're interested in and work from there. For this question we'll isolate the `date` and `user` columns in a separate dataframe so we don't mess with the original data.

For that, we'll use `extract_json()` from the `extract_json.py` file inside the `utils` folder.

In [24]:
df_q1 = extract_json(file_path=file_path, col=["date", "user"])
df_q1.head(5)

Unnamed: 0,date,user
0,2021-02-24 09:23:35+00:00,"{'username': 'ArjunSinghPanam', 'displayname':..."
1,2021-02-24 09:23:32+00:00,"{'username': 'PrdeepNain', 'displayname': 'Pra..."
2,2021-02-24 09:23:22+00:00,"{'username': 'parmarmaninder', 'displayname': ..."
3,2021-02-24 09:23:16+00:00,"{'username': 'anmoldhaliwal', 'displayname': '..."
4,2021-02-24 09:23:10+00:00,"{'username': 'KotiaPreet', 'displayname': 'Pre..."


Comparing the inputs to the expected output, it's clear that the values in the dataframe need to be processed first.

Let's start with the `date` column. Taking the date at index 0, we have `2021-02-24T09:23:35+00:00`, which in general terms is `YYYY-MM-DDTHH:MM:SS` followed by the UTC offset. The expected output only needs the day, so we convert the values in the `date` column into `datetime.date` objects.

In [25]:
# Keep only the calendar day (a datetime.date) of each timestamp.
# pd.to_datetime makes the cell safe to re-run: on the first run the column is
# already datetime-like and passes through unchanged; on later runs it holds
# datetime.date objects, which have no .dt accessor until converted back.
df_q1["date"] = pd.to_datetime(df_q1["date"]).dt.date
df_q1.head(5)

Unnamed: 0,date,user
0,2021-02-24,"{'username': 'ArjunSinghPanam', 'displayname':..."
1,2021-02-24,"{'username': 'PrdeepNain', 'displayname': 'Pra..."
2,2021-02-24,"{'username': 'parmarmaninder', 'displayname': ..."
3,2021-02-24,"{'username': 'anmoldhaliwal', 'displayname': '..."
4,2021-02-24,"{'username': 'KotiaPreet', 'displayname': 'Pre..."


Furthermore, we have to process the `user` column to filter information we are interested in: the value of `username` key.

In [26]:
# Replace each user dict with the value of its "username" key.
# Series.map visits only the "user" column instead of building a row object
# per tweet (DataFrame.apply with axis=1). The isinstance guard leaves values
# that are already plain usernames untouched, so the cell can be re-run.
df_q1["user"] = df_q1["user"].map(
    lambda user: user["username"] if isinstance(user, dict) else user
)
df_q1.head(5)

Unnamed: 0,date,user
0,2021-02-24,ArjunSinghPanam
1,2021-02-24,PrdeepNain
2,2021-02-24,parmarmaninder
3,2021-02-24,anmoldhaliwal
4,2021-02-24,KotiaPreet


Now, let's first count the total number of tweets made on each date. Then we'll sort the results and take the top 10.

In [41]:
grouped_date = groupby_and_count(df_q1, ["date"])
top_dates = find_top10(grouped_date, "user")
top_dates

Unnamed: 0,date,user
0,2021-02-12,12347
1,2021-02-13,11296
5,2021-02-17,11087
4,2021-02-16,10443
2,2021-02-14,10249
6,2021-02-18,9625
3,2021-02-15,9197
8,2021-02-20,8502
11,2021-02-23,8417
7,2021-02-19,8204


With the above results, we know the list of dates that the answer must contain.

With the same idea, we'll group the `date` and `user` columns together.

In [29]:
grouped_user = find_top10(groupby_and_count(df_q1, ["date", "user"]), "size")
grouped_user

Unnamed: 0,date,user,size
35219,2021-02-19,Preetm91,267
33193,2021-02-18,neetuanjle_nitu,195
26577,2021-02-17,RaaJVinderkaur,185
7536,2021-02-13,MaanDee08215437,178
2740,2021-02-12,RanbirS00614606,176
42691,2021-02-21,Surrypuria,161
33396,2021-02-18,rebelpacifist,153
34733,2021-02-19,KaurDosanjh1979,138
48696,2021-02-23,Surrypuria,135
18540,2021-02-15,jot__b,134


We've got to be careful now. It might look like the final answer, but what the output really shows is the top 10 (date, user) pairs by number of tweets, so a single date can appear more than once and a top date can be missing. That is different from what we are looking for.

So now we have to filter the dates based on the results shown in the `top_dates` dataframe, sort the results and then drop the duplicated dates.

In [47]:
# The ten most-tweeted dates found above.
dates = top_dates.date.to_list()
# Group and count: one row per (date, user) pair, with the tweet count in "size".
top_users = groupby_and_count(df_q1, ["date", "user"])
# Keep only the rows for the top-10 dates, then sort by tweet count, highest first.
top_users = top_users.set_index("date").loc[dates].sort_values(by="size", ascending=False)
# drop_duplicates keeps the first row per date; after the descending sort that
# row is the user who tweeted the most on that day.
top_users = top_users.reset_index().drop_duplicates(subset=["date"])
top_users

Unnamed: 0,date,user,size
0,2021-02-19,Preetm91,267
1,2021-02-18,neetuanjle_nitu,195
2,2021-02-17,RaaJVinderkaur,185
3,2021-02-13,MaanDee08215437,178
4,2021-02-12,RanbirS00614606,176
7,2021-02-23,Surrypuria,135
8,2021-02-15,jot__b,134
9,2021-02-16,jot__b,133
12,2021-02-14,rebelpacifist,119
23,2021-02-20,MangalJ23056160,108


We can see that the two previous results are slightly different from one another. If you look closely, you'll notice that `2021-02-21` is in the first but not in the second answer.

Furthermore, we'll sort the answer from the most to the least tweeted dates.

In [51]:
# Lookup of total tweets per date: in top_dates the per-date count sits in the
# "user" column, so the mapping lives under top_dates_dict["user"].
top_dates_dict = top_dates.set_index("date").to_dict()

# Attach each date's total as a "tweets" column on a copy of top_users, then
# order the rows by that total.
df_q1_result = find_top10(
    top_users.assign(tweets=[top_dates_dict["user"][day] for day in top_users["date"]]),
    "tweets",
    reset_index=True,
    drop=True,
)
df_q1_result

Unnamed: 0,date,user,size,tweets
0,2021-02-12,RanbirS00614606,176,12347
1,2021-02-13,MaanDee08215437,178,11296
2,2021-02-17,RaaJVinderkaur,185,11087
3,2021-02-16,jot__b,133,10443
4,2021-02-14,rebelpacifist,119,10249
5,2021-02-18,neetuanjle_nitu,195,9625
6,2021-02-15,jot__b,134,9197
7,2021-02-20,MangalJ23056160,108,8502
8,2021-02-23,Surrypuria,135,8417
9,2021-02-19,Preetm91,267,8204


Lastly, we can turn the answer into the required form.

In [34]:
q1_result = df_to_list(df_q1_result, ["date", "user"])
q1_result

[(datetime.date(2021, 2, 12), 'RanbirS00614606'),
 (datetime.date(2021, 2, 13), 'MaanDee08215437'),
 (datetime.date(2021, 2, 17), 'RaaJVinderkaur'),
 (datetime.date(2021, 2, 16), 'jot__b'),
 (datetime.date(2021, 2, 14), 'rebelpacifist'),
 (datetime.date(2021, 2, 18), 'neetuanjle_nitu'),
 (datetime.date(2021, 2, 15), 'jot__b'),
 (datetime.date(2021, 2, 20), 'MangalJ23056160'),
 (datetime.date(2021, 2, 23), 'Surrypuria'),
 (datetime.date(2021, 2, 19), 'Preetm91')]

## Q1 Result

In [53]:
q1_time(file_path)

[(datetime.date(2021, 2, 12), 'RanbirS00614606'),
 (datetime.date(2021, 2, 13), 'MaanDee08215437'),
 (datetime.date(2021, 2, 17), 'RaaJVinderkaur'),
 (datetime.date(2021, 2, 16), 'jot__b'),
 (datetime.date(2021, 2, 14), 'rebelpacifist'),
 (datetime.date(2021, 2, 18), 'neetuanjle_nitu'),
 (datetime.date(2021, 2, 15), 'jot__b'),
 (datetime.date(2021, 2, 20), 'MangalJ23056160'),
 (datetime.date(2021, 2, 23), 'Surrypuria'),
 (datetime.date(2021, 2, 19), 'Preetm91')]

## Q1 Memory Usage

In [54]:
!python -m memory_profiler q1_memory.py "farmers-protest-tweets-2021-2-4.json"

Filename: q1_memory.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    14     76.9 MiB     76.9 MiB           1   @profile
    15                                         def q1_memory(file_path: str) -> List[Tuple[datetime.date, str]]:
    16                                         
    17                                             # Get data from json file
    18   1581.0 MiB   1504.1 MiB           1       df_q1 = extract_json(file_path=file_path, col=["date", "user"])
    19                                         
    20                                             # Transform the values in the columns
    21   1581.9 MiB      1.0 MiB           1       df_q1["date"] = df_q1['date'].dt.date
    22   1586.6 MiB   -277.4 MiB      234815       df_q1["user"] = df_q1.apply(lambda x: x.user["username"], axis=1)
    23                                         
    24                                             # Most tweeted dates
    25   1304.7 MiB   -281.9 MiB          

It's clear that the data, which in this case is saved in a dataframe, takes up a large part of the memory usage.

An improvement would be to look for another way to store the data. It should be lightweight and as simple to use as pandas dataframes.

## Q1 Time Consumption

In [55]:
!python -m cProfile q1_time.py "farmers-protest-tweets-2021-2-4.json"

         4969667 function calls (4963014 primitive calls) in 22.343 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(all)
       10    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(append)
        9    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(argsort)
        9    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(atleast_2d)
        6    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(bincount)
       35    0.000    0.000    0.135    0.004 <__array_function__ internals>:177(concatenate)
       95    0.000    0.000    0.001    0.000 <__array_function__ internals>:177(copyto)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(delete)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(dot)
        1    0

We can see that the total time consumed to generate the answer to question 1 was about 22 seconds.

We can see that almost all of the time was dedicated to extracting the JSON file. Another time-consuming task, though less relevant, is the `apply` method.

As mentioned earlier, dataframes might not be the best way to handle this data. A better option would be to look for a tool suitable for big data, one that is lightweight, easy to use and fast at extracting and working with the data.

---

# Question 2

**Los top 10 emojis más usados con su respectivo conteo. Debe incluir las siguientes funciones:**

```python
def q2_time(file_path: str) -> List[Tuple[str, int]]:
```
```python
def q2_memory(file_path: str) -> List[Tuple[str, int]]:
```
```python
Returns: 
[("✈️", 6856), ("❤️", 5876), ...]
```

First of all, lets extract the variables we are interested in. In this case, we'll look at the `content` column.


In [56]:
df_q2 = extract_json(file_path=file_path, col=["content"])
df_q2.head(10)

Unnamed: 0,content
0,The world progresses while the Indian police a...
1,#FarmersProtest \n#ModiIgnoringFarmersDeaths \...
2,ਪੈਟਰੋਲ ਦੀਆਂ ਕੀਮਤਾਂ ਨੂੰ ਮੱਦੇਨਜ਼ਰ ਰੱਖਦੇ ਹੋਏ \nਮੇ...
3,@ReallySwara @rohini_sgh watch full video here...
4,#KisanEktaMorcha #FarmersProtest #NoFarmersNoF...
5,Jai jwaan jai kissan #FarmersProtest #ModiIgno...
6,#FarmersProtest
7,#ModiDontSellFarmers\n#FarmersProtest https://...
8,@mandeeppunia1 watch full video here https://t...
9,#FarmersProtest https://t.co/ehd5FBSZGx


Let's take a look at some of the content in the tweets data.

In [57]:
# Show the full text of the first ten tweets; head() also copes with fewer than ten rows.
df_q2["content"].head(10).to_list()

['The world progresses while the Indian police and Govt are still trying to take India back to the horrific past through its tyranny. \n\n@narendramodi @DelhiPolice Shame on you. \n\n#ModiDontSellFarmers \n#FarmersProtest \n#FreeNodeepKaur https://t.co/es3kn0IQAF',
 "#FarmersProtest \n#ModiIgnoringFarmersDeaths \n#ModiDontSellFarmers \n@Kisanektamorcha \nFarmers constantly distroying crops throughout India. \nReally, it's hearts breaking...we care about our crops like our children. And govt. agriculture minister is laughing on us🚜🌾WE WILL WIN💪 https://t.co/kLspngG9xE",
 "ਪੈਟਰੋਲ ਦੀਆਂ ਕੀਮਤਾਂ ਨੂੰ ਮੱਦੇਨਜ਼ਰ ਰੱਖਦੇ ਹੋਏ \nਮੇਰੇ ਹਿਸਾਬ ਨਾਲ ਬਾਹਰ(ਪ੍ਰਦੇਸ਼) ਜਾਣ ਨਾਲੋਂ ਬਿਹਤਰ ਆ ਭਾਰਤ 'ਚ ਪੈਟਰੋਲ ਪੰਪ ਪਾ ਲਈਏ। 🤫🤫🤔🤔\n#FarmersProtest",
 '@ReallySwara @rohini_sgh watch full video here https://t.co/wBPNdJdB0n\n#farmersprotest #NoFarmersNoFood https://t.co/fUsTOKOcXK',
 '#KisanEktaMorcha #FarmersProtest #NoFarmersNoFood https://t.co/g9TYYBHQRH',
 'Jai jwaan jai kissan #FarmersProtest #ModiIgnoringFarmersDeaths htt

We can see that some of them have emojis and some of them do not.

To work with emojis I'll be using the emoji library. The documentation of the library can be found [here](https://carpedm20.github.io/emoji/docs/).

First, we'll create a list of the distinct emojis in each tweet that contains at least one emoji.

In [58]:
# One list of distinct emojis per tweet, skipping tweets without emojis.
# distinct_emoji_list returns an empty list when a text has no emoji, so
# filtering on the result avoids parsing every tweet a second time with
# emoji_count.
emoji_list = [found for found in map(emoji.distinct_emoji_list, df_q2.content) if found]
emoji_list

[['💪', '🌾', '🚜'],
 ['🤫', '🤔'],
 ['🙄'],
 ['🇮🇳'],
 ['👇'],
 ['🙏🏽'],
 ['‼'],
 ['👍'],
 ['🙏🏻'],
 ['👇🏽'],
 ['🙌', '💜'],
 ['🧑\u200d🌾', '🕊', '🌱', '🌍', '🤣', '👩\u200d🌾'],
 ['🤣'],
 ['🤔'],
 ['🧑\u200d🌾', '🕊', '🌱', '😱', '🌍', '👩\u200d🌾'],
 ['🙏🏻'],
 ['👇🏾'],
 ['‼'],
 ['✨', '💫', '🌈', '❣️', '🔰', '🌼'],
 ['😂'],
 ['✍🏻'],
 ['💪'],
 ['☘️'],
 ['💚'],
 ['🇮🇳', '🇵🇰'],
 ['🇮🇳'],
 ['🙏🏻'],
 ['😛'],
 ['♨️'],
 ['✊'],
 ['💔'],
 ['✊🏾'],
 ['🌾', '🚜'],
 ['👇🏻', '👍🏻', '💪🏻', '🤨'],
 ['✊🏽'],
 ['🌾', '🚜', '🥳', '🔥'],
 ['🤣', '😅'],
 ['😂', '👇'],
 ['👏🏽'],
 ['🙏'],
 ['👏', '👍'],
 ['😢'],
 ['📢'],
 ['👇🏼'],
 ['💔', '🥺'],
 ['🙏🏽'],
 ['✊'],
 ['👇🏽'],
 ['😛'],
 ['🌄', '☀️'],
 ['🤔'],
 ['❤️'],
 ['✊'],
 ['🤣'],
 ['🤔'],
 ['🙌'],
 ['👏'],
 ['🙏'],
 ['😡', '💔', '💚'],
 ['🤣'],
 ['✊'],
 ['😜'],
 ['🤣', '😂'],
 ['🌾'],
 ['💪🏻'],
 ['😎'],
 ['🙏'],
 ['😂'],
 ['🙏'],
 ['😊'],
 ['😂'],
 ['💪🏻'],
 ['😂'],
 ['😂'],
 ['💪🏻'],
 ['🤯'],
 ['🙏', '💪🏻'],
 ['😂'],
 ['😂'],
 ['😒'],
 ['😊'],
 ['💯', '💪'],
 ['💔', '📸'],
 ['🙏'],
 ['😂'],
 ['😂'],
 ['🙄'],
 ['🙏'],
 ['💪🏻'],
 ['😂'],
 ['😂'],
 ['😂'],
 ['💯', '💪'],
 ['

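Now, we'll go through the list and count the emojis as they show up. Since each inner list holds the distinct emojis of one tweet, an emoji is counted once per tweet, no matter how many times it is repeated in it. This information will be stored in the `emoji_dict` dictionary.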

In [59]:
# Tally how many per-tweet emoji lists each emoji appears in.
emoji_dict = {}
for emojis in emoji_list:
    for emj in emojis:
        # get() supplies 0 the first time an emoji is seen.
        emoji_dict[emj] = emoji_dict.get(emj, 0) + 1

emoji_dict

{'💪': 394,
 '🌾': 1298,
 '🚜': 1334,
 '🤫': 21,
 '🤔': 463,
 '🙄': 153,
 '🇮🇳': 938,
 '👇': 542,
 '🙏🏽': 411,
 '‼': 35,
 '👍': 634,
 '🙏🏻': 580,
 '👇🏽': 60,
 '🙌': 73,
 '💜': 16,
 '🧑\u200d🌾': 59,
 '🕊': 38,
 '🌱': 105,
 '🌍': 86,
 '🤣': 759,
 '👩\u200d🌾': 65,
 '😱': 40,
 '👇🏾': 17,
 '✨': 31,
 '💫': 9,
 '🌈': 3,
 '❣️': 27,
 '🔰': 1,
 '🌼': 11,
 '😂': 1387,
 '✍🏻': 15,
 '☘️': 4,
 '💚': 486,
 '🇵🇰': 7,
 '😛': 44,
 '♨️': 1,
 '✊': 1110,
 '💔': 113,
 '✊🏾': 57,
 '👇🏻': 28,
 '👍🏻': 41,
 '💪🏻': 104,
 '🤨': 47,
 '✊🏽': 322,
 '🥳': 39,
 '🔥': 247,
 '😅': 102,
 '👏🏽': 34,
 '🙏': 3203,
 '👏': 289,
 '😢': 152,
 '📢': 73,
 '👇🏼': 62,
 '🥺': 80,
 '🌄': 4,
 '☀️': 12,
 '❤️': 1205,
 '😡': 246,
 '😜': 150,
 '😎': 77,
 '😊': 234,
 '🤯': 14,
 '😒': 38,
 '💯': 142,
 '📸': 33,
 '🤦\u200d♀️': 34,
 '⤵️': 23,
 '📷': 8,
 '🤦🏼\u200d♂️': 2,
 '👍🏼': 13,
 '💪🏽': 49,
 '💕': 17,
 '👳': 23,
 '😞': 58,
 '😁': 180,
 '❤': 267,
 '⛽': 33,
 '🤝': 32,
 '❗': 24,
 '😶': 8,
 '✊🏼': 140,
 '4️⃣': 2,
 '😍': 85,
 '🦁': 20,
 '🌻': 28,
 '🌳': 25,
 '🌎': 100,
 '🌿': 38,
 '🌞': 25,
 '🌏': 104,
 '🦋': 20,
 '🏵️':

Then, we'll turn the dictionary into a dataframe for easier handling.

In [62]:
df_emoji = dict_to_df(emoji_dict, ["count"])
df_emoji

Unnamed: 0,count
💪,394
🌾,1298
🚜,1334
🤫,21
🤔,463
...,...
🧑‍🤝‍🧑,1
🎠,1
♾,1
🙅‍♀️,2


Now we have the counts in dataframe form. Next, we take the top 10 and turn the result into a list, as the problem asks for.

In [46]:
q2_result_df = find_top10(df=df_emoji, by="count", reset_index=True)
q2_result_df

Unnamed: 0,index,count
0,🙏,3203
1,😂,1387
2,🚜,1334
3,🌾,1298
4,❤️,1205
5,✊,1110
6,🇮🇳,938
7,🤣,759
8,👍,634
9,🙏🏻,580


In [47]:
q2_result = df_to_list(df=q2_result_df, cols=["index", "count"])
q2_result

[('🙏', 3203),
 ('😂', 1387),
 ('🚜', 1334),
 ('🌾', 1298),
 ('❤️', 1205),
 ('✊', 1110),
 ('🇮🇳', 938),
 ('🤣', 759),
 ('👍', 634),
 ('🙏🏻', 580)]

## Q2 Result

In [63]:
q2_time(file_path)

[('🙏', 3203),
 ('😂', 1387),
 ('🚜', 1334),
 ('🌾', 1298),
 ('❤️', 1205),
 ('✊', 1110),
 ('🇮🇳', 938),
 ('🤣', 759),
 ('👍', 634),
 ('🙏🏻', 580)]

## Q2 Memory Usage

In [64]:
!python -m memory_profiler q2_memory.py "farmers-protest-tweets-2021-2-4.json"

Filename: q2_memory.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    12     89.0 MiB     89.0 MiB           1   @profile
    13                                         def q2_memory(file_path: str) -> List[Tuple[str, int]]:
    14                                         
    15                                             # Extract data
    16   1612.8 MiB   1523.8 MiB           1       df_q2 = extract_json(file_path=file_path, col=["content"])
    17                                         
    18                                             # Make a list of emojis
    19   1612.9 MiB      0.2 MiB      117410       emoji_list = [emoji.distinct_emoji_list(content) for content in df_q2.content if emoji.emoji_count(content)>0]
    20    415.8 MiB  -1197.1 MiB           1       del df_q2
    21                                         
    22                                             # Dict: {emoji_str: count, ...}
    23    415.8 MiB      0.0 MiB           1       emo

As in question 1, most of the memory is used to store the data in the dataframe.

An alternative for storing the data that is lighter than pandas would help reduce the memory usage.

## Q2 Time Consumption

In [65]:
!python -m cProfile q2_time.py "farmers-protest-tweets-2021-2-4.json"

         100665372 function calls (100658895 primitive calls) in 65.086 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(all)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(append)
        6    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(argsort)
        6    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(atleast_2d)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(bincount)
       16    0.000    0.000    0.128    0.008 <__array_function__ internals>:177(concatenate)
       70    0.000    0.000    0.001    0.000 <__array_function__ internals>:177(copyto)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(dot)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(empty_like)
      

This solution takes about 65 seconds to run, and again, the major part is related to operations with the JSON file.

As mentioned earlier, dataframes might not be the best way to handle this data. A better option would be to look for a tool suitable for big data, one that is lightweight, easy to use and fast at extracting and working with the data.

---

# Question 3

**El top 10 histórico de usuarios (username) más influyentes en función del conteo de las menciones (@) que registra cada uno de ellos. Debe incluir las siguientes funciones:**

```python
def q3_time(file_path: str) -> List[Tuple[str, int]]:
```
```python
def q3_memory(file_path: str) -> List[Tuple[str, int]]:
```
```python
Returns: 
[("LATAM321", 387), ("LATAM_CHI", 129), ...]
```

For this exercise, we'll work with the `mentionedUsers` column.

In [3]:
# Load only the `mentionedUsers` column of the data file.
# NOTE(review): the gaps in the index of the output below suggest that tweets
# without mentions are dropped by `extract_json` — confirm against the helper.
df_q3 = extract_json(file_path=file_path, col=["mentionedUsers"])
df_q3

Unnamed: 0,mentionedUsers
0,"[{'username': 'narendramodi', 'displayname': '..."
1,"[{'username': 'Kisanektamorcha', 'displayname'..."
3,"[{'username': 'ReallySwara', 'displayname': 'S..."
8,"[{'username': 'mandeeppunia1', 'displayname': ..."
11,"[{'username': 'mandeeppunia1', 'displayname': ..."
...,...
117382,"[{'username': 'narendramodi', 'displayname': '..."
117388,"[{'username': 'BarackObama', 'displayname': 'B..."
117390,"[{'username': 'SushantBSinha', 'displayname': ..."
117392,"[{'username': 'BeingSalmanKhan', 'displayname'..."


In [5]:
# Look at a single cell to see how one tweet's mentions are structured.
df_q3.mentionedUsers.iloc[0]

[{'username': 'narendramodi',
  'displayname': 'Narendra Modi',
  'id': 18839785,
  'description': None,
  'rawDescription': None,
  'descriptionUrls': None,
  'verified': None,
  'created': None,
  'followersCount': None,
  'friendsCount': None,
  'statusesCount': None,
  'favouritesCount': None,
  'listedCount': None,
  'mediaCount': None,
  'location': None,
  'protected': None,
  'linkUrl': None,
  'linkTcourl': None,
  'profileImageUrl': None,
  'profileBannerUrl': None,
  'url': 'https://twitter.com/narendramodi'},
 {'username': 'DelhiPolice',
  'displayname': '#DilKiPolice Delhi Police',
  'id': 1850705408,
  'description': None,
  'rawDescription': None,
  'descriptionUrls': None,
  'verified': None,
  'created': None,
  'followersCount': None,
  'friendsCount': None,
  'statusesCount': None,
  'favouritesCount': None,
  'listedCount': None,
  'mediaCount': None,
  'location': None,
  'protected': None,
  'linkUrl': None,
  'linkTcourl': None,
  'profileImageUrl': None,
  'prof

We can see that each value in this DataFrame is a list of dictionaries, similar to what we had under the `user` key of the JSON file.

What we'll have to do is run through the lists in all the `mentionedUsers` values and count each mention of a user as it shows up, similar to what was done with the emojis in question 2.

But first, let's keep only the data that is relevant to the problem: in this case, the username of each account mentioned in the tweet.

In [8]:
# Pull the column out of the DataFrame as a plain Python list (one entry per row).
user_list = df_q3["mentionedUsers"].tolist()
user_list

In [10]:
# Tally how many times each username is mentioned across all tweets.
user_dict = {}
for users in user_list:
    for user in users:
        username = user["username"]
        # `get` falls back to 0 the first time a username is seen.
        user_dict[username] = user_dict.get(username, 0) + 1

user_dict

{'narendramodi': 2265,
 'DelhiPolice': 814,
 'Kisanektamorcha': 1840,
 'ReallySwara': 201,
 'rohini_sgh': 123,
 'mandeeppunia1': 260,
 'akshaykumar': 285,
 'taapsee': 114,
 'PetroleumMin': 5,
 'PMOIndia': 1427,
 'ArmaanMalik22': 2,
 'ShekharGupta': 19,
 'khanthefatima': 1,
 'MainaBismee': 1,
 'nsitharaman': 69,
 'AmanJha0508': 1,
 'diljitdosanjh': 500,
 'TheeraSingh': 8,
 'sarahwoodwriter': 1,
 'vivianavigil': 1,
 'punjabisath1': 45,
 'ZeeNews': 422,
 'aajtak': 426,
 'republic': 227,
 'TimesNow': 101,
 'Tractor2twitr': 884,
 'CoryBooker': 35,
 'SenBooker': 18,
 'mandeep_puniaa': 77,
 'MahuaMoitra': 204,
 'Bkuektaugrahan': 241,
 'YouTube': 608,
 'ianuragthakur': 15,
 'BJP4India': 777,
 'INCIndia': 723,
 'Raza_AKhan': 48,
 'Monica_Gill1': 344,
 'mkinthuk': 1,
 'amaanbali': 490,
 'ANI': 329,
 'PTI_News': 222,
 'FanPageHai': 2,
 'emmelianadsilva': 1,
 'DrSRaghavachari': 1,
 'DevilOnline24': 43,
 'hussain_hrw': 14,
 'dhruv_rathee': 120,
 'DominicRaab': 91,
 'UN': 681,
 'rashtrapatibhvn': 38

The following steps are exactly the same as the steps taken in question 2.

In [15]:
# Turn the counts dictionary into a DataFrame; in the output below the usernames
# form the index and the mention counts sit in a single `count` column.
df_user = dict_to_df(user_dict, ["count"])
df_user

Unnamed: 0,count
narendramodi,2265
DelhiPolice,814
Kisanektamorcha,1840
ReallySwara,201
rohini_sgh,123
...,...
Gurinde22863579,1
Gymshark,2
_AhmedQuraishi,1
afridi792,1


In [12]:
# Select the top 10 users by `count`; in the output below the usernames end up
# in an `index` column and the rows are ordered from most to least mentioned.
# NOTE(review): the meaning of the third positional argument (`True`) is not
# visible here — confirm against the `find_top10` helper.
df_q3_result = find_top10(df_user, "count", True)
df_q3_result

Unnamed: 0,index,count
0,narendramodi,2265
1,Kisanektamorcha,1840
2,RakeshTikaitBKU,1644
3,PMOIndia,1427
4,RahulGandhi,1146
5,GretaThunberg,1048
6,RaviSinghKA,1019
7,rihanna,986
8,UNHumanRights,962
9,meenaharris,926


In [13]:
# Convert the top-10 frame into the required list of (username, count) tuples.
q3_result = df_to_list(df_q3_result, ["index", "count"])
q3_result

[('narendramodi', 2265),
 ('Kisanektamorcha', 1840),
 ('RakeshTikaitBKU', 1644),
 ('PMOIndia', 1427),
 ('RahulGandhi', 1146),
 ('GretaThunberg', 1048),
 ('RaviSinghKA', 1019),
 ('rihanna', 986),
 ('UNHumanRights', 962),
 ('meenaharris', 926)]

## Q3 Result

In [21]:
# Run the packaged solution; it returns the same list as the step-by-step result above.
q3_time(file_path)

[('narendramodi', 2265),
 ('Kisanektamorcha', 1840),
 ('RakeshTikaitBKU', 1644),
 ('PMOIndia', 1427),
 ('RahulGandhi', 1146),
 ('GretaThunberg', 1048),
 ('RaviSinghKA', 1019),
 ('rihanna', 986),
 ('UNHumanRights', 962),
 ('meenaharris', 926)]

## Q3 Memory Usage

In [22]:
!python -m memory_profiler q3_memory.py "farmers-protest-tweets-2021-2-4.json"

Filename: q3_memory.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    12     76.9 MiB     76.9 MiB           1   @profile
    13                                         def q3_memory(file_path: str) -> List[Tuple[str, int]]:
    14                                         
    15   1546.9 MiB   1470.0 MiB           1       df_q3 = extract_json(file_path=file_path, col=["mentionedUsers"])
    16                                         
    17   1546.9 MiB      0.0 MiB           1       user_list = df_q3.mentionedUsers.tolist()
    18   1546.9 MiB     -0.0 MiB           1       del df_q3
    19                                         
    20   1546.9 MiB      0.0 MiB           1       user_dict = {}
    21   1551.3 MiB      0.0 MiB       38035       for users in user_list:
    22   1551.3 MiB      0.0 MiB      141437           for user in users:
    23   1551.3 MiB      4.3 MiB      103403               username = user["username"]
    24   1551.3 MiB      0.0 MiB      

## Q3 Time Consumption

In [23]:
!python -m cProfile q3_time.py "farmers-protest-tweets-2021-2-4.json"

         734963 function calls (728503 primitive calls) in 14.983 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(all)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(append)
        6    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(argsort)
        6    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(atleast_2d)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(bincount)
       16    0.000    0.000    0.128    0.008 <__array_function__ internals>:177(concatenate)
       70    0.000    0.000    0.001    0.000 <__array_function__ internals>:177(copyto)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(dot)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(empty_like)
        1   