# Lector Objetos de Tweets


### 1. Importing Required Libraries

In [1]:
import zipfile
from pandas.io.json import json_normalize
import pandas as pd
import re

### 2. Data

#### 2.1. UnZip

Primero extraemos los datos de la carpeta .zip para poder leer los datos

In [2]:
# Location of the downloaded archive and the folder it is unpacked into.
DIR_ZIP = './data/archive.zip'
DIR_UNZIP = './data'

# Unpack every member of the archive; the context manager closes the file
# handle even if extraction fails.
with zipfile.ZipFile(DIR_ZIP, mode='r') as zip_ref:
    zip_ref.extractall(path=DIR_UNZIP)

#### 2.2. Data Loading

In [2]:
def getJsonData(file_name, lang='en'):
    """Load a JSON-lines tweet dump and keep only the tweets in one language.

    Parameters
    ----------
    file_name : str or path-like
        Path to a file containing one JSON object per line.
    lang : str, default 'en'
        Value of the ``lang`` column a row must have to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``lang`` equals ``lang``. Row labels from the full
        file are preserved (the index is not reset).

    Raises
    ------
    KeyError
        If the file has no ``lang`` column.
    """
    data = pd.read_json(file_name, lines=True)
    # Return an explicit copy: callers add and overwrite columns on the result,
    # which on a filtered slice would raise SettingWithCopyWarning.
    return data[data['lang'] == lang].copy()

In [4]:
FILE_DIR = './data/farmers-protest-tweets-2021-03-5.json'

# Read the JSON file containing the tweet data and remove tweets that are not in English

raw_tweets = getJsonData(FILE_DIR)
print("Shape: ", raw_tweets.shape)
raw_tweets.head(5)

Shape:  (417511, 21)


Unnamed: 0,url,date,content,renderedContent,id,user,outlinks,tcooutlinks,replyCount,retweetCount,...,quoteCount,conversationId,lang,source,sourceUrl,sourceLabel,media,retweetedTweet,quotedTweet,mentionedUsers
0,https://twitter.com/ShashiRajbhar6/status/1376...,2021-03-30 03:33:46+00:00,Support 👇\n\n#FarmersProtest,Support 👇\n\n#FarmersProtest,1376739399593910273,"{'username': 'ShashiRajbhar6', 'displayname': ...",[],[],0,0,...,0,1376739399593910273,en,"<a href=""http://twitter.com/download/android"" ...",http://twitter.com/download/android,Twitter for Android,,,,
1,https://twitter.com/kaursuk06272818/status/137...,2021-03-30 03:33:23+00:00,Supporting farmers means supporting our countr...,Supporting farmers means supporting our countr...,1376739306287427584,"{'username': 'kaursuk06272818', 'displayname':...",[],[],0,0,...,0,1376739306287427584,en,"<a href=""http://twitter.com/download/android"" ...",http://twitter.com/download/android,Twitter for Android,[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,
2,https://twitter.com/kaursuk06272818/status/137...,2021-03-30 03:31:00+00:00,Support farmers if you are related to food #St...,Support farmers if you are related to food #St...,1376738704128020488,"{'username': 'kaursuk06272818', 'displayname':...",[],[],0,0,...,0,1376738704128020488,en,"<a href=""http://twitter.com/download/android"" ...",http://twitter.com/download/android,Twitter for Android,[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,
3,https://twitter.com/SukhdevSingh_/status/13767...,2021-03-30 03:30:45+00:00,#StopHateAgainstFarmers support #FarmersProtes...,#StopHateAgainstFarmers support #FarmersProtes...,1376738640542400518,"{'username': 'SukhdevSingh_', 'displayname': '...",[],[],0,1,...,0,1376738640542400518,en,"<a href=""http://twitter.com/download/android"" ...",http://twitter.com/download/android,Twitter for Android,,,,
4,https://twitter.com/Davidmu66668113/status/137...,2021-03-30 03:30:30+00:00,"You hate farmers I hate you, \nif you love the...","You hate farmers I hate you, \nif you love the...",1376738579171344386,"{'username': 'Davidmu66668113', 'displayname':...",[],[],0,0,...,0,1376738579171344386,en,"<a href=""http://twitter.com/download/android"" ...",http://twitter.com/download/android,Twitter for Android,,,,


#### 2.3. Pre-process Data

In [5]:
# Keep only the columns the rankings use. The explicit .copy() detaches Tweets
# from raw_tweets, so the helpers that assign columns in place do not raise
# SettingWithCopyWarning and raw_tweets is never affected.
Tweets = raw_tweets[['date', 'content', 'renderedContent', 'user', 'retweetCount']].copy()

In [3]:
def focusOnUserAttr(col, attr):
    """Build a transformer that replaces each value of ``col`` by ``value[attr]``.

    The returned function takes a DataFrame, overwrites ``df[col]`` in place
    with the looked-up values, and returns that same DataFrame.
    """
    def pick(record):
        # Subscript lookup, e.g. a dict key.
        return record[attr]

    def func(df):
        extracted = df[col].apply(pick)
        df[col] = extracted
        return df

    return func

#focusOnUserAttr('user', 'username')(Tweets).head(5)

In [4]:
def normalizeDate(df):
    """Reduce the ``date`` column to calendar dates, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column of timestamps (or anything
        ``pd.to_datetime`` can parse, including existing ``datetime.date``
        values).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``date`` holding ``datetime.date`` values.
    """
    # pd.to_datetime makes the call safe to repeat on an already-normalised
    # column; the .dt accessor replaces a per-row Python lambda.
    df['date'] = pd.to_datetime(df['date']).dt.date
    return df

#normalizeDate(Tweets).head(5)

In [5]:
def focusOnHashtags(df):
    """Add a ``hashtag`` column listing the hashtags found in ``renderedContent``.

    Each entry is the list of word-character runs that follow a ``#`` in the
    tweet text (without the ``#``). The column is written in place and the
    same DataFrame is returned.
    """
    hashtag_pattern = re.compile(r"#(\w+)")
    df['hashtag'] = df['renderedContent'].apply(hashtag_pattern.findall)
    return df

# Build the pre-processed frame used by the ranking cells below. The raw 'user'
# values are dicts, which groupby('user') cannot hash, and the hashtag ranking
# reads a 'hashtag' column, so these steps must run before any ranking.
# Tweets is rebuilt from raw_tweets each time, so re-running this cell is safe.
Tweets = focusOnHashtags(
    normalizeDate(
        focusOnUserAttr('user', 'username')(
            raw_tweets[['date', 'content', 'renderedContent', 'user', 'retweetCount']].copy()
        )
    )
)
Tweets.head(5)

### 3. Ranking

In [6]:
def topTen(func, df, n=10):
    """Apply ``func`` to ``df`` and keep the first ``n`` rows of the result.

    Parameters
    ----------
    func : callable
        Takes ``df`` and returns an object with a ``head`` method (typically a
        sorted DataFrame).
    df : pandas.DataFrame
        Input passed to ``func`` unchanged.
    n : int, default 10
        Number of leading rows to keep.

    Returns
    -------
    The first ``n`` rows of ``func(df)``; fewer if the result is shorter.
    """
    return func(df).head(n)

In [7]:
def changeDF(func, df):
    """Call ``func`` with ``df`` and return whatever ``func`` returns."""
    transformed = func(df)
    return transformed

#### 3.1. Filters

##### 3.1.a Most Re-Tweeted

In [8]:
def mostReTweeted(df):
    """Return a copy of ``df`` ordered by ``retweetCount``, highest first."""
    ordered = df.sort_values('retweetCount', ascending=False)
    return ordered

In [9]:
def getTopTenReTweeted(df):
    """Return a zero-argument callable producing the ten most retweeted tweets.

    The callable ranks ``df`` with ``mostReTweeted`` each time it is invoked
    and returns only the ``content`` and ``retweetCount`` columns.
    """
    shown_columns = ['content', 'retweetCount']

    def fun():
        ranking = topTen(mostReTweeted, df)
        return ranking[shown_columns]

    return fun

In [13]:
# Build the report closure for Tweets and call it right away; the returned
# frame is the cell's output.
getTopTenReTweeted(Tweets)()

Unnamed: 0,content,retweetCount
408128,why aren’t we talking about this?! #FarmersPro...,315547
395142,We stand in solidarity with the #FarmersProtes...,103957
266196,I still #StandWithFarmers and support their pe...,67694
366579,"“Paid actors,” huh? Quite the casting director...",35921
372793,What in the human rights violations is going o...,26972
314192,"Happy to share that I’ve donated $10,000 to pr...",23251
215034,There has been much social media coverage arou...,20132
398011,Farmers feed the world. Fight for them. Protec...,18744
325261,Should be talking about this! #FarmersProtest\...,17368
163689,To all of my influencer/celeb friends- read up...,15677


##### 3.1.b Most Tweeted

In [10]:
def countTweetsPerUser(df):
    """Count non-null values per ``user`` and order by the ``content`` count.

    Returns a DataFrame indexed by ``user`` whose columns hold per-user
    non-null counts, sorted so the largest ``content`` count comes first.
    """
    per_user = df.groupby('user').count()
    return per_user.sort_values(by='content', ascending=False)

In [11]:
def getTopTenTweetsPerUser(df):
    """Return a zero-argument callable producing the ten most active users.

    The callable ranks ``df`` with ``countTweetsPerUser`` each time it is
    invoked and returns only the ``content`` count column.
    """
    def fun():
        ranking = topTen(countTweetsPerUser, df)
        return ranking[['content']]

    return fun

In [16]:
# Build the report closure for Tweets and call it right away; the returned
# frame is the cell's output.
getTopTenTweetsPerUser(Tweets)()

Unnamed: 0_level_0,content
user,Unnamed: 1_level_1
harjot_tweeting,7134
tasveersandhu,2088
shells_n_petals,1991
jot__b,1841
rebelpacifist,1803
rumsomal,1722
Iamjazzie96,1491
Jass_k_G,1458
DigitalKisanBot,1453
z_khalique007,1446


##### 3.1.c Days Most Tweeted

In [12]:
def countTweetsPerDay(df):
    """Count non-null values per ``date`` and order by the ``content`` count.

    Returns a DataFrame indexed by ``date`` whose columns hold per-date
    non-null counts, sorted so the largest ``content`` count comes first.
    """
    per_day = df.groupby('date').count()
    return per_day.sort_values(by="content", ascending=False)

In [13]:
def getTopTenTweetsPerDay(df):
    """Return a zero-argument callable producing the ten busiest dates.

    The callable ranks ``df`` with ``countTweetsPerDay`` each time it is
    invoked and returns only the ``content`` count column.
    """
    def fun():
        ranking = topTen(countTweetsPerDay, df)
        return ranking[["content"]]

    return fun

In [19]:
# Build the report closure for Tweets and call it right away; the returned
# frame is the cell's output.
getTopTenTweetsPerDay(Tweets)()

Unnamed: 0_level_0,content
date,Unnamed: 1_level_1
2021-02-03,83403
2021-02-04,58300
2021-02-05,33165
2021-02-02,28440
2021-02-06,22298
2021-02-07,11244
2021-02-09,9269
2021-02-08,8863
2021-02-10,7938
2021-02-11,5668


##### 3.1.d Most Used Hashtags

In [14]:
def count_items(hash):
    """Return a function that tallies the elements of a list into ``hash``.

    ``hash`` is a mapping from element to occurrence count. It is updated in
    place on every call of the returned function, which itself returns None.
    """
    def fun(list_):
        for item in list_:
            # Unseen items start at zero, so the first sighting stores 1.
            hash[item] = hash.get(item, 0) + 1

    return fun

In [15]:
def applyCountHashtags(df):
    """Count how often each hashtag occurs across the ``hashtag`` column.

    Every entry of ``df['hashtag']`` is fed to a ``count_items`` tally.
    Returns a dict mapping hashtag to number of occurrences, keyed in order of
    first appearance.
    """
    tally = {}
    record = count_items(tally)
    df['hashtag'].apply(record)
    return tally

In [16]:
def showHashtagCount(df):
    """Return hashtag frequencies as a DataFrame sorted by ``count``, highest first.

    The frame has one row per distinct hashtag with columns ``hashtag`` and
    ``count``, built from the dict returned by ``applyCountHashtags``.
    """
    counts = applyCountHashtags(df)
    table = pd.DataFrame(counts.items(), columns=['hashtag', 'count'])
    return table.sort_values(by='count', ascending=False)

In [17]:
def getTopTenHashtags(df):
    """Return a zero-argument callable producing the ten most used hashtags.

    The callable ranks ``df`` with ``showHashtagCount`` each time it is invoked.
    """
    def fun():
        ranking = topTen(showHashtagCount, df)
        return ranking

    return fun

In [24]:
# Build the report closure for Tweets and call it right away; the returned
# frame is the cell's output. Tweets must already have a 'hashtag' column.
getTopTenHashtags(Tweets)()

Unnamed: 0,hashtag,count
0,FarmersProtest,404687
97,IStandWithFarmers,15713
7,farmersprotest,15378
825,IndianFarmersHumanRights,11934
554,FarmersAreIndia,10985
163,StandWithFarmers,10612
2340,Rihanna,9088
18,FarmersProtests,8707
15,Farmers,6541
6110,shameonbollywood,6222


### 4. Main Function

In [18]:
def getInput(message):
    """Prompt the user with ``message`` and return the line they type."""
    answer = input(message)
    return answer

In [19]:
def inputRules(rulesList):
    """Build a validator that accepts the values contained in ``rulesList``.

    The returned function lower-cases its argument before the membership
    test, so matching ignores case as long as ``rulesList`` holds lower-case
    entries. It returns True or False.
    """
    def inputValidation(input):
        normalized = input.lower()
        return normalized in rulesList

    return inputValidation

In [20]:
def InputController(rules):
    """Build a prompt loop that only returns input accepted by ``rules``.

    ``rules`` is a predicate called with the typed text. The returned function
    keeps prompting, printing "Invalid input" after each rejected entry, and
    returns the first accepted entry exactly as typed.
    """
    def getNewInput():
        candidate = getInput("Enter a value: ")
        while not rules(candidate):
            print("Invalid input")
            candidate = getInput("Enter a value: ")
        return candidate

    return getNewInput

In [21]:
def getData(file_dir='./data/farmers-protest-tweets-2021-03-5.json'):
    """Load the tweet dump and return the pre-processed frame used by the rankings.

    Parameters
    ----------
    file_dir : str, default './data/farmers-protest-tweets-2021-03-5.json'
        Path of the JSON-lines file passed to ``getJsonData``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``content``, ``renderedContent``, ``user``,
        ``retweetCount`` and ``hashtag``. ``user`` is reduced to the username,
        ``date`` is normalised by ``normalizeDate``, and ``hashtag`` lists the
        hashtags found in ``renderedContent``.
    """
    columns = ['date', 'content', 'renderedContent', 'user', 'retweetCount']
    # Copy the column selection: the helpers below assign columns in place,
    # which on a slice of the loaded frame raises SettingWithCopyWarning.
    data = getJsonData(file_dir)[columns].copy()
    focusOnUserAttr('user', 'username')(data)
    normalizeDate(data)
    focusOnHashtags(data)
    return data

In [22]:
def main():
    """Run the interactive menu that prints the top-ten reports.

    Loads the data once with ``getData``, then repeatedly shows the menu,
    reads a validated choice and prints the selected report, until the user
    enters ``exit`` (in any letter case).
    """
    data = getData()
    # A single table ties each menu key to its label and report, so the two
    # can never drift apart the way a separate list and dict could.
    actions = {
        '1': ('getTopTenReTweeted', getTopTenReTweeted(data)),
        '2': ('getTopTenTweetsPerUser', getTopTenTweetsPerUser(data)),
        '3': ('getTopTenTweetsPerDay', getTopTenTweetsPerDay(data)),
        '4': ('getTopTenHashtags', getTopTenHashtags(data)),
    }
    rulesList = {key: label for key, (label, _) in actions.items()}
    rulesList['exit'] = 'exit'
    inputValidator = inputRules(rulesList.keys())
    rulesStringlist = "".join(key + ' for: ' + value + "\n" for key, value in rulesList.items())
    inputController = InputController(inputValidator)
    while True:
        print('\n\nEnter one of the following: \n' + rulesStringlist)
        # The validator ignores case but the controller returns the text as
        # typed; normalise it so "EXIT" or "Exit" match the lower-case keys.
        choice = inputController().lower()
        if choice == 'exit':
            print("Exiting")
            break
        label, report = actions[choice]
        print("You choose: ", label)
        # Only the report is guarded: a failing report should not end the
        # session, while a failure of the prompt itself (e.g. end of input)
        # must propagate instead of being retried forever.
        try:
            print(report())
        except Exception as e:
            print(e)
            print("Invalid action")

In [23]:
# Start the interactive menu when this code runs as the top-level module.
if __name__ == "__main__":
    main()



Enter one of the following: 
1 for: getTopTenReTweeted
2 for: getTopTenTweetsPerUser
3 for: getTopTenTweetsPerDay
4 for: getTopTenHashtags
exit for: exit

You choose:  getTopTenReTweeted
                                                  content  retweetCount
408128  why aren’t we talking about this?! #FarmersPro...        315547
395142  We stand in solidarity with the #FarmersProtes...        103957
266196  I still #StandWithFarmers and support their pe...         67694
366579  “Paid actors,” huh? Quite the casting director...         35921
372793  What in the human rights violations is going o...         26972
314192  Happy to share that I’ve donated $10,000 to pr...         23251
215034  There has been much social media coverage arou...         20132
398011  Farmers feed the world. Fight for them. Protec...         18744
325261  Should be talking about this! #FarmersProtest\...         17368
163689  To all of my influencer/celeb friends- read up...         15677


Enter one of the 