## Problem statement
Write a Python script that fetches all the tweets (as many as the Twitter
API allows) posted by the [midas@IIITD](https://twitter.com/midasIIITD) Twitter handle and dumps the responses into a JSON Lines file.
The second part of the script should parse this JSON Lines file and display the
following for every tweet in a tabular format:
* The text of the tweet.
* Date and time of the tweet.
* The number of favorites/likes.
* The number of retweets.
* The number of images present in the tweet, or None if the tweet has no images.

### Imports

In [18]:
import tweepy
from tweepy import OAuthHandler
import json
import os
import pandas as pd
from datetime import datetime

In [2]:
# Path of the JSON dump, placed in the current working directory.
# Joining os.getcwd() with an absolute path silently discards the cwd part,
# and a user-specific absolute path only works on one machine, so the file
# name is kept relative to the working directory instead.
FILE_PATH = os.path.join(os.getcwd(), "tweets.json")

### Authorization

In [3]:
# Twitter API credentials. Real values are read from environment variables so
# they never have to be pasted into (and shared with) the notebook; "#" stays
# as the placeholder whenever a variable is not set.
ACCESS_TOKEN = os.environ.get("TWITTER_ACCESS_TOKEN", "#")
ACCESS_TOKEN_SECRET = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "#")
CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY", "#")
CONSUMER_SECRET_KEY = os.environ.get("TWITTER_CONSUMER_SECRET_KEY", "#")

In [4]:
# Authenticate with the consumer key pair, then attach the user access token.
auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET_KEY)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# wait_on_rate_limit makes the client sleep until the rate-limit window resets
# instead of raising, so a long paginated fetch is not aborted half-way.
api = tweepy.API(auth, wait_on_rate_limit=True)

## First Part - Fetch Tweets and Dump them into JSON Line file

In [5]:
def fetch_and_dump_tweets(screen_name):
    '''
    Fetch every available tweet of a user and dump them to FILE_PATH as JSON.

    The timeline is paged backwards, 200 tweets per request: every request
    after the first passes max_id = (id of the oldest tweet collected so far) - 1,
    and paging stops at the first empty page.

    screen_name: Twitter handle of the user
    returns: list of all fetched tweet objects, in the order the API returned them
             (an empty list if the timeline has no tweets)
    '''
    all_tweets = []
    request_args = {'screen_name': screen_name, 'count': 200}

    while True:
        tweets = api.user_timeline(**request_args)
        # An empty page means the timeline is exhausted. Checking before
        # indexing also keeps an account without tweets from raising IndexError.
        if not tweets:
            break
        all_tweets.extend(tweets)
        # Ask only for tweets strictly older than the oldest one seen so far.
        request_args['max_id'] = all_tweets[-1].id - 1

    print(f"Total number of tweets from {screen_name} are {len(all_tweets)}")

    # Dump the raw API payload of every tweet as one JSON array.
    all_tweets_json = [tweet._json for tweet in all_tweets]
    with open(FILE_PATH, 'w', encoding='utf8') as f:
        json.dump(all_tweets_json, f, sort_keys = True,indent = 4)
    return all_tweets

In [6]:
# Fetch the midasIIITD timeline; the call also writes the dump to FILE_PATH.
tweets_by_MIDAS = fetch_and_dump_tweets("midasIIITD")

Total number of tweets from midasIIITD are 294


In [7]:
# Read the dump back from disk so the next cells work on the stored JSON.
with open(FILE_PATH) as dump_file:
    tweets_by_MIDAS = json.loads(dump_file.read())

In [8]:
# Inspect the entities of one sample tweet: in the output below there is no
# 'media' key, only a twitter.com/i/web/status/... URL.
tweets_by_MIDAS[10]['entities']

{'hashtags': [],
 'symbols': [],
 'urls': [{'display_url': 'twitter.com/i/web/status/1…',
   'expanded_url': 'https://twitter.com/i/web/status/1105478029147553792',
   'indices': [110, 133],
   'url': 'https://t.co/XEkcYO8KmW'}],
 'user_mentions': [{'id': 1021355762575073281,
   'id_str': '1021355762575073281',
   'indices': [23, 34],
   'name': 'MIDAS IIITD',
   'screen_name': 'midasIIITD'}]}

In [9]:
def fetch_and_dump_tweets(screen_name):
    '''
    Fetch every available tweet of a user in extended mode and dump them to
    FILE_PATH as JSON.

    Every request is made with tweet_mode='extended'. The timeline is paged
    backwards, 200 tweets per request: every request after the first passes
    max_id = (id of the oldest tweet collected so far) - 1, and paging stops
    at the first empty page.

    screen_name: Twitter handle of the user
    returns: list of all fetched tweet objects, in the order the API returned them
             (an empty list if the timeline has no tweets)
    '''
    all_tweets = []
    request_args = {'screen_name': screen_name, 'count': 200, 'tweet_mode': 'extended'}

    while True:
        tweets = api.user_timeline(**request_args)
        # An empty page means the timeline is exhausted. Checking before
        # indexing also keeps an account without tweets from raising IndexError.
        if not tweets:
            break
        all_tweets.extend(tweets)
        # Ask only for tweets strictly older than the oldest one seen so far.
        request_args['max_id'] = all_tweets[-1].id - 1

    print(f"Total number of tweets from {screen_name} are {len(all_tweets)}")

    # Dump the raw API payload of every tweet as one JSON array.
    all_tweets_json = [tweet._json for tweet in all_tweets]
    with open(FILE_PATH, 'w', encoding='utf8') as f:
        json.dump(all_tweets_json, f, sort_keys = True,indent = 4)
    return all_tweets

In [10]:
# Fetch again with the redefined function (tweet_mode='extended'); this
# overwrites the dump at FILE_PATH.
tweets_by_MIDAS = fetch_and_dump_tweets("midasIIITD")

Total number of tweets from midasIIITD are 294


In [11]:
# Read the refreshed dump back from disk as plain JSON data.
with open(FILE_PATH) as dump_file:
    tweets_by_MIDAS = json.loads(dump_file.read())

In [12]:
# Same sample tweet after the extended-mode fetch: the entities shown below
# now contain hashtags and a 'media' entry of type 'photo'.
tweets_by_MIDAS[10]['entities']

{'hashtags': [{'indices': [111, 116], 'text': 'team'},
  {'indices': [117, 126], 'text': 'research'},
  {'indices': [127, 130], 'text': 'AI'},
  {'indices': [131, 134], 'text': 'ML'},
  {'indices': [135, 144], 'text': 'projects'}],
 'media': [{'display_url': 'pic.twitter.com/lN7hItwPO9',
   'expanded_url': 'https://twitter.com/midasIIITD/status/1105478029147553792/photo/1',
   'id': 1105477322264772610,
   'id_str': '1105477322264772610',
   'indices': [145, 168],
   'media_url': 'http://pbs.twimg.com/media/D1dxxHzXgAIeNSE.jpg',
   'media_url_https': 'https://pbs.twimg.com/media/D1dxxHzXgAIeNSE.jpg',
   'sizes': {'large': {'h': 1338, 'resize': 'fit', 'w': 2048},
    'medium': {'h': 784, 'resize': 'fit', 'w': 1200},
    'small': {'h': 444, 'resize': 'fit', 'w': 680},
    'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
   'type': 'photo',
   'url': 'https://t.co/lN7hItwPO9'}],
 'symbols': [],
 'urls': [],
 'user_mentions': [{'id': 1021355762575073281,
   'id_str': '1021355762575073281'

In [13]:
# 'extended_entities' holds the tweet's 'media' list; parse_tweets counts the
# items of type 'photo' from this list.
tweets_by_MIDAS[10]['extended_entities']

{'media': [{'display_url': 'pic.twitter.com/lN7hItwPO9',
   'expanded_url': 'https://twitter.com/midasIIITD/status/1105478029147553792/photo/1',
   'id': 1105477322264772610,
   'id_str': '1105477322264772610',
   'indices': [145, 168],
   'media_url': 'http://pbs.twimg.com/media/D1dxxHzXgAIeNSE.jpg',
   'media_url_https': 'https://pbs.twimg.com/media/D1dxxHzXgAIeNSE.jpg',
   'sizes': {'large': {'h': 1338, 'resize': 'fit', 'w': 2048},
    'medium': {'h': 784, 'resize': 'fit', 'w': 1200},
    'small': {'h': 444, 'resize': 'fit', 'w': 680},
    'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
   'type': 'photo',
   'url': 'https://t.co/lN7hItwPO9'}]}

## Second Part - Parse the JSON Lines file and get the required information in tabular form

In [43]:
def _load_tweets(path):
    '''Read tweet dicts from path, accepting a JSON array or a JSON Lines file.'''
    with open(path, encoding='utf8') as json_file:
        raw = json_file.read()
    try:
        loaded = json.loads(raw)
    except json.JSONDecodeError:
        # Not a single JSON document: treat the file as JSON Lines
        # (one tweet object per non-blank line).
        return [json.loads(line) for line in raw.splitlines() if line.strip()]
    # A JSON Lines file holding exactly one tweet parses as a single object.
    return [loaded] if isinstance(loaded, dict) else loaded


def _count_images(tweet):
    '''Return the number of photos attached to tweet, or None if there are none.'''
    if 'media' not in tweet.get('entities', {}):
        return None
    # Prefer extended_entities; fall back to entities when the payload has
    # no extended_entities block.
    media = tweet.get('extended_entities', {}).get('media') or tweet['entities']['media']
    image_count = sum(1 for item in media if item.get('type') == 'photo')
    # Media without any photo (e.g. only a video) also counts as "no image".
    return image_count or None


def parse_tweets(FILE_PATH):
    '''
    Parse a tweet dump and extract the fields needed for the summary table.

    FILE_PATH: path of the dump, either a JSON array of tweets or a JSON Lines
               file with one tweet per line
    returns: list with one dict per tweet, holding the keys 'Text',
             'Date and Time', 'Number of Likes', 'Number of Retweets' and
             'Number of Images' (None when the tweet has no image)
    raises: KeyError if a tweet lacks one of the required fields
    '''
    tweet_list = []
    for tweet in _load_tweets(FILE_PATH):
        tweet_info = dict()

        # Extended-mode dumps carry 'full_text'; dumps made without it only have 'text'.
        tweet_info['Text'] = tweet['full_text'] if 'full_text' in tweet else tweet['text']
        tweet_info['Date and Time'] = tweet['created_at']
        tweet_info['Number of Likes'] = tweet['favorite_count']
        tweet_info['Number of Retweets'] = tweet['retweet_count']
        tweet_info['Number of Images'] = _count_images(tweet)

        tweet_list.append(tweet_info)
    return tweet_list

In [44]:
# Parse the dump at FILE_PATH into a list with one summary dict per tweet.
MIDAS_info = parse_tweets(FILE_PATH)

In [45]:
# Spot-check a single parsed record.
MIDAS_info[1]

{'Date and Time': 'Sun Mar 17 14:22:04 +0000 2019',
 'Number of Images': None,
 'Number of Likes': 14,
 'Number of Retweets': 4,
 'Text': 'Congratulations @midasIIITD team, Rohan, Pradyumn, Ramit, @debanjanbhucs, @MADAIguy and @RatnRajiv for getting their paper titled, "SNAP-BATNET: Cascading Author Profiling and Social Network Graphs for Suicide Ideation Detection on Social Media", accepted at @NAACLHLT SRW.'}

In [47]:
# Build the table: one row per tweet, one column per extracted field.
MIDAS_df = pd.DataFrame(MIDAS_info)

In [48]:
# Display the resulting table.
MIDAS_df

Unnamed: 0,Date and Time,Number of Images,Number of Likes,Number of Retweets,Text
0,Mon Mar 18 02:27:47 +0000 2019,,6,3,BigMM 2019 : IEEE BigMM 2019 – Call for Worksh...
1,Sun Mar 17 14:22:04 +0000 2019,,14,4,"Congratulations @midasIIITD team, Rohan, Prady..."
2,Sat Mar 16 14:06:56 +0000 2019,,6,0,We have emailed the task details to all shortl...
3,Sat Mar 16 09:20:29 +0000 2019,,1,1,IEEE BigMM 2019 - Call for Workshop Proposals....
4,Sat Mar 16 09:14:58 +0000 2019,,7,2,"Congratulations! Arijit, Ramit, @debanjanbhucs..."
5,Sat Mar 16 05:13:14 +0000 2019,,7,2,We will be releasing a very interesting task t...
6,Wed Mar 13 17:09:44 +0000 2019,,0,2,RT @hcdiiitd: Last day to register for #Portfo...
7,Wed Mar 13 04:11:24 +0000 2019,1.0,1,0,@ACMMM19 @sigmm @TheOfficialACM @acmmmsys @ACM...
8,Wed Mar 13 04:06:04 +0000 2019,,0,13,RT @ACMMM19: The paper deadline is approaching...
9,Tue Mar 12 17:43:44 +0000 2019,,0,69,RT @kaggle: Bookmark this amazing library of i...
