# Scraping tweets related to COVID-19


In [None]:
# Mount Google Drive at /gdrive (Colab-only API) so Drive-hosted paths used
# later in this notebook resolve.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
import sys
# Put the project folder on the mounted Drive onto the module search path.
# NOTE(review): no visible cell imports a module from this folder — confirm it is still needed.
sys.path.append('/gdrive/My Drive/BTP/Code/Scraping Twitter Data')

In [None]:
import copy
import difflib
import os
import pickle
import re
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tweepy

In [None]:
# Twitter API credentials. Values are read from environment variables so real
# secrets never need to be typed into (or saved with) the notebook. The
# "xxx..." strings are placeholder fallbacks kept for backward compatibility.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "xxxxxxxxx")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "xxxxxxxxxxx")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "xxxxxxxxxxxx")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "xxxxxxxxxxxxxx")

# Build the authenticated client; `api` is the module-level handle the
# scraping functions in this notebook call.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [None]:
def get_list_of_user_mentions(tweet):
    """Return the screen names of the users mentioned in a tweet.

    Reads ``tweet._json['entities']['user_mentions']`` and collects the
    ``'screen_name'`` of every entry, in order.

    Args:
        tweet: object exposing the raw tweet payload as a dict in ``_json``.

    Returns:
        list: mentioned screen names. Empty when the payload has no mention
        entities; if an entry is malformed, the names collected before it
        are returned.
    """
    user_handles = []

    try:
        for mention in tweet._json['entities']['user_mentions']:
            user_handles.append(mention['screen_name'])
    except (AttributeError, KeyError, TypeError):
        # Missing or malformed entities are expected and mean "no (more)
        # mentions". Anything else (e.g. KeyboardInterrupt) must propagate.
        pass

    return user_handles

In [None]:
def get_list_of_photos(tweet):
    """Return the HTTPS URLs of the media attached to a tweet.

    Reads ``tweet._json['entities']['media']`` and collects the
    ``'media_url_https'`` of every entry, in order.

    Args:
        tweet: object exposing the raw tweet payload as a dict in ``_json``.

    Returns:
        list: media URLs. Empty when the payload has no ``'media'`` entities;
        if an entry is malformed, the URLs collected before it are returned.
    """
    photos_list = []

    try:
        for media in tweet._json['entities']['media']:
            photos_list.append(media['media_url_https'])
    except (AttributeError, KeyError, TypeError):
        # A missing 'media' key simply means the tweet carries no media.
        # Only these lookup errors are swallowed; anything else propagates.
        pass

    return photos_list

In [None]:
def get_replies_count(tweet,user_handle):
    """Count distinct replies to ``tweet`` among recent tweets sent to its author.

    Searches recent tweets matching ``to:<user_handle>`` that are newer than
    ``tweet`` and keeps those whose ``in_reply_to_status_id_str`` equals
    ``tweet.id_str``. Replies with identical text are counted once.

    Args:
        tweet: tweepy status whose ``id`` and ``id_str`` identify the tweet.
        user_handle: screen name used to build the ``to:<user_handle>`` query.

    Returns:
        int: number of distinct reply texts found, or 0 if the Twitter API
        call fails.
    """
    reply_texts = set()  # a set de-duplicates identical reply texts
    query = 'to:' + user_handle

    try:
        # since_id limits the search to tweets newer than the one inspected.
        # NOTE(review): `timeout` is forwarded to api.search unchanged from the
        # previous implementation — confirm tweepy actually honours it there.
        cursor = tweepy.Cursor(api.search, q=query, since_id=tweet.id,
                               result_type='recent', tweet_mode='extended',
                               timeout=999999)
        for reply in cursor.items(1000):
            if getattr(reply, 'in_reply_to_status_id_str', None) == tweet.id_str:
                reply_texts.add(reply.full_text)
    except tweepy.TweepError as e:
        # Best effort: an API failure yields a count of 0, but is reported.
        print("Tweepy error : " + str(e))
        return 0

    return len(reply_texts)

In [None]:
def get_all_hashtags(tweet):
    """Return the text of every hashtag entity attached to a tweet.

    Args:
        tweet: object exposing the raw tweet payload as a dict in ``_json``;
            ``_json['entities']['hashtags']`` must be present.

    Returns:
        list: the ``'text'`` of each hashtag entity, in payload order.

    Raises:
        KeyError: if the payload lacks ``'entities'``/``'hashtags'`` or a
            hashtag entry lacks ``'text'``.
    """
    return [tag['text'] for tag in tweet._json['entities']['hashtags']]

In [None]:
def get_retweet_id(tweet):
    """Return the id of the tweet that ``tweet`` retweets.

    Args:
        tweet: object exposing the raw tweet payload as a dict in ``_json``.

    Returns:
        The value of ``_json['retweeted_status']['id']``, or ``0`` when the
        payload has no usable ``'retweeted_status'``.
    """
    try:
        return tweet._json['retweeted_status']['id']
    except (AttributeError, KeyError, TypeError):
        # No 'retweeted_status' in the payload. Only lookup errors are
        # swallowed so that interrupts and real bugs still surface.
        return 0

In [None]:
def get_retweet_date(tweet):
    """Return the creation timestamp of the tweet that ``tweet`` retweets.

    Args:
        tweet: object exposing the raw tweet payload as a dict in ``_json``.

    Returns:
        The value of ``_json['retweeted_status']['created_at']``, or ``""``
        when the payload has no usable ``'retweeted_status'``.
    """
    try:
        return tweet._json['retweeted_status']['created_at']
    except (AttributeError, KeyError, TypeError):
        # No 'retweeted_status' in the payload. Only lookup errors are
        # swallowed so that interrupts and real bugs still surface.
        return ""

In [None]:
def get_retweet_user_id(tweet):
    """Return the id of the author of the tweet that ``tweet`` retweets.

    Args:
        tweet: object exposing the raw tweet payload as a dict in ``_json``.

    Returns:
        The value of ``_json['retweeted_status']['user']['id']``, or ``""``
        when the payload has no usable ``'retweeted_status'``.
    """
    try:
        return tweet._json['retweeted_status']['user']['id']
    except (AttributeError, KeyError, TypeError):
        # No 'retweeted_status' in the payload. Only lookup errors are
        # swallowed so that interrupts and real bugs still surface.
        return ""

In [None]:
def check_tweet_exists(test_tweet, texts_list, threshold=0.5):
    """Tell whether a tweet is a near-duplicate of an already collected one.

    Both texts are split on whitespace. For each existing tweet, the words of
    ``test_tweet`` that also occur in it are counted (a word repeated in
    ``test_tweet`` counts every time) and divided by the existing tweet's
    word count. The tweet is a duplicate as soon as that ratio exceeds
    ``threshold`` for any existing tweet.

    Args:
        test_tweet (str): text of the candidate tweet.
        texts_list (list[str]): texts of the tweets collected so far.
        threshold (float): similarity ratio above which the candidate is
            rejected. Defaults to 0.5.

    Returns:
        bool: True if ``test_tweet`` is too similar to any entry of
        ``texts_list``, False otherwise (including when the list is empty).
    """
    candidate_words = test_tweet.split()

    for existing_tweet in texts_list:
        existing_words = existing_tweet.split()
        n = len(existing_words)
        # A set gives O(1) membership tests instead of rescanning the list
        # for every candidate word.
        vocabulary = set(existing_words)
        m = 0
        for word in candidate_words:
            if word in vocabulary:
                m += 1
                # n > 0 here: a match implies the existing tweet has words.
                if m / n > threshold:
                    return True
    return False

In [None]:
def _parse_tweet(tweet):
    """Flatten one tweepy status into the row dict stored in the output frame."""
    created_parts = str(tweet.created_at).split()
    raw = tweet._json

    return {
        'id':              tweet.id,
        'conversation_id': tweet.id_str,
        'created_at':      tweet.created_at,
        'date':            created_parts[0],
        'time':            created_parts[1],
        'time_zone':       "UTC",
        'user_id':         tweet.user.id,
        'username':        tweet.user.screen_name,
        'name':            tweet.user.name,
        'place':           tweet.user.location,   # the author's profile location
        'tweet':           tweet.full_text,
        'mentions':        get_list_of_user_mentions(tweet),
        'urls':            raw['entities']['urls'],
        'photos':          get_list_of_photos(tweet),
        'replies_count':   0,   # reply lookup (get_replies_count) is disabled
        'retweet_count':   tweet.retweet_count,
        'likes_count':     tweet.favorite_count,
        'hashtags':        get_all_hashtags(tweet),
        'cashtags':        [],
        'link':            "http://twitter.com/anyuser/status/" + tweet.id_str,
        # `tweet.retweeted` only says whether the *authenticating user*
        # retweeted it; a tweet is itself a retweet when the payload carries
        # a 'retweeted_status'.
        'retweet':         'retweeted_status' in raw,
        'quote_url':       " ",
        'video':           0,
        'geo':             raw['geo'],
        'source':          raw['source'],
        'rt_user_id':      get_retweet_user_id(tweet),
        'retweet_date':    get_retweet_date(tweet),
        'retweet_id':      get_retweet_id(tweet),
    }


def get_tweets(query, desired_lang = 'hi',maxTweets = 10, tweetsPerQry=10):
    """Search Twitter for ``query`` and return de-duplicated tweets as a DataFrame.

    Pages backwards through ``api.search`` results (newest first) using
    ``max_id`` until at least ``maxTweets`` tweets have been downloaded, no
    more results come back, or tweepy raises an error. Tweets that
    ``check_tweet_exists`` flags as near-duplicates of an already kept tweet
    are dropped, so the result may hold fewer than ``maxTweets`` rows.

    Args:
        query (str): search query passed to the API as ``q``.
        desired_lang (str): language code passed to the API as ``lang``.
        maxTweets (int): stop once this many tweets have been downloaded
            (counted before de-duplication).
        tweetsPerQry (int): page size passed to the API as ``count``.

    Returns:
        pandas.DataFrame: one row per kept tweet; empty if nothing was kept.
    """
    tweets = []       # parsed rows that survived the near-duplicate filter
    texts_list = []   # full texts of kept tweets, compared against new ones
    max_id = -1       # id of the oldest tweet seen so far; <= 0 means first page
    tweetCount = 0    # tweets downloaded so far, including filtered-out ones

    while tweetCount < maxTweets:
        try:
            search_kwargs = dict(q=query, lang=desired_lang,
                                 tweet_mode='extended', count=tweetsPerQry)
            if max_id > 0:
                # Only ask for tweets strictly older than the last one seen.
                search_kwargs['max_id'] = str(max_id - 1)
            new_tweets = api.search(**search_kwargs)

            if not new_tweets:
                print("No more tweets found")
                break

            for tweet in new_tweets:
                # Skip tweets too similar to one that was already kept.
                if check_tweet_exists(tweet.full_text, texts_list):
                    continue
                tweets.append(_parse_tweet(tweet))
                texts_list.append(tweet.full_text)

            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            max_id = new_tweets[-1].id

        except tweepy.TweepError as e:
            print("Tweepy error : " + str(e))
            break

    return pd.DataFrame(tweets)

In [None]:
# Scrape tweets matching "covid" with language code 'hi', stopping after
# 2000 downloads, then display the resulting frame.
# NOTE(review): the recorded output advances by 100 per request even though
# tweetsPerQry=200 — the API appears to cap the page size; confirm.
df = get_tweets("covid",'hi',maxTweets=2000,tweetsPerQry=200)
df

Downloaded 100 tweets
Downloaded 200 tweets
Downloaded 300 tweets
Downloaded 400 tweets
Downloaded 500 tweets
Downloaded 600 tweets
Downloaded 700 tweets
Downloaded 800 tweets
Downloaded 900 tweets
Downloaded 1000 tweets
Downloaded 1100 tweets
Downloaded 1200 tweets
Downloaded 1300 tweets
Downloaded 1400 tweets
Downloaded 1500 tweets
Downloaded 1600 tweets
Downloaded 1700 tweets
Downloaded 1800 tweets
Downloaded 1900 tweets
Downloaded 2000 tweets


Unnamed: 0,id,conversation_id,created_at,date,time,time_zone,user_id,username,name,place,tweet,mentions,urls,photos,replies_count,retweet_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,geo,source,rt_user_id,retweet_date,retweet_id
0,1306252235123224578,1306252235123224578,2020-09-16 15:22:56,2020-09-16,15:22:56,UTC,1297968425461886976,mayank_ms0106,Mayank Singh,,परंतु asymptomatic व covid +ve होने के कारण मै...,[],[],[],0,0,0,[],[],http://twitter.com/anyuser/status/130625223512...,False,,0,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,0
1,1306252221407924224,1306252221407924224,2020-09-16 15:22:53,2020-09-16,15:22:53,UTC,433115234,harshparmar89,Harsh Parmar,"Calgary, Alberta",RT @Jairam_Ramesh: आज राज्य सभा में COVID-19 क...,[Jairam_Ramesh],[],[],0,516,0,[],[],http://twitter.com/anyuser/status/130625222140...,False,,0,,"<a href=""http://twitter.com/download/iphone"" r...",2547533580,Wed Sep 16 09:17:52 +0000 2020,1306160364711600129
2,1306252198863462401,1306252198863462401,2020-09-16 15:22:47,2020-09-16,15:22:47,UTC,170753675,kunalksingh13,Kunal Kishore Singh,"Dhanbad,India",RT @anuragkashyap72: Distraction सुशांत सिंह र...,[anuragkashyap72],[],[],0,3302,0,[],[],http://twitter.com/anyuser/status/130625219886...,False,,0,,"<a href=""http://twitter.com/download/android"" ...",2915766225,Wed Sep 16 12:41:17 +0000 2020,1306211556548468736
3,1306252040633421824,1306252040633421824,2020-09-16 15:22:10,2020-09-16,15:22:10,UTC,2992259190,RT_Himachal,Himachali Retweets,"Himachal Pradesh, India",RT @himachalkesari: मां चिंतपूर्णी दर्शनों के ...,[himachalkesari],"[{'url': 'https://t.co/ygOkYFajGt', 'expanded_...",[],0,1,0,"[Himachal, Una]",[],http://twitter.com/anyuser/status/130625204063...,False,,0,,"<a href=""http://www.hillcrafts.in"" rel=""nofoll...",3286305842,Wed Sep 16 15:22:04 +0000 2020,1306252017631817728
4,1306251979379806210,1306251979379806210,2020-09-16 15:21:55,2020-09-16,15:21:55,UTC,882078129954717696,VinayJh1978,Vinay Jha,"Jhanjharpur, India",RT @skrjbp: @anuragkashyap72 एक बात समझ से परे...,"[skrjbp, anuragkashyap72]",[],[],0,2,0,[],[],http://twitter.com/anyuser/status/130625197937...,False,,0,,"<a href=""http://twitter.com/download/android"" ...",100474027,Wed Sep 16 12:55:06 +0000 2020,1306215029977432064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,1306227352767258624,1306227352767258624,2020-09-16 13:44:04,2020-09-16,13:44:04,UTC,443605771,cgrojgar,Chhattisgarh News Media & Rojgar,Raipur,कोरोना में उपलब्धि : कांकेर के डेडिकेटेड कोविड...,[],"[{'url': 'https://t.co/Ly5XlPyWWE', 'expanded_...",[],0,0,1,[],[],http://twitter.com/anyuser/status/130622735276...,False,,0,,"<a href=""http://publicize.wp.com/"" rel=""nofoll...",,,0
190,1306227101444370435,1306227101444370435,2020-09-16 13:43:04,2020-09-16,13:43:04,UTC,1273575348131815425,Jeetend87422228,Jeetendra Singh,"Gurgaon, India",RT @DeoRampur: कृपया ध्यान से पढ़िए\nबरेली में...,[DeoRampur],[],[],0,2,0,[],[],http://twitter.com/anyuser/status/130622710144...,False,,0,,"<a href=""http://twitter.com/download/android"" ...",784735900244713477,Wed Sep 16 12:53:41 +0000 2020,1306214674040369152
191,1306227045366599680,1306227045366599680,2020-09-16 13:42:50,2020-09-16,13:42:50,UTC,1250339311561527296,Akshita28185632,अक्षिता शुक्ल #सोशल_योगी,,RT @ParishadYog: #Covid_19\nकोरोना संक्रमण से ...,[ParishadYog],[],[],0,26,0,[Covid_19],[],http://twitter.com/anyuser/status/130622704536...,False,,0,,"<a href=""http://twitter.com/download/android"" ...",1161951119771176960,Tue Sep 15 07:45:17 +0000 2020,1305774674475773954
192,1306226506641805315,1306226506641805315,2020-09-16 13:40:42,2020-09-16,13:40:42,UTC,897346706773229568,news11bharat,News11 Bharat,"Ranchi, India",सीएम @HemantSorenJMM ने की DMCH में कोविड-19 ट...,[HemantSorenJMM],"[{'url': 'https://t.co/Ir6Y2cSfac', 'expanded_...",[],0,0,0,[],[],http://twitter.com/anyuser/status/130622650664...,False,,0,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,0


In [None]:
# (rows, columns) of the scraped frame; rows are only the tweets that passed
# the near-duplicate filter inside get_tweets, so fewer than were downloaded.
df.shape

(1190, 28)

In [None]:
# Write the scraped tweets to a CSV file on the mounted Drive, without the
# DataFrame index column.
df.to_csv("/gdrive/My Drive/BTP/Dataset/Scraping Tweets/hi_covid_tweets_raw_features_dump.csv",index=False)