## Add necessary packages

In [1]:
#!pip install snscrape
!pip3 install snscrape
!pip install langdetect

Collecting snscrape
  Downloading snscrape-0.3.4-py3-none-any.whl (35 kB)
Installing collected packages: snscrape
Successfully installed snscrape-0.3.4
[0mCollecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l- \ | / - done
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=1cd511c931c56e9836e49b6d17b16f68a851b0af33fc206ad0fa1ada16fc7a6c
  Stored in directory: /root/.cache/pip/wheels/c5/96/8a/f90c59ed25d75e50a8c10a1b1c2d4c402e4dacfa87f3aff36a
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
[0m

In [2]:
from snscrape.modules import twitter as sntwitter
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from langdetect import detect_langs
from tqdm.notebook import tqdm
import os
from IPython.display import FileLink
import json

In [3]:
class HateTweetsScraper:
    '''
    Mine tweets matching a search query, tag each one with a language-based
    confidence flag and save the result as a CSV plus a JSON log entry.

    Typical use:
        scraper = HateTweetsScraper()
        scraper.get_tweets()
        df = scraper.save_data()
    '''

    # Search terms used when the caller does not pass a query.
    DEFAULT_QUERY = 'मुल्ला OR मुल्ले OR मुल्लों OR कटवा OR कटुवा OR कटुआ OR क2आ OR क2वा OR मुल्ली'
    # Maximum number of tweets mined when the caller does not pass a limit.
    DEFAULT_LIMIT = 50000
    # Account names that equal a search term; such tweets are counted as
    # "account name matches" in the summary printed by verify().
    USERNAME_MATCHES = ("Mulla", "mulla", "mullah")

    def __init__(self, query = '', limit=None, start_date = None, end_date = None):
        '''
        Input :
            query : search text; empty or None selects DEFAULT_QUERY.
            limit : max. no. of tweets to mine; None selects DEFAULT_LIMIT.
            start_date, end_date : timeframe of the tweets to be mined, as
                datetime.date, datetime.datetime or 'YYYY-MM-DD' string.
                start_date defaults to yesterday, end_date to start_date + 1 day.

        The date window is appended to the query as ' until:<end> since:<start>'
        unless the query already carries its own since:/until: operator.
        '''
        # initializing dates :
        self.curr_date = datetime.datetime.now().date()
        self.yest_date = self.curr_date - datetime.timedelta(days = 1)
        if start_date is None:
            self.start_date = self.yest_date
            print("start date is not given, yesterday date is being used : ",self.start_date)
        else:
            self.start_date = self._as_date(start_date)
        if end_date is None:
            self.end_date = self.start_date + datetime.timedelta(days = 1)
            print("end date is not given, start date + 1 is being used", self.end_date)
        else:
            self.end_date = self._as_date(end_date)

        # Initializing query: the date window applies to custom queries too,
        # but is not added twice when the caller already wrote one.
        base_query = query if query else self.DEFAULT_QUERY
        if 'since:' in base_query or 'until:' in base_query:
            self.query2 = base_query
        else:
            self.query2 = base_query + f' until:{self.end_date} since:{self.start_date}'

        self.ts = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        self.limit = self.DEFAULT_LIMIT if limit is None else limit
        # Filled by get_tweets(); read by save_data().
        self.tweets = None

    @staticmethod
    def _as_date(value):
        '''Return value as a datetime.date where possible.

        datetime.datetime is truncated to its date and 'YYYY-MM-DD' strings are
        parsed; anything else (including strings in another format) is
        returned unchanged.
        '''
        # datetime.datetime is a subclass of datetime.date, so test it first.
        if isinstance(value, datetime.datetime):
            return value.date()
        if isinstance(value, str):
            try:
                return datetime.datetime.strptime(value, '%Y-%m-%d').date()
            except ValueError:
                return value
        return value

    @staticmethod
    def _username(t):
        '''Return the author's username from t.user.username, falling back to t.username.'''
        try:
            return t.user.username
        except AttributeError:
            return t.username

    @staticmethod
    def make_dict(t, conf):
        '''method to create a dict from
        input : tweet object, confidence
        output : dict with one entry per CSV column'''
        return {
            "ID": t.id,
            "date" : t.date,
            "content" : t.content,
            "username": HateTweetsScraper._username(t),
            "URL": t.url,
            "outlinks" : t.outlinks,
            # The *sss columns repeat the link lists so the CSV keeps its existing columns.
            "outlinksss": t.outlinks,
            "tcooutlinks" : t.tcooutlinks,
            "tcooutlinksss" : t.tcooutlinks,
            "confident" : conf
        }

    @staticmethod
    def verify(tweets):
        '''
        method to verify lang of the tweets
        and provide a hardcoded confidence score and returns a dataframe.

        Per tweet:
            language detection fails : counted as an error, tweet is skipped
            hindi                    : saved with confident = 1
            username is a query term : saved with confident = 0 (account name match)
            any other language       : saved with confident = 0 (foreign)

        input : sized collection of tweet objects
        output : DataFrame with the columns produced by make_dict
        '''
        records = []
        hindi = foreign = errors = name_matches = 0

        for t in tweets:
            try:
                # detect_langs returns candidates best-first; str() of the
                # first one starts with its two-letter language code.
                lang = str(detect_langs(t.content)[0])[:2]
            except Exception as err:
                # Best effort: report the failure and move on, without
                # classifying this tweet from an earlier tweet's language.
                print(err)
                errors += 1
                continue

            if lang == 'hi':
                hindi += 1
                conf = 1
            else:
                conf = 0
                if HateTweetsScraper._username(t) in HateTweetsScraper.USERNAME_MATCHES:
                    name_matches += 1
                else:
                    foreign += 1
            records.append(HateTweetsScraper.make_dict(t, conf))

        print(40*"=-")
        print(f"detected total {len(tweets)} tweets, {foreign} foriegn Language tweets, {hindi} hindi tweets, {errors} errors and {name_matches} account name matches.")
        print(40*"=-")
        print(f"Saving {len(records)} verified tweets.")
        return pd.DataFrame(records)

    @staticmethod
    def write_log(logs):
        '''
        method for appending one log entry to log.json
        input : dict that json.dump can serialise
        Each entry is written as a JSON object followed by a comma and a newline.
        '''
        filename = 'log.json'
        with open(filename, "a") as fp:
            json.dump(logs, fp)
            fp.write(",\n")
        #without timestamp in name of logs
        print(f"Without TS: Log saved in {filename}.\n", logs)

    def get_tweets(self):
        ''' Fn to get tweets.
        Uses self.query2 as the search text and self.limit as the max. number
        of tweets; stops pulling from the scraper as soon as the limit is hit.
        returns : list of tweet objects (also kept in self.tweets)'''
        query = self.query2
        limit = self.limit
        print('get query and limit', query, limit)
        mined = []

        if limit > 0:
            gen = sntwitter.TwitterSearchScraper(query).get_items()
            for tweet in tqdm(gen, total = limit):
                mined.append(tweet)
                # Check after appending so no extra tweet is fetched and discarded.
                if len(mined) >= limit:
                    print(f"Limit of {limit} tweets reached.")
                    break
        print(f"mined {len(mined)} tweets.")
        self.tweets = mined
        return mined

    def save_data(self, tweets=None):
        '''method for saving data
        input : tweets list (optional); defaults to the tweets mined by
                get_tweets(), which is run first if it has not been called yet.
        verifies each tweet then saves it in .csv and json logs.
        returns : dataframe'''
        if tweets is None:
            if self.tweets is None:
                self.get_tweets()
            tweets = self.tweets

        df = HateTweetsScraper.verify(tweets)
        fname = f"output_{self.start_date}_{self.end_date}_{self.ts}.csv"
        df.to_csv(fname)
        # log dictionary
        logs = {
            "Timestamp": self.ts,
            "extraction_date" : str(self.curr_date),
            "filename" : fname,
            "start_date": str(self.start_date),
            "end_date" : str(self.end_date),
            "no_of_tweets" : df.shape[0]
        }
        HateTweetsScraper.write_log(logs)
        return df
        



In [4]:
# Build a scraper with the default query, limit and date window, mine the
# tweets, then language-check them and write the CSV and log entry.
HT = HateTweetsScraper()
tweets_list = HT.get_tweets()
df = HT.save_data()
# Preview the first rows of the saved frame.
df.head()

start date is not given, yesterday date is being used :  2022-08-20
end date is not given, start date + 1 is being used 2022-08-21
get query and limit मुल्ला OR मुल्ले OR मुल्लों OR कटवा OR कटुवा OR कटुआ OR क2आ OR क2वा OR मुल्ली until:2022-08-21 since:2022-08-20 50000


  0%|          | 0/50000 [00:00<?, ?it/s]

mined 335 tweets.
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
detected total 335 tweets, 43 foriegn Language tweets, 292 hindi tweets, 0 errors and 0 account name matches.
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
Saving 335 verified tweets.
Without TS: Log saved in log.json.
 {'Timestamp': '20220821163801', 'extraction_date': '2022-08-21', 'filename': 'output_2022-08-20_2022-08-21_20220821163801.csv', 'start_date': '2022-08-20', 'end_date': '2022-08-21', 'no_of_tweets': 335}


Unnamed: 0,ID,date,content,username,URL,outlinks,outlinksss,tcooutlinks,tcooutlinksss,confident
0,1561137764829372416,2022-08-20 23:47:02+00:00,@chitraaum अतुलानंद से पूछ लिया है।नहीं तो कम ...,VinodKu04217358,https://twitter.com/VinodKu04217358/status/156...,[],[],[],[],1
1,1561136175875624960,2022-08-20 23:40:43+00:00,@Ajashutoshjha @chandaj29710249 @JhaAjitk ये प...,kamalKantmishra,https://twitter.com/kamalKantmishra/status/156...,[],[],[],[],1
2,1561132680007168000,2022-08-20 23:26:50+00:00,@ShuebKh16859893 @Lovely92698976 असली भक्त तो ...,sanjeevkmax,https://twitter.com/sanjeevkmax/status/1561132...,[],[],[],[],1
3,1561090746887073792,2022-08-20 20:40:12+00:00,@ActivistSandeep कभी नहीं आएगी मुल्ला पार्टी ढ...,Arjunpa51223577,https://twitter.com/Arjunpa51223577/status/156...,[],[],[],[],1
4,1561080311521234945,2022-08-20 19:58:44+00:00,@anamikamber जिसने कटवा रखा हो...,highspirithero,https://twitter.com/highspirithero/status/1561...,[],[],[],[],1


# Old Code

# Today's date and the day before it.
curr_date = datetime.datetime.now().date()
yest_date = curr_date - datetime.timedelta(days = 1)

# Date window: yesterday up to today.
start_date = yest_date
end_date = curr_date
# Earlier, wider search that also included romanised spellings (kept for reference):
#q = "mulle OR mullon OR मुल्ला OR मुल्ले OR मुल्लों OR कटवा OR कटुवा OR कटुआ OR क2आ OR क2वा OR मुल्ली OR (mulla OR mulli OR katua OR katuaa OR katuwa OR katuva OR k2a OR ktwa OR k2va)"
# Search terms currently in use (Devanagari spellings only).
q = 'मुल्ला OR मुल्ले OR मुल्लों OR कटवा OR कटुवा OR कटुआ OR क2आ OR क2वा OR मुल्ली'

# Scrape tweets

### Detect unrelated tweets

# Manual check of the language detector on a sample Hindi text; the cell
# displays whatever detect_langs returns for it.
#detect_langs("uberfuhrer")
detect_langs(" यही जीवन है साहब   \n*कर्नाटक में वकीलों पर फू")

import os
import json
#!mkdir logs
def write_log(logs):
    '''
    Append one log entry to log.json and to log_<Timestamp>.json.

    input : dict that json.dump can serialise; must contain a 'Timestamp' key,
            which names the second file.
    Each entry is written as a JSON object followed by a comma and a newline.
    '''
    def save(logs, fname):
        # Write to the file named by the argument, not to a variable of the
        # enclosing function; the with-block closes the file.
        with open(fname, "a") as fp:
            json.dump(logs, fp)
            fp.write(",\n")

    filename = 'log.json'
    save(logs, filename)
    print(f"Without TS: Log saved in {filename}.\n", logs)

    ts = logs['Timestamp']
    filename = f'log_{ts}.json'
    save(logs, filename)
    print(f"Log saved in {filename}.\n", logs)
    
 

def verify(t):
    '''
    Classify one tweet by the detected language of its content.

    input : tweet object with .content and .username
    output : int code
        {
        foreign lang : 0,
        hindi : 1,
        query in username : 2
        error : 3
        }
    Tweets in any other language whose username is not a query term also get 1.
    '''
    fore_langs = ['fi', 'in', 'ar', 'it']
    usn = ["Mulla","mulla"]
    try:
        # detect_langs returns language objects, not strings; str() of the best
        # candidate starts with its two-letter code, which is what the
        # comparisons below need.
        lang = str(detect_langs(t.content)[0])[:2]
    except Exception:
        return 3
    if lang in fore_langs:
        return 0
    if lang == 'hi':
        return 1
    if t.username in usn:
        print(t.username)
        return 2
    return 1
    



def get_tweets(query = "Help needed", limit = 50000):
    '''
    Mine up to `limit` tweets for a search query.

    input : query text as string, limit of max number of tweets as int
    output : list of tweet objects, in the order the scraper yields them
    '''
    tweets = []
    if limit > 0:
        gen = sntwitter.TwitterSearchScraper(query).get_items()
        for tweet in tqdm(gen, total = limit):
            tweets.append(tweet)
            # Check after appending so no extra tweet is fetched and discarded.
            if len(tweets) >= limit:
                print(f"Limit of {limit} tweets reached.")
                break
    print(f"mined {len(tweets)} tweets.")
    return tweets

def save_to_df(tweets):
    '''
    Run verify() on every tweet and collect the accepted ones in a DataFrame.

    verify() codes:
        0 (foreign lang)      : counted, not saved
        1 (hindi)             : saved with confident = 1
        2 (query in username) : saved with confident = 0
        3 (error)             : content printed, counted, not saved

    input : iterable of tweet objects
    output : DataFrame with one row per saved tweet
    '''
    def as_record(tw, confident):
        # One row of the output frame.
        return {
            "ID": tw.id,
            "date" : tw.date,
            "content" : tw.content,
            "username": tw.username,
            "URL": tw.url,
            "outlinks" : tw.outlinks,
            "outlinksss": tw.outlinksss,
            "tcooutlinks" : tw.tcooutlinks,
            "tcooutlinksss" : tw.tcooutlinksss,
            "confident" : confident
        }

    t_list = []
    n = h = m = e = 0
    for t in tqdm(tweets):
        code = verify(t)
        if code == 0:
            n += 1
        elif code == 1:
            h += 1
            t_list.append(as_record(t, 1))
        elif code == 2:
            m += 1
            t_list.append(as_record(t, 0))
        elif code == 3:
            print(t.content)
            e += 1

    print(40*"=")
    print(f"detected {n} foriegn Language tweets, {h} hindi tweets, {e} errors and {m} account name matches.")
    print(40*"=")

    print(f"Saving {len(t_list)} verified tweets.")
    return pd.DataFrame(t_list)

## Pipeline

# Timestamp (YYYYmmddHHMMSS) taken when this cell runs; it is the default
# `timestamp` argument of run_pipeline.
ts = datetime.datetime.now().strftime('%Y%m%d%H%M%S')

def run_pipeline(query = q, lim = 50000, end_date = curr_date, start_date = yest_date, timestamp = ts):
    '''
    Mine, verify and save tweets for one date window, then log the run.

    input :
        query : search text; the date window is appended to it
        lim : max. no. of tweets to mine
        end_date, start_date : bounds used for the until:/since: operators
        timestamp : suffix of the output CSV file name
    output : the log dict that was passed to write_log

    Note: the default values are read from the module-level names when this
    definition is executed, not at call time.
    '''
    # Restrict the search to the requested window.
    dated_query = query + f' until:{end_date} since:{start_date}'

    df_hate = save_to_df(get_tweets(dated_query, limit = lim))

    fname = f"output_{start_date}_{end_date}_{timestamp}.csv"
    df_hate.to_csv(fname)
    print("Searched query:", dated_query)
    print(f'mined and saved {df_hate.shape[0]} tweets in {fname} file.')

    # Log entry; its Timestamp is taken now and is independent of `timestamp`.
    logs = dict([
        ("Timestamp", datetime.datetime.now().strftime('%Y%m%d%H%M%S')),
        ("extraction_date", str(curr_date)),
        ("filename", fname),
        ("start_date", str(start_date)),
        ("end_date", str(end_date)),
        ("no_of_tweets", df_hate.shape[0]),
    ])

    write_log(logs)
    return logs

%%time
logdict = run_pipeline()

write_log(logdict)

df = pd.read_csv(logdict['filename'])

df.head(10)

df.confident.unique()

# Check logs

# List the working directory to see the CSV and log files that were written.
!ls