In [1]:
import praw
from collections import Counter
import tqdm
import pandas as pd

In [2]:
reddit = praw.Reddit(client_id='', client_secret='', user_agent='reddit scrap')
flairs = []
hot_posts = reddit.subreddit('india').new(limit=1000)
for post in hot_posts:
    flair = post.link_flair_text
    flairs.append(flair)

In [3]:
set(flairs), len(set(flairs))

({'AskIndia',
  'Business/Finance',
  'CAA-NRC-NPR',
  'Coronavirus',
  'Food',
  'Non-Political',
  None,
  'Old',
  'Photography',
  'Policy/Economy',
  'Politics',
  'Scheduled',
  'Science/Technology',
  'Sports',
  '| Repost |'},
 15)

    There are in total of 19 distinct flairs from the first 1000 posts from subreddit india. Doing classification considering all flairs will not be good idea because:
        1. Considering more classes will make our task much more difficult.
        2. With more classes maintaing balanced data will be a huge problem.
    So, I have decided to narrow down the no. of classes based on name and current socio-political scenario. The total no. of classes that I have considered now is 10 and these are:

In [4]:
# Appropriate flairs
flairs = ['Coronavirus',
 'Politics',
 'Photography',
 'Policy/Economy',
 'Non-Political',
 'AskIndia',
 'Business/Finance',
 'Science/Technology',
 'Sports',
 'CAA-NRC-NPR']

   In the above cells, I have used [praw](https://github.com/praw-dev/praw) library to scrape data from reddit. But the problem with this library is that it uses reddit official API to fetch data because of which we can only fetch top 1000 posts, but for training we will need way more than 1000. So, I decided to go with something else:
   [PushshiftAPI](https://github.com/pushshift/api) is good option and [psaw](https://github.com/dmarx/psaw) is a wrapper for this API, which I will be using.


In [5]:
from psaw import PushshiftAPI

api = PushshiftAPI()

result = list(api.search_submissions(subreddit='india',
                            filter=['title','full_link', 'selftext', 'link_flair_text'],
                            limit=150000))

In [7]:
print(result[9999])
print("---"*39)
print(result[9999][-1])

submission(created_utc=1585895305, full_link='https://www.reddit.com/r/india/comments/fu38te/india_code_digital_repository_of_central_and/', link_flair_text='Politics', selftext='', title='India Code | Digital Repository of Central and State Acts', created=1585875505.0, d_={'created_utc': 1585895305, 'full_link': 'https://www.reddit.com/r/india/comments/fu38te/india_code_digital_repository_of_central_and/', 'link_flair_text': 'Politics', 'selftext': '', 'title': 'India Code | Digital Repository of Central and State Acts', 'created': 1585875505.0})
---------------------------------------------------------------------------------------------------------------------
{'created_utc': 1585895305, 'full_link': 'https://www.reddit.com/r/india/comments/fu38te/india_code_digital_repository_of_central_and/', 'link_flair_text': 'Politics', 'selftext': '', 'title': 'India Code | Digital Repository of Central and State Acts', 'created': 1585875505.0}


In [8]:
def comment(url):
    submission = reddit.submission(url=url)
    submission.comments.replace_more(limit=0)
    try:
        return submission.comments[0].body
    except IndexError:
        return None

    Initially, I tried to get the top comment for analysis but the comment can't be directly scraped, so the way to get the comment is shown in the above cell. Now the problem with extracting comments is that It was taking a lot of time approximately for 100000 posts, the estimated time was more than 7 hours. So, I decided to parallelise the process, but still it was taking more than 4 hours. And sending so many requests to reddit API was also difficult, so it was throwing error.

In [9]:
result_dict = {'title':[],'full_link':[], 'selftext':[], 'link_flair_text':[]}

In [10]:
for ind, res in tqdm.tqdm_notebook(enumerate(result), total = 100000, desc = "Progress Bar for Insertion in disk"):
    
    result_dict["title"].append(res[-1]["title"])
    result_dict["full_link"].append(res[-1]["full_link"])
    
    if "selftext" in res[-1].keys():
        result_dict["selftext"].append(res[-1]["selftext"]) 
    else:
        result_dict["selftext"].append(None) 
    
    if "link_flair_text" in res[-1].keys():
        result_dict["link_flair_text"].append(res[-1]["link_flair_text"]) 
    else:
        result_dict["link_flair_text"].append(None) 
    
#     result_dict["comment"].append(comment(res[-1]["full_link"]))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, description='Progress Bar for Insertion in disk', max=100000.0, style=…




In [11]:
df = pd.DataFrame.from_dict(result_dict)
df.to_csv("~/Documents/india_reddit15.csv", index=False)

In [21]:
df.to_csv??

    Parallelising the above function to scrape comments which at last didn't work.

In [None]:
# from multiprocess import Process, Manager

# def f(result_dict, res):
#     result_dict["title"].append(res[-1]["title"])
#     result_dict["full_link"].append(res[-1]["full_link"])
    
#     if "selftext" in res[-1].keys():
#         result_dict["selftext"].append(res[-1]["selftext"]) 
#     else:
#         result_dict["selftext"].append(None) 
    
#     if "link_flair_text" in res[-1].keys():
#         result_dict["link_flair_text"].append(res[-1]["link_flair_text"]) 
#     else:
#         result_dict["link_flair_text"].append(None) 
    
#     result_dict["comment"].append(comment(res[-1]["full_link"]))
    

# manager = Manager()
# result_dict = manager.dict({'title':[],'full_link':[], 'selftext':[], 'link_flair_text':[], "comment":[]})

In [None]:
# %time
# job = [Process(target=f, args=(result_dict, res)) for res in tqdm.tqdm_notebook(result, total = 100000, desc = "Progress Bar for Insertion in disk")]
# _ = [p.start() for p in job]
# _ = [p.join() for p in job]
# # _ = [p.join() for p in job]


# print(result_dict)