# Making the necessary imports

In [1]:
import configparser as cfp
import pandas as pd
import praw

# Loading Reddit credentials from a local config file (kept out of version control)

In [2]:
def get_reddit_credentials(path='config.ini'):
    """Read the Reddit API credentials from an INI file.

    Parameters
    ----------
    path : str, optional
        Location of the INI file. Defaults to ``config.ini`` in the current
        working directory. The file must contain a ``[reddit]`` section with
        the keys ``client_id``, ``client_secret`` and ``user``.

    Returns
    -------
    tuple of str
        ``(client_id, client_secret, user)`` exactly as stored in the file.

    Raises
    ------
    FileNotFoundError
        If the file does not exist or cannot be read.
    KeyError
        If the ``[reddit]`` section or one of its keys is missing.
    """
    config = cfp.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed, so an empty result means nothing was loaded.
    if not config.read(path):
        raise FileNotFoundError(f"Could not read credentials file: {path!r}")
    reddit_section = config['reddit']
    return reddit_section['client_id'], reddit_section['client_secret'], reddit_section['user']

# Fetching data from reddit to understand the data structure

In [3]:
# Authenticate against Reddit with the credentials stored in the config file
client_id, client_secret, user = get_reddit_credentials()
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user)

# Upper bound on the number of posts requested from each listing
number_of_posts = 5001
indian_sr = reddit.subreddit('India')

# Three listings are requested; only `new_posts` is iterated in this cell
top_posts = indian_sr.top(limit=number_of_posts)
hot_posts = indian_sr.hot(limit=number_of_posts)
new_posts = indian_sr.new(limit=number_of_posts)

columns = [
    'id', 'subreddit', 'title', 'upvotes', 'url',
    'original', 'num_comments', 'body', 'created_on', 'flair',
]

# One row per submission, values in the same order as `columns`
list_posts = [
    [
        post.id,
        post.subreddit,
        post.title,
        post.score,
        post.url,
        post.is_original_content,
        post.num_comments,
        post.selftext,
        post.created,
        post.link_flair_text,
    ]
    for post in new_posts
]

data = pd.DataFrame(list_posts, columns=columns)
# Replace the raw epoch-seconds column with a datetime column appended at the end.
# NOTE(review): `post.created` is used rather than `created_utc` - confirm the intended timezone.
data = data.assign(
    creation_date=pd.to_datetime(data['created_on'], dayfirst = True, unit ='s')
).drop(columns=['created_on'])

In [4]:
# A negative `n` makes head() return every row except the last 5
data.head(-5)

Unnamed: 0,id,subreddit,title,upvotes,url,original,num_comments,body,flair,creation_date
0,gychj2,india,RBI grants licence to Bank of China to set up ...,1,https://economictimes.indiatimes.com/industry/...,False,0,,Policy/Economy,2020-06-07 21:34:37
1,gyccoe,india,Pls take a look and pls share,0,https://youtu.be/UzfcphQkOUI,False,0,,Non-Political,2020-06-07 21:26:28
2,gycb88,india,"That's one small step for a youtuber, one gian...",1,https://www.reddit.com/r/india/comments/gycb88...,False,0,https://ibb.co/p0TMCTb\n\nPeople always fought...,Non-Political,2020-06-07 21:23:59
3,gyca8n,india,All religions coming together and taking sensi...,1,https://www.hindustantimes.com/india-news/kera...,False,0,,Coronavirus,2020-06-07 21:22:23
4,gyc7mg,india,Since there have been lots of Bollywood/India ...,0,https://www.reddit.com/r/india/comments/gyc7mg...,False,1,Average male height in India as per [this](htt...,Non-Political,2020-06-07 21:17:51
...,...,...,...,...,...,...,...,...,...,...
872,gvnwrn,india,Getting back to the workplace,2,https://www.reddit.com/r/india/comments/gvnwrn...,False,1,As companies come up with innovative communica...,Non-Political,2020-06-03 13:38:01
873,gvnvri,india,"With highest Covid-19 cases, these states push...",12,https://www.hindustantimes.com/india-news/with...,False,1,,Coronavirus,2020-06-03 13:35:53
874,gvnqnq,india,Should I buy R15 on loan at this time?,0,https://www.reddit.com/r/india/comments/gvnqnq...,False,7,"I very much want to buy the bike, however my s...",AskIndia,2020-06-03 13:24:47
875,gvndic,india,[RANT] I have been fucking up things for years...,35,https://www.reddit.com/r/india/comments/gvndic...,False,29,"I am a failure, an utter lazy incompetent fool...",Non-Political,2020-06-03 12:57:27


In [5]:
# Number of posts per flair, ordered from most to least frequent
data['flair'].value_counts().sort_values(ascending=False)

Non-Political         217
Politics              200
AskIndia              185
Coronavirus           133
Policy/Economy         50
Business/Finance       30
Science/Technology     21
CAA-NRC-NPR            11
Scheduled              10
Food                    9
Photography             7
Sports                  4
Unverified              3
Megathread              1
Name: flair, dtype: int64

In [6]:
# (number of rows, number of columns) of the collected frame
data.shape

(882, 10)

# Collecting More Data

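The `new` listing returned only 882 posts even though 5001 were requested (Reddit listings are capped at roughly 1,000 items per query). We need a lot more data than that, so let's collect posts flair by flair instead.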

In [7]:
# Flair labels to collect; each one is searched separately in the subreddit
flairs = [
    "Non-Political",
    "Politics",
    "Coronavirus",
    "Photography",
    "[R]eddiquette",
    "Sports",
    "Food",
    "Business/Finance",
    "Policy/Economy",
    "Science/Technology",
    "AskIndia",
    "AMA",
]

# Column headers of the per-flair data set, in the order the row values are stored
features = [
    "Title",
    "ID",
    "Upvotes",
    "URL",
    "Num_comments",
    "Creation Date",
    "Body",
    "Is_original",
    "Comments",
    "Flair",
]

In [17]:
# Rows collected across all flairs; each row follows the order of `features`
list_posts = []

for flair in flairs:
    # At most 300 search results are requested per flair
    for sub in indian_sr.search(f"flair_name:{flair}", limit=300):
        # Expand up to 10 "more comments" placeholders before reading the comment bodies
        sub.comments.replace_more(limit=10)
        # Iterating the forest directly yields the top-level comments only
        comments = [str(comment.body) for comment in sub.comments]
        row = [
            str(sub.title),
            str(sub.id),
            sub.score,
            str(sub.url),
            sub.num_comments,
            str(sub.created),  # stored as text here; parsed into a datetime in a later cell
            str(sub.selftext),
            sub.is_original_content,
            comments,
            str(sub.link_flair_text),
        ]
        list_posts.append(row)

data = pd.DataFrame(list_posts, columns = features)

In [23]:
# Convert the epoch-seconds values in 'Creation Date' into datetimes.
# The column was filled with str(sub.created), and passing strings together with
# `unit=` to to_datetime is deprecated, so the values are cast to numbers first.
# The dtype guard keeps the cell safe to re-run once the column is already datetime.
# `dayfirst` is dropped: it only matters when parsing date strings, not epoch numbers.
created = data['Creation Date']
if not pd.api.types.is_datetime64_any_dtype(created):
    data['Creation Date'] = pd.to_datetime(pd.to_numeric(created), unit='s')
data.head()

Unnamed: 0,Title,ID,Upvotes,URL,Num_comments,Creation Date,Body,Is_original,Comments,Flair
0,"Dear Indian society, you do not possess the mo...",gx3r7p,8279,https://www.reddit.com/r/india/comments/gx3r7p...,710,2020-06-05 20:36:43,I'm a dark skinned south Indian guy born in ea...,False,[Even though what we Indians experience in our...,Non-Political
1,Some things never change,gxjc5c,7205,https://imgur.com/pfqzTiY,391,2020-06-06 11:18:58,,False,[Oh God. I'd never understand what they get by...,Non-Political
2,For an industry that has to paint their actors...,gvw1x2,5788,https://i.redd.it/qhqxocwprp251.jpg,510,2020-06-03 23:30:14,,False,[You're forgetting Priyanka Chopra as Mary Kom...,Non-Political
3,The Last Conversation,gvofxe,5130,https://i.redd.it/n8py56mh1n251.jpg,452,2020-06-03 14:19:48,,False,[This is so heart breaking! I can barely contr...,Non-Political
4,A fruit seller in Delhi left his crates of man...,goi120,4845,https://v.redd.it/lp0gqu8h2b051,634,2020-05-22 19:59:10,,False,[That poor fruit vendor lost his last hope of ...,Non-Political


In [27]:
# Shuffle the rows so the posts are no longer grouped by flair.
# A fixed random_state makes the order reproducible across kernel restarts, and
# rebinding `data` avoids relying on index-aligned slice assignment (`data[:] = ...`).
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data.head()

Unnamed: 0,Title,ID,Upvotes,URL,Num_comments,Creation Date,Body,Is_original,Comments,Flair
0,Uber driver scammed me by not picking me up an...,cc6wf1,96,https://www.reddit.com/r/india/comments/cc6wf1...,59,2019-07-12 13:53:45,Yesterday it was raining heavily and needed to...,False,"[In your case, the driver might have come near...",[R]eddiquette
1,Assembly Elections Results Megathread,a54j07,277,https://www.reddit.com/r/india/comments/a54j07...,856,2018-12-11 14:58:33,"---\n# Rajasthan\n\nTotal Seats: 199, Majority...",False,"[If elections are not one-sided, it's always g...",[R]eddiquette
2,Travelled to Kashmir 2 years back. This is the...,ekd70o,19,https://www.youtube.com/watch?v=6WgnfARwOdA,7,2020-01-05 21:46:20,,True,"[Cringe, Good Job brother... It looks good !]",Photography
3,Reminder to file your Income tax return - 31st...,ct4z8k,59,https://www.reddit.com/r/india/comments/ct4z8k...,21,2019-08-21 03:51:02,"Just filing my tax returns, thought of remindi...",False,"[Hey, thanks! Skipped my mind., 16 years of ed...",[R]eddiquette
4,"After eating 900 mice, cat goes on Haj! RJio j...",fgarrj,214,https://www.financialexpress.com/opinion/after...,39,2020-03-10 16:46:22,,False,[Jio gives free service for six months\n\nIndi...,Business/Finance


In [39]:
# Collapse each post's list of comment strings into one space-separated string.
# Values that are already strings are left untouched, so re-running the cell does
# not split the joined text into single characters.
data['Comments'] = [
    comment_list if isinstance(comment_list, str) else ' '.join(comment_list).strip()
    for comment_list in data['Comments']
]
data['Comments'].head()

0    In your case, the driver might have come near ...
1    If elections are not one-sided, it's always go...
2           Cringe Good Job brother... It looks good !
3    Hey, thanks! Skipped my mind. 16 years of educ...
4    Jio gives free service for six months\n\nIndia...
Name: Comments, dtype: object

In [40]:
# Persist the collected data set next to the notebook.
# to_csv writes the row index as the first column by default.
data.to_csv('./data.csv')