In [None]:

import pandas as pd
from psaw import PushshiftAPI
from os.path import exists
from tqdm import tqdm

# Shared Pushshift client; the submission and comment searches below both go through it.
api = PushshiftAPI()


In [None]:
# Maps each subreddit to crawl onto the coarse topic label attached to its posts.
# Keys are passed to the API verbatim as subreddit names, so their spelling and
# capitalisation are kept exactly as written.
topic_map = {
    # politics
    "democrats": "politics",
    "government": "politics",
    "Democracy": "politics",
    "elections": "politics",

    # environment
    "environment": "environment",
    "nature": "environment",
    "climate": "environment",
    "climatechange": "environment",
    "atmosphere": "environment",

    # technology
    "iphone": "technology",
    "gaming": "technology",
    "apple": "technology",
    "webdev": "technology",
    "android": "technology",
    "technology": "technology",

    # healthcare (label is capitalised, unlike the other topics)
    "emergencymedicine": "Healthcare",
    "COVID19": "Healthcare",
    "Coronavirus": "Healthcare",
    "hospitals": "Healthcare",
    "doctors": "Healthcare",
    "Nurse": "Healthcare",
    "medicine": "Healthcare",
    "Dentistry": "Healthcare",
    "disease": "Healthcare",

    # education
    "Teachers": "education",
    "learnprogramming": "education",
    "ArtificialInteligence": "education",  # NOTE(review): spelling left untouched; confirm it matches the subreddit name
    "MachineLearning": "education",
    "School": "education",
    "College": "education",
    "University": "education",
}

subreddits = list(topic_map.keys())

# Field names requested when searching comments.
# The comma after "subreddit" matters: without it Python joins the two adjacent
# string literals into the single bogus field "subreddittitle".
filters = [
    "id",
    "subreddit",
    "title",
    "selftext",
    "topic",
    "body",
    "parent_id",
]


In [None]:
def generateSubmissionDF():
    """Fetch submissions for every subreddit in ``topic_map`` and label them by topic.

    For each subreddit, up to 300 submissions with more than 20 comments are
    requested through the module-level ``api`` client. Rows whose ``selftext``
    is ``'[removed]'``, ``'[deleted]'`` or empty, or whose ``title`` is
    ``'[removed]'`` or ``'[deleted]'``, are discarded.

    Subreddits for which the API returns nothing are skipped; indexing the
    resulting column-less frame would otherwise raise ``KeyError`` and abort
    the whole crawl.

    Returns:
        pandas.DataFrame: the kept submissions of all subreddits concatenated,
        with a ``topic`` column taken from ``topic_map`` and an
        ``is_submission`` column set to ``True``.

    Raises:
        ValueError: raised by ``pd.concat`` when no subreddit returned any
            submission at all.
    """
    dfs = []
    for subreddit, topic in topic_map.items():
        submissions = list(
            api.search_submissions(
                subreddit=subreddit,  # one or more subreddits to include in the search
                filter=['id', 'subreddit', 'full_link', 'title', 'selftext', 'author',
                        'is_submission', 'score', 'num_comments'],
                limit=300,
                num_comments=">20",
            )
        )
        print(subreddit, len(submissions))

        if not submissions:
            # Nothing came back: there are no columns to filter on.
            continue

        submission_df = pd.DataFrame(submissions)

        # Missing (NaN) values are not placeholders and are therefore kept,
        # exactly as with the `!=` comparisons this replaces.
        is_placeholder = (
            submission_df['selftext'].isin(['[removed]', '[deleted]', ''])
            | submission_df['title'].isin(['[removed]', '[deleted]'])
        )

        # assign() returns a new frame, so the topic column is never written
        # onto a filtered view of submission_df.
        dfs.append(submission_df[~is_placeholder].assign(topic=topic))

    all_submission_df = pd.concat(dfs)
    all_submission_df['is_submission'] = True

    return all_submission_df


In [None]:

def loadSubmissions():
    """Return the submissions frame, using ``./data/submission.pkl`` as a cache.

    If the pickle exists it is loaded and returned. Otherwise the submissions
    are generated with ``generateSubmissionDF()``, written to the pickle and
    returned.

    Returns:
        pandas.DataFrame: the cached or freshly generated submissions.
    """
    import os  # local import: the file itself only imports `exists` from os.path

    path = './data/submission.pkl'

    if exists(path):
        print("Loading data from file", path)
        # NOTE: unpickling executes arbitrary code; only load cache files this notebook wrote.
        return pd.read_pickle(path)

    submission_df = generateSubmissionDF()
    # Create ./data first so a missing directory cannot discard a finished crawl.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    submission_df.to_pickle(path)
    return submission_df
    

In [None]:

# Load the submissions from ./data/submission.pkl, generating and caching them first if that file is missing.
submission_df = loadSubmissions()

In [None]:
# Display the submissions frame for a visual check.
submission_df

In [None]:
def generateCommentsDF(id, topic, subreddit, parent_body):
    """Fetch the comments of one submission and pair each with its parent's text.

    Comments whose body is ``'[removed]'`` or ``'[deleted]'`` are discarded.
    A remaining comment is kept only when its parent is either the submission
    itself or another remaining comment; the parent's text is stored in a new
    ``parent_body`` column.

    Args:
        id: id of the submission, passed to the API as ``link_id``.
        topic: unused here; kept so existing callers keep working.
        subreddit: subreddit the submission belongs to.
        parent_body: text of the submission, used as ``parent_body`` for
            comments that reply to the submission directly.

    Returns:
        pandas.DataFrame: one row per kept comment, including ``parent_body``.
        Empty when no comment has a known parent.

    Raises:
        KeyError: when the response contains no ``body`` field, which is what
            happens when no comment is returned at all.
    """
    # Request the shared field list plus the comment timestamp.
    fields = list(filters) + ['created_utc']
    response = api.search_comments(filter=fields, link_id=id, subreddit=subreddit)
    comment_df = pd.DataFrame(list(response))

    if 'body' not in comment_df.columns:
        # Same exception type as indexing the missing column would raise, but
        # with a message that names the submission instead of just "'body'".
        raise KeyError(f"no comment bodies returned for submission {id!r}")

    comment_df = comment_df[~comment_df['body'].isin(['[removed]', '[deleted]'])]

    # Text of every possible parent, keyed by the id without its type prefix.
    # The submission is added last so it takes precedence on an id clash.
    parent_text = dict(zip(comment_df['id'], comment_df['body']))
    parent_text[id] = parent_body

    kept_rows = []
    for _, row in comment_df.iterrows():
        # parent_id is split on '_' and the second piece is the parent's id.
        parent_key = row['parent_id'].split('_')[1]
        if parent_key in parent_text:
            row['parent_body'] = parent_text[parent_key]
            kept_rows.append(row)

    return pd.DataFrame(kept_rows)


In [None]:
def loadCommentsDF():
    """Collect the comments of every submission, caching them one pickle per submission.

    For each row of ``loadSubmissions()`` the comments are read from
    ``./data/comments/<id>.pkl`` when that file exists. Otherwise they are
    fetched with ``generateCommentsDF``, tagged with the submission's topic and
    written to that file. A submission whose processing raises is reported and
    skipped so that one failure does not abort the whole run.

    Returns:
        pandas.DataFrame: all per-submission comment frames concatenated, with
        ``is_submission`` set to ``False``.

    Raises:
        ValueError: raised by ``pd.concat`` when no submission produced a frame.
    """
    import os  # local import: the file itself only imports `exists` from os.path

    submission_df = loadSubmissions()

    cache_dir = './data/comments'
    # Without this directory every to_pickle below fails, the error is
    # swallowed by the except clause, and each fetched frame is thrown away.
    os.makedirs(cache_dir, exist_ok=True)

    dfs = []
    # iterrows() has no length, so tqdm needs the total to show real progress.
    for _, row in tqdm(submission_df.iterrows(), total=len(submission_df)):
        submission_id = row['id']
        path = f'{cache_dir}/{submission_id}.pkl'
        try:
            if exists(path):
                # NOTE: unpickling executes arbitrary code; only load cache files this notebook wrote.
                comments_df = pd.read_pickle(path)
            else:
                comments_df = generateCommentsDF(
                    submission_id, row['topic'], row['subreddit'], row['selftext']
                )
                comments_df['topic'] = row['topic']
                comments_df.to_pickle(path)
            dfs.append(comments_df)
        except Exception as e:
            # Deliberately best-effort: report which submission failed and continue.
            print(f"Skipping submission {submission_id}: {e!r}")

    all_comments_df = pd.concat(dfs)
    all_comments_df['is_submission'] = False
    return all_comments_df


In [None]:
# Gather the comments of every submission; per-submission pickles under ./data/comments/ are reused when present.
comments_df = loadCommentsDF()

In [None]:
# Display the comments frame for a visual check.
comments_df

In [40]:
# Stack submissions on top of comments, then strip the 'd_' and 'created'
# columns one after the other.
final = pd.concat([submission_df, comments_df])
for unwanted in ('d_', 'created'):
    final = final.drop(columns=unwanted)

In [42]:
# Metadata columns that are not needed downstream.
drops = ['author', 'created_utc', 'full_link', 'id', 'score', 'num_comments', 'subreddit']

# One drop call instead of a per-column in-place loop: if any column is
# missing, this raises before `final` has been partially stripped.
final = final.drop(columns=drops)

In [45]:
# Persist the combined frame to final.pkl in the working directory.
final.to_pickle('final.pkl')

In [46]:
# Convert the frame to a list of per-row dicts.
# NOTE(review): this rebinds `final` from a DataFrame to a list, so running this
# cell a second time fails (a list has no to_dict); a separate name would avoid that.
final = final.to_dict('records')

In [47]:
# Show only the first few records; echoing the whole list floods the notebook output.
final[:5]

[{'selftext': "I have fallen for this trap several times and got temp banned in both these subreddits. Trolls will goat you on and you end saying something that less than civil. They then report your comment, and delete theirs. It makes it look like you are the one being uncivil. You end up getting banned. \n\nIt's my own fault for saying what I did, and will serve my sentence. But know their goal is not debate. It's to get you banned.",
  'title': 'Be careful when you post to r/politics and r/askreddit - trolls are trying to get you banned',
  'topic': 'politics',
  'is_submission': True,
  'body': nan,
  'parent_id': nan,
  'parent_body': nan},
 {'selftext': 'First off...I am a lifelong Republican.  But the Trumpees, Tea Partry  and fascists have ruined my party.   I do not want them to ruin my country. \n\nThe GOP is going out of their way to rig the election to support the Trump Big Lie.  The US Constitution does not guarantee that we have the right to vote for the US President.  T

Stopped here
-------------

In [None]:
# Reload the combined frame from final.pkl.
# NOTE: unpickling executes arbitrary code; only load files this notebook produced.
res = pd.read_pickle('final.pkl')

In [None]:
# Keep only the text and topic columns. The explicit copy makes slim_df an
# independent frame, so adding columns to it later does not trigger
# SettingWithCopyWarning.
slim_df = res[["selftext", "title", "topic", "body", "parent_body"]].copy()

In [None]:
def get_response(row):
    """Return the text carried by a record.

    The ``body`` field is preferred; ``selftext`` is used when ``body`` is
    missing. ``None`` is returned when both are missing (as judged by
    ``pd.isna``).
    """
    for field in ("body", "selftext"):
        text = row[field]
        if not pd.isna(text):
            return text
    return None

# assign() builds a new frame rather than writing a column into what may be a
# slice of `res` (the source of SettingWithCopyWarning). get_response is passed
# directly; wrapping it in a lambda added nothing.
slim_df = slim_df.assign(response=slim_df.apply(get_response, axis=1))


In [None]:
# Discard rows that have no parent text, then narrow to the three columns used from here on.
slim_df = slim_df.dropna(subset=['parent_body'])[['response', 'parent_body', 'topic']]

In [None]:
# The parent's text is the query. assign() returns a new frame, so nothing is
# written onto the filtered slice produced by the previous cell.
slim_df = slim_df.assign(query=slim_df['parent_body'])


In [None]:
# Keep exactly query, response and topic, in that order.
slim_df = slim_df.loc[:, ['query', 'response', 'topic']]

In [None]:
# Persist the query/response/topic frame to slim.pkl in the working directory.
slim_df.to_pickle('slim.pkl')

In [None]:
# One dict per row, with the keys 'query', 'response' and 'topic'.
slim_records = slim_df.to_dict('records')

In [None]:
# Indexer is imported from a `setup` module that is not shown here (presumably project-local).
# NOTE(review): notebook convention is to keep imports in the first cell.
from setup import Indexer
i=Indexer()
# NOTE(review): what add_fields() configures is defined in `setup`; confirm there.
i.add_fields()

In [None]:
# Pass the query/response/topic records to the indexer.
i.create_documents(slim_records)