In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [2]:
# Folder holding the scraped Reddit tables; defined once so both reads stay in sync
# and the location can be changed in a single place.
DATA_DIR = "/content/drive/MyDrive/asalytics"

# Raw posts and comments tables, read as-is from CSV.
df_post = pd.read_csv(f"{DATA_DIR}/reddit_posts_table.csv")
df_comment = pd.read_csv(f"{DATA_DIR}/reddit_comments_table.csv")

In [3]:
# Preview the first rows of the posts table to check the columns that were loaded.
df_post.head()

Unnamed: 0,id,title,post_text,score,total_comments,post_url,time_created,asa_id
0,r2inwn,r/choicecoin Lounge,A place for members of r/choicecoin to chat wi...,86,120,https://www.reddit.com/r/choicecoin/comments/r...,1637916000.0,choicecoin
1,sgxogi,To an Amazing community...🎁$choice,Choice referral program is to reward and encou...,7,5,https://www.reddit.com/r/choicecoin/comments/s...,1643623000.0,choicecoin
2,ttvgjy,It has a pulse... Choice coin is on the up!,Looks like moves higher... seems like we have ...,15,4,https://www.reddit.com/r/choicecoin/comments/t...,1648832000.0,choicecoin
3,tq90u4,Choicecoin Dead?,,6,22,https://www.reddit.com/r/choicecoin/comments/t...,1648473000.0,choicecoin
4,tq6115,New introduction,,2,0,https://app.kaafila.org/#/watch?v=QmNQm7uFRUXn...,1648462000.0,choicecoin


In [4]:
# Preview the first rows of the comments table to check the columns that were loaded.
df_comment.head()

Unnamed: 0,id,body,score,parent_id,created_at,asa_id
0,hpedoe6,Anyone think it will go back up?😅,2,t3_r2inwn,1640071042.0,choicecoin
1,hrdqoi7,"Fair to say this is a rug pull coin, yeah?",2,t3_r2inwn,1641405064.0,choicecoin
2,hmpgv8u,Hi there,1,t3_r2inwn,1638304386.0,choicecoin
3,hmpgweu,any chart to follow choice coin?,1,t3_r2inwn,1638304399.0,choicecoin
4,hmq5qo3,Hi everyone,1,t3_r2inwn,1638314603.0,choicecoin


In [5]:
class reddit_analysis():
  """Cleaning pipeline that joins Reddit posts with their comments.

  Each step takes a DataFrame and returns a NEW DataFrame; the frames passed
  in by the caller are never modified, so every step (and the notebook cell
  that calls it) can be re-run safely. `all` chains the steps in order.
  """

  # Raw column name -> name used after the posts/comments tables are merged.
  _POST_RENAMES = {"id": "parent_id", "title": "post_title", "score": "post_upvotes",
                   "time_created": "post_time_created"}
  _COMMENT_RENAMES = {"id": "comment_id", "body": "comment_text", "score": "comment_upvotes",
                      "created_at": "comment_time_created"}
  # Anchored so only a leading "t3_" is removed, never characters of the id itself.
  _POST_PREFIX_PATTERN = r"^t3_"
  # Anything starting with "http" up to the next whitespace is treated as a URL.
  _URL_PATTERN = r"http\S+"

  def __init__(self):
    # Last frame produced by a rename/strip step or by `all`; None until one runs.
    self.data = None

  def rename_post_columns(self, df_post):
    """Return a copy of the posts table with post-specific column names.

    `id` becomes `parent_id` so it can be used as the join key against the
    comments table. The input frame is left untouched.
    """
    self.data = df_post.rename(columns=self._POST_RENAMES)
    return self.data

  def rename_comment_columns(self, df_comment):
    """Return a copy of the comments table with comment-specific column names.

    The input frame is left untouched.
    """
    self.data = df_comment.rename(columns=self._COMMENT_RENAMES)
    return self.data

  def strip_commentid_prefix(self, df_comment):
    """Return a copy of `df_comment` with a leading "t3_" removed from `parent_id`.

    Uses an anchored regex rather than `str.lstrip("t3_")`: `lstrip` removes
    any leading run of the characters 't', '3' and '_', which corrupts ids
    that themselves start with one of them ("t3_tq90u4" -> "q90u4") and makes
    those comments unmatchable in the merge. Values without the prefix and
    missing values are returned unchanged.
    """
    stripped = df_comment["parent_id"].str.replace(self._POST_PREFIX_PATTERN, "", regex=True)
    self.data = df_comment.assign(parent_id=stripped)
    return self.data

  def merge_two_dfs(self, df_post, df_comment):
    """Right-join posts onto comments on `parent_id`.

    Every comment row is kept; comments whose post is not present get NaN in
    the post columns.
    """
    return pd.merge(df_post, df_comment, on="parent_id", how="right")

  def drop_missing_values(self, df):
    """Return `df` without any row that has a missing value in any column.

    NOTE(review): after the merge this also removes every comment whose post
    has an empty `post_text` -- confirm that this is intended.
    """
    return df.dropna(axis=0, how="any")

  def remove_URL(self, df, data:list):
    """Return a copy of `df` with URLs removed from the text columns in `data`.

    Parameters:
      df: frame holding the text columns.
      data: names of string columns to clean.

    Missing values stay missing. Errors (for example a non-text column) are
    raised to the caller instead of being turned into a returned string.
    """
    cleaned = {
        col: df[col].str.replace(self._URL_PATTERN, "", regex=True)
        for col in data
    }
    return df.assign(**cleaned)

  def special_character_removal(self, post_comment_text):
    """Lower-case `post_comment_text` and keep only alphabetic characters.

    The text is split on single spaces and every word is filtered on its own,
    so the spaces between words (including repeated ones) are preserved.
    Digits, punctuation and emoji are removed.
    """
    return " ".join(
        "".join(char.lower() for char in word if char.isalpha())
        for word in post_comment_text.split(" ")
    )

  def access_scr(self, df, data:list):
    """Return a copy of `df` with `special_character_removal` applied to the
    columns named in `data`. Missing values are skipped and stay missing.
    """
    cleaned = {
        col: df[col].map(self.special_character_removal, na_action="ignore")
        for col in data
    }
    return df.assign(**cleaned)

  def unix_to_utc(self, df, date_columns:list):
    """Return a copy of `df` with `date_columns` converted to UTC datetimes.

    Numeric values are interpreted as Unix timestamps in seconds and truncated
    to whole seconds. Columns that are already datetimes are kept as they are,
    and non-numeric values (e.g. already formatted date strings) are parsed
    with `pd.to_datetime`, so calling this twice is harmless.
    """
    converted = {}
    for col in date_columns:
      series = df[col]
      if pd.api.types.is_datetime64_any_dtype(series):
        converted[col] = series
        continue
      seconds = pd.to_numeric(series, errors="coerce")
      parsed = pd.to_datetime(seconds, unit="s").dt.floor("s")
      # Values that are present but not numeric are assumed to be date strings.
      date_like = seconds.isna() & series.notna()
      if date_like.any():
        parsed = parsed.copy()
        parsed[date_like] = pd.to_datetime(series[date_like])
      converted[col] = parsed
    return df.assign(**converted)

  def extract_date(self, df,date_columns:list):
    """Replace each datetime column in `date_columns` by three integer columns.

    For a column `x` the result has `x_year`, `x_day` and `x_month` (appended
    in that order) and no longer contains `x`. The columns must already be
    datetimes, e.g. as produced by `unix_to_utc`.
    """
    parts = {}
    for col in date_columns:
      parts[col + "_year"] = df[col].dt.year
      parts[col + "_day"] = df[col].dt.day
      parts[col + "_month"] = df[col].dt.month
    return df.assign(**parts).drop(columns=date_columns)

  def all(self, df_post, df_comment):
    """Run the full cleaning pipeline and return the merged, cleaned frame.

    Steps: rename columns, strip the "t3_" prefix from the comments'
    `parent_id`, right-join posts onto comments, drop rows with missing
    values, remove URLs, strip special characters, convert the Unix
    timestamps and split them into year/day/month columns.

    `df_post` and `df_comment` are not modified.
    """
    posts = self.rename_post_columns(df_post)
    comments = self.rename_comment_columns(df_comment)
    comments = self.strip_commentid_prefix(comments)
    df = self.merge_two_dfs(posts, comments)
    df = self.drop_missing_values(df)
    df = self.remove_URL(df, ["post_text", "comment_text"])
    df = self.access_scr(df, ["post_title", "post_text", "comment_text"])
    df = self.unix_to_utc(df, ["post_time_created", "comment_time_created"])
    df = self.extract_date(df, ["post_time_created", "comment_time_created"])
    self.data = df
    return df

In [6]:
# Run the whole cleaning pipeline and keep the result under a name so that
# later cells can reuse it instead of recomputing it.
reddit_clean = reddit_analysis().all(df_post, df_comment)
reddit_clean

Unnamed: 0,parent_id,post_title,post_text,post_upvotes,total_comments,post_url,asa_id_x,comment_id,comment_text,comment_upvotes,asa_id_y,post_time_created_year,post_time_created_day,post_time_created_month,comment_time_created_year,comment_time_created_day,comment_time_created_month
0,r2inwn,rchoicecoin lounge,a place for members of rchoicecoin to chat wit...,86.0,120.0,https://www.reddit.com/r/choicecoin/comments/r...,choicecoin,hpedoe6,anyone think it will go back up,2,choicecoin,2021,26,11,2021,21,12
1,r2inwn,rchoicecoin lounge,a place for members of rchoicecoin to chat wit...,86.0,120.0,https://www.reddit.com/r/choicecoin/comments/r...,choicecoin,hrdqoi7,fair to say this is a rug pull coin yeah,2,choicecoin,2021,26,11,2022,5,1
2,r2inwn,rchoicecoin lounge,a place for members of rchoicecoin to chat wit...,86.0,120.0,https://www.reddit.com/r/choicecoin/comments/r...,choicecoin,hmpgv8u,hi there,1,choicecoin,2021,26,11,2021,30,11
3,r2inwn,rchoicecoin lounge,a place for members of rchoicecoin to chat wit...,86.0,120.0,https://www.reddit.com/r/choicecoin/comments/r...,choicecoin,hmpgweu,any chart to follow choice coin,1,choicecoin,2021,26,11,2021,30,11
4,r2inwn,rchoicecoin lounge,a place for members of rchoicecoin to chat wit...,86.0,120.0,https://www.reddit.com/r/choicecoin/comments/r...,choicecoin,hmq5qo3,hi everyone,1,choicecoin,2021,26,11,2021,30,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5776,nxagkm,airdrop information eligible airdrop addresses...,hello everyoneto thank you all for the feedbac...,12.0,11.0,https://www.reddit.com/r/Algogems/comments/nxa...,Algogems,h1e93bp,lotteries will they be available in usa,2,Algogems,2021,11,6,2021,11,6
5777,nxagkm,airdrop information eligible airdrop addresses...,hello everyoneto thank you all for the feedbac...,12.0,11.0,https://www.reddit.com/r/Algogems/comments/nxa...,Algogems,h1ux5bf,can we get an updated list or is this complete,2,Algogems,2021,11,6,2021,15,6
5778,nx57l6,image size limitations,ok i had to resize my pictures to make them sm...,4.0,1.0,https://www.reddit.com/r/Algogems/comments/nx5...,Algogems,h1dnlxo,hi thank you for reporting the issue we were n...,1,Algogems,2021,11,6,2021,11,6
5780,nt3x6p,error messages,please someone who can explain what is the gu...,5.0,9.0,https://www.reddit.com/r/Algogems/comments/nt3...,Algogems,h0q3twt,regarding the issue of title description it ce...,2,Algogems,2021,5,6,2021,5,6
