# Exploration of Kaggle Data

In [100]:
# Select Dataset for Exploration (options: wsb_df, all_df)
dataset = "all_df"

In [119]:
# Select the bin size in seconds (e.g. 604800 for a week)
bin_size = 604800

#### Load Data

In [120]:
#imports
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [121]:
wsb_df = pd.read_csv("../../Data/reddit_wsb.csv")
all_df = pd.read_csv("../../Data/r_dataisbeautiful_posts.csv", low_memory=False)

In [124]:
if dataset=="wsb_df":
    df=wsb_df
elif dataset=="all_df":
    df=all_df
    df=df.rename({'created_utc': 'created'}, axis=1)

#### Small Data Analysis

In [125]:
df.head(3)

Unnamed: 0,id,title,score,author,author_flair_text,removed_by,total_awards_received,awarders,created,full_link,num_comments,over_18
0,ll1p9h,Wordcloud of trending video titles on YouTube ...,1,OmarZiada,OC: 1,,0.0,[],1613473961,https://www.reddit.com/r/dataisbeautiful/comme...,0,False
1,ll1o4h,Wordcloud of trending videos on YouTube in the...,1,OmarZiada,OC: 1,moderator,0.0,[],1613473829,https://www.reddit.com/r/dataisbeautiful/comme...,1,False
2,ll15gx,Immunization in India. Source: https://niti.go...,1,Professional_Napper_,,moderator,0.0,[],1613471541,https://www.reddit.com/r/dataisbeautiful/comme...,1,False


In [126]:
start_date = datetime.fromtimestamp(df.created.min())
end_date = datetime.fromtimestamp(df.created.max())

In [127]:
print(f"The dataset is from {start_date} to {end_date} and has {df.shape[0]} datapoints.")

The dataset is from 2012-02-15 00:50:56 to 2021-02-16 12:12:41 and has 190853 datapoints.


#### Data To Timeseries

In [128]:
df = df[['title','score','created']]

In [129]:
# Binning data into buckets of selected bin size
dates = sorted(list(set(df['created'])))
mini, maxi = min(dates),max(dates)
df['bins'] = df['created'].map(lambda x:int((x-mini)//bin_size))

In [130]:
aggregation_dict = {'title':'sum'}
df = df.groupby(df['bins']).aggregate(aggregation_dict)

#### Text Processing

In [131]:
tfidf = TfidfVectorizer(
    analyzer='word',
    lowercase=True,
    stop_words='english',
    max_features=None
)
features=tfidf.fit_transform(df['title']).toarray()   
index = df.index

In [132]:
df = pd.DataFrame(
    data=features,
    columns=range(len(features[0])),
    index=index,
)

In [134]:
df.head(3)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,135144,135145,135146,135147,135148,135149,135150,135151,135152,135153
bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Save Data

In [135]:
#df.to_csv('wsb_time_series.csv')