In [1]:
import pandas as pd
import os

## Compiling CSVs

In [2]:
# Compile all the .csv's into a single pandas dataframe.
dirname = os.getcwd()
data_dir = os.path.join(dirname, "data")

# os.listdir returns every directory entry in arbitrary order, so keep only
# the CSV files and sort them to make the row order repeatable between runs.
files = sorted(
    name for name in os.listdir(data_dir) if name.lower().endswith(".csv")
)

COLUMN_NAMES = ['titles', 'subreddit', 'time', 'comments']

# Read every file first and concatenate once; calling pd.concat inside the
# loop re-copies all previously loaded rows on each iteration.
frames = [pd.read_csv(os.path.join(data_dir, name)) for name in files]
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    # No CSV files found: fall back to an empty frame with the expected columns.
    df = pd.DataFrame(columns=COLUMN_NAMES)


In [3]:
# Sanity check on the combined frame: its (rows, columns) size, then the first five rows.
print(df.shape)
df.head(n=5)

(33595, 4)


Unnamed: 0,titles,subreddit,time,comments
0,New Image from Netflix's Post-Apocalyptic Zomb...,r/movies,3 hours ago,600 comments
1,"Philadelphia DA Larry Krasner Sues Big Pharma,...",r/news,3 hours ago,414 comments
2,Pigs in a Blanket Baked Brie [OC],r/GifRecipes,3 hours ago,352 comments
3,There have been 241 posts in /r/The_Donald lin...,r/RussiaLago,4 hours ago,1698 comments
4,Not a care in the world,r/BlackPeopleTwitter,4 hours ago,360 comments


## Remove duplicates

In [4]:
# Copies of the same post differ in their 'time' and 'comments' values, so
# duplicates are grouped by title. Within each title, order the rows by their
# *numeric* comment count, largest first, so that the drop_duplicates(keep='first')
# step that follows retains the most-commented copy.
# 'comments' is still text at this point (e.g. '205 comments'); sorting the raw
# strings would compare them character by character ('200 comments' sorts
# before '300 comments'), which does not put the largest count first.
comment_totals = pd.to_numeric(
    df['comments'].str.split(' ').str[0],  # leading token of '205 comments'
    errors='coerce',                       # unparseable values become NaN and sort last
)
df = (
    df.assign(_comment_total=comment_totals)
      .sort_values(by=['titles', '_comment_total'], ascending=[True, False])
      .drop('_comment_total', axis=1)      # helper column is only needed for the sort
)
df.head()

Unnamed: 0,titles,subreddit,time,comments
28986,"""@TheBigJamesG: What kind of president doesn't...",r/TrumpCriticizesTrump,12 hours ago,205 comments
7872,"""@TheBigJamesG: What kind of president doesn't...",r/TrumpCriticizesTrump,4 hours ago,90 comments
10725,"""Alright Pinchy, my little bucket of badness, ...",r/lego,16 hours ago,122 comments
10771,"""Alright Pinchy, my little bucket of badness, ...",r/lego,16 hours ago,122 comments
10818,"""Alright Pinchy, my little bucket of badness, ...",r/lego,16 hours ago,122 comments


In [5]:
# De-duplicate on title, retaining whichever row comes first in the current order.
# In the recorded run this shrank the frame from 33,595 rows to 4,793.
df.drop_duplicates(subset=['titles'], keep='first', inplace=True)
df.shape

(4793, 4)

In [6]:
# Each title now appears once: the row that sorted first among its duplicates.
df.head(n=5)

Unnamed: 0,titles,subreddit,time,comments
28986,"""@TheBigJamesG: What kind of president doesn't...",r/TrumpCriticizesTrump,12 hours ago,205 comments
10725,"""Alright Pinchy, my little bucket of badness, ...",r/lego,16 hours ago,122 comments
268,"""Are you my Dad?""",r/aww,2 hours ago,10 comments
356,"""Behold a Pale Horse"" book from 1991, by Bill ...",r/CBTS_Stream,3 hours ago,4 comments
30543,"""Can I have a 5p bag with that?"". ""Sorry, we o...",r/britishproblems,11 hours ago,366 comments


## Data Cleaning

In [7]:
# Let's clean up some of that data.
# The vectorised .str methods replace the chained apply/lambda pairs: the same
# split-and-pick logic, expressed once per column.

# 'r/aww' -> 'aww'. n=1 splits at the first 'r/' only, so the remainder of the
# value is kept intact.
df['sub'] = df['subreddit'].str.split('r/', n=1).str[1]

# '600 comments' -> 600: take the leading token and convert it to an integer.
df['comments_count'] = df['comments'].str.split(' ').str[0].astype(int)

In [8]:
# Break up the time variable into its parts.
# '3 hours ago' -> '3 hours'
df['time_string'] = df['time'].str.split(' ago').str[0]

# Split '3 hours' once and reuse the pieces for both derived columns.
time_parts = df['time_string'].str.split(' ')
# Leading token is the quantity; convert from string to integer.
df['time_count'] = time_parts.str[0].astype(int)
# Second token is the unit label (e.g. 'hours', 'minutes').
df['time_type'] = time_parts.str[1]

In [9]:
# Create a new column, 'minutes', holding each post's age in minutes.
# Every unit label is converted through an explicit table. Previously only
# 'hour'/'hours' were scaled, so any other unit (for example 'days') would have
# been stored as if its count were already in minutes.
MINUTES_PER_UNIT = {
    'minute': 1, 'minutes': 1,
    'hour': 60, 'hours': 60,
    'day': 24 * 60, 'days': 24 * 60,
}
unit_minutes = df['time_type'].map(MINUTES_PER_UNIT)

# Fail loudly on a label the table does not cover instead of guessing a scale.
unknown_units = df.loc[unit_minutes.isna(), 'time_type'].unique()
if len(unknown_units) > 0:
    raise ValueError(
        "Unrecognised time unit(s) in 'time_type': {}".format(
            sorted(map(str, unknown_units))
        )
    )

df['minutes'] = df['time_count'] * unit_minutes.astype(int)

# confirm that did what we wanted it to.
df.loc[df['time_type']=='minutes'].head(5)

Unnamed: 0,titles,subreddit,time,comments,sub,comments_count,time_string,time_count,time_type,minutes
661,"""Look, I'm not homeless"": A Dissection",r/rva,28 minutes ago,12 comments,rva,12,28 minutes,28,minutes,28
19685,A rare 7 in classic minesweeper,r/gaming,54 minutes ago,23 comments,gaming,23,54 minutes,54,minutes,54
5921,A sign outside CPAC where the head of the NRA ...,r/pics,54 minutes ago,96 comments,pics,96,54 minutes,54,minutes,54
2127,As a teacher in the U.S. this seems more real ...,r/PoliticalHumor,11 minutes ago,36 comments,PoliticalHumor,36,11 minutes,11,minutes,11
2129,BOY TRIXIE'S REAL EVOLUTION,r/rupaulsdragrace,15 minutes ago,19 comments,rupaulsdragrace,19,15 minutes,15,minutes,15


In [10]:
# Keep only the four cleaned columns, and fix their names.
# Columns are selected and renamed by name, so the result does not depend on
# the order in which the working columns happened to be created.
cols = ['title', 'subreddit', 'comments', 'minutes']
cleaned_cols = ['titles', 'sub', 'comments_count', 'minutes']

# Select first, then rename: the raw 'subreddit' and 'comments' text columns are
# already gone by the time the cleaned columns take over those names.
df = df[cleaned_cols].rename(columns=dict(zip(cleaned_cols, cols)))

In [11]:
# Final check of the cleaned frame: its (rows, columns) size, then the first five rows.
print(df.shape)
df.head(n=5)

(4793, 4)


Unnamed: 0,title,subreddit,comments,minutes
28986,"""@TheBigJamesG: What kind of president doesn't...",TrumpCriticizesTrump,205,720
10725,"""Alright Pinchy, my little bucket of badness, ...",lego,122,960
268,"""Are you my Dad?""",aww,10,120
356,"""Behold a Pale Horse"" book from 1991, by Bill ...",CBTS_Stream,4,180
30543,"""Can I have a 5p bag with that?"". ""Sorry, we o...",britishproblems,366,660


In [13]:
# Persist the cleaned frame as CSV, leaving the row index out of the file.
# NOTE(review): the destination is an absolute path on one specific machine;
# consider a project-relative path — confirm what downstream readers expect.
output_csv = '/Users/austinlasseter/DSI-EC-2/projects/project-3/data.csv'
df.to_csv(output_csv, index=False)