In [1]:
import pandas as pd
import os

## Compiling CSVs

In [2]:
# Compile all the .csv's into a single pandas dataframe
dirname = '/Users/austinlasseter/DSI-EC-2/projects/datasets/proj_3/'
files = os.listdir(dirname + "/rawdata/")

COLUMN_NAMES = ['titles', 'subreddit', 'time', 'comments']
df = pd.DataFrame(columns=COLUMN_NAMES)

for file in files:
    small_df = pd.read_csv(dirname + "/rawdata/" + file)
    print(file) # check the filenames are accurate
    print(small_df.columns) # check the column names are accurate
    df=pd.concat([df, small_df], ignore_index=True) # put them all together

February_17-18-29.csv
Index(['titles', 'subreddit', 'time', 'comments'], dtype='object')
February_17-13-20.csv
Index(['titles', 'subreddit', 'time', 'comments'], dtype='object')
February_17-13-21.csv
Index(['titles', 'subreddit', 'time', 'comments'], dtype='object')
February_22-14-29.csv
Index(['titles', 'subreddit', 'time', 'comments'], dtype='object')
February_19-20-33.csv
Index(['titles', 'subreddit', 'time', 'comments'], dtype='object')
February_18-12-22.csv
Index(['titles', 'subreddit', 'time', 'comments'], dtype='object')
February_23-04-58.csv
Index(['titles', 'subreddit', 'time', 'comments'], dtype='object')
February_17-13-31.csv
Index(['titles', 'subreddit', 'time', 'comments'], dtype='object')
February_18-06-51.csv
Index(['titles', 'subreddit', 'time', 'comments'], dtype='object')
February_17-19-51.csv
Index(['titles', 'subreddit', 'time', 'comments'], dtype='object')
February_21-06-40.csv
Index(['titles', 'subreddit', 'time', 'comments'], dtype='object')
February_22-09-08.csv

In [3]:
file='February_18-06-51.csv'
small_df = pd.read_csv(dirname + "/rawdata/" + file)
print(small_df.columns)
small_df.head()

Index(['titles', 'subreddit', 'time', 'comments'], dtype='object')


Unnamed: 0,titles,subreddit,time,comments
0,Larry Nance Jr. recreates his father's dunk fr...,r/sports,5 hours ago,358 comments
1,Bathtime bliss,r/aww,7 hours ago,643 comments
2,"If this does well, I will continue to enlarge ...",r/detroitlions,8 hours ago,283 comments
3,Mister Rogers,r/MadeMeSmile,8 hours ago,598 comments
4,Steam Controller being tested,r/reallifedoodles,8 hours ago,253 comments


In [4]:
df.head()

Unnamed: 0,titles,subreddit,time,comments
0,New Image from Netflix's Post-Apocalyptic Zomb...,r/movies,3 hours ago,600 comments
1,"Philadelphia DA Larry Krasner Sues Big Pharma,...",r/news,3 hours ago,414 comments
2,Pigs in a Blanket Baked Brie [OC],r/GifRecipes,3 hours ago,352 comments
3,There have been 241 posts in /r/The_Donald lin...,r/RussiaLago,4 hours ago,1698 comments
4,Not a care in the world,r/BlackPeopleTwitter,4 hours ago,360 comments


In [5]:
# inspect
print(df.shape)
df.head()

(36505, 4)


Unnamed: 0,titles,subreddit,time,comments
0,New Image from Netflix's Post-Apocalyptic Zomb...,r/movies,3 hours ago,600 comments
1,"Philadelphia DA Larry Krasner Sues Big Pharma,...",r/news,3 hours ago,414 comments
2,Pigs in a Blanket Baked Brie [OC],r/GifRecipes,3 hours ago,352 comments
3,There have been 241 posts in /r/The_Donald lin...,r/RussiaLago,4 hours ago,1698 comments
4,Not a care in the world,r/BlackPeopleTwitter,4 hours ago,360 comments


## Remove duplicates

In [6]:
# The duplicates have different comment values and time values, for obvious reasons.
df.sort_values(by=['titles', 'comments'],  inplace=True)
df.head()

Unnamed: 0,titles,subreddit,time,comments
31896,"""@TheBigJamesG: What kind of president doesn't...",r/TrumpCriticizesTrump,12 hours ago,205 comments
7872,"""@TheBigJamesG: What kind of president doesn't...",r/TrumpCriticizesTrump,4 hours ago,90 comments
13635,"""Alright Pinchy, my little bucket of badness, ...",r/lego,16 hours ago,122 comments
13681,"""Alright Pinchy, my little bucket of badness, ...",r/lego,16 hours ago,122 comments
13728,"""Alright Pinchy, my little bucket of badness, ...",r/lego,16 hours ago,122 comments


In [7]:
# Removing duplicates reduces the size of my dataset by about 4/5.
df.drop_duplicates(['titles'], keep='first', inplace=True);
df.shape

(5304, 4)

In [8]:
# Notice that I kept the duplicate with the highest number of comments.
df.head()

Unnamed: 0,titles,subreddit,time,comments
31896,"""@TheBigJamesG: What kind of president doesn't...",r/TrumpCriticizesTrump,12 hours ago,205 comments
13635,"""Alright Pinchy, my little bucket of badness, ...",r/lego,16 hours ago,122 comments
268,"""Are you my Dad?""",r/aww,2 hours ago,10 comments
356,"""Behold a Pale Horse"" book from 1991, by Bill ...",r/CBTS_Stream,3 hours ago,4 comments
33453,"""Can I have a 5p bag with that?"". ""Sorry, we o...",r/britishproblems,11 hours ago,366 comments


## Data Cleaning

In [9]:
#Let's clean up some of that data.
df['sub']= (df['subreddit'].apply(lambda x: x.split('r/'))).apply(lambda x: x[1])
df['comments_count']= (df['comments'].apply(lambda x: x.split(' '))).apply(lambda x: x[0])
df['comments_count']=df['comments_count'].astype(int)

In [10]:
# Break up the time variable into its parts
df['time_string']= (df['time'].apply(lambda x: x.split(' ago'))).apply(lambda x: x[0])
df['time_count']= (df['time_string'].apply(lambda x: x.split(' '))).apply(lambda x: x[0])
df['time_type']= (df['time_string'].apply(lambda x: x.split(' '))).apply(lambda x: x[1])
# Convert from string to integer
df['time_count']=df['time_count'].astype(int)

In [11]:
# create a new column, called minutes.
df['minutes']=df['time_count']
# create a pair of arrays, with the time_count for hour/hours.
hours=df.loc[df['time_type']=='hours', 'time_count']
hour=df.loc[df['time_type']=='hour', 'time_count']
# multiply that by 60 and drop it into the 'minutes' column
df.loc[df['time_type']=='hours', 'minutes']=hours*60
df.loc[df['time_type']=='hour', 'minutes']=hour*60
# confirm that did what we wanted it to.
df.loc[df['time_type']=='minutes'].head(5)

Unnamed: 0,titles,subreddit,time,comments,sub,comments_count,time_string,time_count,time_type,minutes
661,"""Look, I'm not homeless"": A Dissection",r/rva,28 minutes ago,12 comments,rva,12,28 minutes,28,minutes,28
22595,A rare 7 in classic minesweeper,r/gaming,54 minutes ago,23 comments,gaming,23,54 minutes,54,minutes,54
5921,A sign outside CPAC where the head of the NRA ...,r/pics,54 minutes ago,96 comments,pics,96,54 minutes,54,minutes,54
2129,BOY TRIXIE'S REAL EVOLUTION,r/rupaulsdragrace,15 minutes ago,19 comments,rupaulsdragrace,19,15 minutes,15,minutes,15
12737,Close inspection of VAR actually shows lukaku ...,r/reddevils,28 minutes ago,28 comments,reddevils,28,28 minutes,28,minutes,28


In [12]:
# keep only the four cleaned columns, and fix their names.
cols=['title', 'subreddit', 'comments', 'minutes']
df=df.drop(['subreddit', 'time', 'comments', 'time_string', 'time_count', 'time_type'], axis=1)
df.columns=cols

In [13]:
print(df.shape)
df.head()

(5304, 4)


Unnamed: 0,title,subreddit,comments,minutes
31896,"""@TheBigJamesG: What kind of president doesn't...",TrumpCriticizesTrump,205,720
13635,"""Alright Pinchy, my little bucket of badness, ...",lego,122,960
268,"""Are you my Dad?""",aww,10,120
356,"""Behold a Pale Horse"" book from 1991, by Bill ...",CBTS_Stream,4,180
33453,"""Can I have a 5p bag with that?"". ""Sorry, we o...",britishproblems,366,660


In [14]:
# Save the df data.
df.to_csv(dirname + '/data.csv', index=False)