# Imports

In [None]:
import pandas as pd    # To load the data
import re              # To clean the data

# Read in and display dataset

In [None]:
# Location of the comments CSV (file name: hacker_news_comments.csv on Zenodo)
dataset_url = 'https://zenodo.org/record/45901/files/hacker_news_comments.csv'

# Fetch the CSV and parse it into a DataFrame, decoding the bytes as UTF-8
df = pd.read_csv(
    dataset_url,
    encoding='utf-8',
)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() only appends a stray "None" line.
df.info()

# Show ten randomly chosen rows as the cell's rich output
df.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1165439 entries, 0 to 1165438
Data columns (total 11 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   story_id              1165439 non-null  int64 
 1   story_time            1165439 non-null  int64 
 2   story_url             987540 non-null   object
 3   story_text            183381 non-null   object
 4   story_author          1164932 non-null  object
 5   comment_id            1165439 non-null  int64 
 6   comment_text          1165439 non-null  object
 7   comment_author        1165439 non-null  object
 8   comment_ranking       1165439 non-null  int64 
 9   author_comment_count  1165439 non-null  int64 
 10  story_comment_count   1165439 non-null  int64 
dtypes: int64(6), object(5)
memory usage: 97.8+ MB
None


Unnamed: 0,story_id,story_time,story_url,story_text,story_author,comment_id,comment_text,comment_author,comment_ranking,author_comment_count,story_comment_count
131439,10248203,1442768771,https://www.timeshighereducation.com/news/erns...,,rcurry,10248360,The title for the HN submission is currently v...,jamessb,12,35,17
350621,6446077,1380131112,http://www.theguardian.com/world/2013/sep/25/r...,,yapcguy,6446498,As referenced to in the article and highly rel...,vowelless,12,63,26
768891,3958627,1336734523,http://www.stallman.org/articles/asked_to_lie....,,gulbrandr,3959360,Offtopic: I liked how minimalistic the site lo...,JBiserkov,12,92,50
1087875,9218213,1426603250,http://www.slideshare.net/Odersky/scala-days-s...,,youroub,9218567,Is the compiler faster these days?,BonoboBoner,9,131,11
1111543,1464238,1277584225,,"Like a lot of HN hackers, I've been living and...",cageface,1464373,Anywhere in central California (the central va...,eam,12,121,31
872921,3943824,1336488098,,"(been on HN for half a decade, but posting ano...",thrwwy20120508,3944281,Shitty jobs happen. People don't always admit ...,michaelochurch,14,1441,55
744917,9415891,1429640114,http://automatetheboringstuff.com/,,adamnemecek,9417394,This is kind of how I got my girlfriend intere...,akilism,10,52,17
746730,7801834,1401141263,https://medium.com/p/2a7af4788b10,,coldtea,7803207,&lt;offtopic&gt;<p>+1 just for mentioning requ...,annnnd,15,77,44
485436,3313790,1323075356,http://me.veekun.com/blog/2011/12/04/fuck-pass...,,vetler,3313938,"Awesome rant.<p>In my mind, someone (browser v...",dprice1,9,16,60
446092,549298,1239041799,http://www.tapinko.com,,keltecp11,549626,"I'm using Firefox 2.0.0.7, which I don't want ...",zavulon,21,209,27


# Step-by-step comment cleaning

In [None]:
# Walk through each cleaning step on a single comment so the effect of every
# stage can be inspected before the steps are bundled into a function.
test_comment = df['comment_text'][2]
print(test_comment)
print()

# Remove HTML tags
result_1 = re.sub(r'<.*?>', '', test_comment)

# Remove URLs: "http:" / "https:" followed by a run of URL characters.
# The range $-_ inside the class spans ':' '/' ';' '=' '?' and friends, so
# URLs whose slashes are entity-encoded (&#x2F;) are matched as well.
# Requiring the ':' keeps ordinary words such as "httpd" from being deleted.
# (Raw string: the previous non-raw literal contained the invalid escape "\(".)
result_2 = re.sub(r'https?:[a-zA-Z0-9$-_@.&+#!*(),]+', '', result_1)

# Remove HTML character entities (&#x27; &#39; &amp; ...). The pattern only
# accepts real entity syntax, so a stray '&' no longer swallows everything
# up to the next ';'.
result_3 = re.sub(r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);',
                  '', result_2)

# Remove non-ASCII characters
result_4 = ''.join(char for char in result_3 if ord(char) < 128)

# Replace every remaining non-alphanumeric character (except spaces) by a space
result_5 = re.sub(r'[^a-zA-Z0-9 ]', ' ', result_4)

result_5

I like this a lot!<p>The research manuscript example is exciting. It would be great if authors could link directly to the part of a paper that they are citing and be able to open that up if you want to dive deeper. Linking methods to results to discussion for specific experiments would make reading through dense papers a lot easier, and maybe have a notation&#x2F;jargon definition section open at the same time. It&#x27;s almost like a tiling window manager for reading.<p>I&#x27;m a little bit concerned about how it looks on smaller screens. It looks fine on my work monitor but I only have a netbook at home right now and a lot of websites have overlapping elements that keep me from reading articles. I haven&#x27;t looked at this from that computer yet though. Maybe it would help to have collapsible columns if there are issues.<p>Good luck, I&#x27;m looking forward to seeing where this goes!



'I like this a lot The research manuscript example is exciting  It would be great if authors could link directly to the part of a paper that they are citing and be able to open that up if you want to dive deeper  Linking methods to results to discussion for specific experiments would make reading through dense papers a lot easier  and maybe have a notationjargon definition section open at the same time  Its almost like a tiling window manager for reading Im a little bit concerned about how it looks on smaller screens  It looks fine on my work monitor but I only have a netbook at home right now and a lot of websites have overlapping elements that keep me from reading articles  I havent looked at this from that computer yet though  Maybe it would help to have collapsible columns if there are issues Good luck  Im looking forward to seeing where this goes '

In [None]:
# Pre-compiled patterns shared by every call to _clean_comment.
_HTML_TAG_RE = re.compile(r'<.*?>')
# "http:" / "https:" followed by a run of URL characters. The range $-_ spans
# ':' '/' ';' '=' '?' etc., so URLs whose slashes are entity-encoded (&#x2F;)
# are matched too. Requiring the ':' keeps plain words that merely start with
# "http" (e.g. "httpd") from being deleted.
_URL_RE = re.compile(r'https?:[a-zA-Z0-9$-_@.&+#!*(),]+')
# Numeric (&#39;), hexadecimal (&#x27;) and named (&amp;) character entities.
# Only real entity syntax is accepted so that a stray '&' does not swallow
# everything up to the next ';'.
_HTML_ENTITY_RE = re.compile(
    r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);')
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9 ]')


def _clean_comment(comment):
  """Strip markup, URLs and non-alphanumeric characters from one comment."""
  # Missing values (NaN/None) are passed through instead of crashing re.sub
  if not isinstance(comment, str):
    return comment

  text = _HTML_TAG_RE.sub('', comment)       # HTML tags
  text = _URL_RE.sub('', text)               # URLs (before entities are removed)
  text = _HTML_ENTITY_RE.sub('', text)       # HTML character entities
  text = text.encode('ascii', 'ignore').decode('ascii')  # non-ASCII characters
  # Punctuation and other leftovers become spaces so words stay separated
  return _NON_ALNUM_RE.sub(' ', text)


def wrangle(df, subset=200000, col='comment_text', random_state=99):
  """
  Return a cleaned random subset of a dataframe.

  The text in `col` is stripped of HTML tags, URLs, HTML character
  entities and non-ASCII characters; every remaining character that is
  not a letter, digit or space is replaced by a space. The input
  dataframe is left unmodified.

  Parameters
  ----------
  df : pandas.DataFrame
      Source data.
  subset : int, default 200000
      Number of rows to sample. Clamped to ``len(df)`` when the frame
      has fewer rows, so small frames no longer raise.
  col : label, default 'comment_text'
      Column holding the text to clean. Non-string values in this
      column (e.g. NaN) are kept as they are.
  random_state : int, default 99
      Seed passed to ``DataFrame.sample`` for reproducible subsets.

  Returns
  -------
  pandas.DataFrame
      The sampled rows (original index labels kept) with `col` cleaned.
  """

  # Never ask for more rows than exist; sampling without replacement would raise
  n_rows = subset
  if subset is not None and subset > len(df):
    n_rows = len(df)

  # sample() returns a new frame, so the caller's dataframe is not modified
  sampled = df.sample(n=n_rows, random_state=random_state)

  # One pass over the column applies all cleaning steps
  sampled[col] = sampled[col].apply(_clean_comment)

  # Return subset cleaned dataframe
  return sampled

In [None]:
# Clean a random subset of the comments using wrangle's default settings
df_cleaned = wrangle(df=df)

# Spot-check ten randomly chosen cleaned comments
df_cleaned['comment_text'].sample(n=10)

1031141    Cucumbertown does this very well  They allow y...
54769      Over 50  of this article is a job posting    I...
359002     Google is located in an area where there aren ...
1070188    I really wish the article explicitly explained...
970692     I was having severe pain with this last month ...
229694      On this day  I highlighted her workstation an...
728108     We still have 2 client manager and 1 sales pos...
1110051    There are several reasons why I dislike sports...
391295     For those as confused as I was  it turns out y...
542387             They should hire him in the Android team 
Name: comment_text, dtype: object

In [None]:
# Write the cleaned subset to disk first so the file is saved even when the
# notebook is not running inside Google Colab.
df_cleaned.to_csv('cleaned_comments_2.csv', index=False)

# google.colab only exists inside Colab; import it here, guarded, so the rest
# of the notebook stays runnable elsewhere.
try:
  from google.colab import files
except ImportError:
  print('google.colab is not available; cleaned_comments_2.csv was saved locally.')
else:
  # Trigger a browser download of the file from the Colab VM
  files.download('cleaned_comments_2.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>