# Unit 3 - Who is the Saltiest Hacker?
## Data Engineering by: Jay Adamo, Brad Brauser & Ryan Koul

# Exploratory Data Analysis

In [None]:
# Importing necessary libraries
import pandas as pd
import re
# Source CSV of Hacker News comments, hosted on Zenodo
dataset_url = 'https://zenodo.org/record/45901/files/hacker_news_comments.csv'
# Read the remote CSV straight into a dataframe; later cells refer to it as `df`
df = pd.read_csv(dataset_url, encoding='utf-8')


In [None]:
# Inspecting the schema, then previewing 10 randomly sampled rows.
# df.info() prints its own report and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output).
df.info()
df.sample(10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1165439 entries, 0 to 1165438
Data columns (total 11 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   story_id              1165439 non-null  int64 
 1   story_time            1165439 non-null  int64 
 2   story_url             987540 non-null   object
 3   story_text            183381 non-null   object
 4   story_author          1164932 non-null  object
 5   comment_id            1165439 non-null  int64 
 6   comment_text          1165439 non-null  object
 7   comment_author        1165439 non-null  object
 8   comment_ranking       1165439 non-null  int64 
 9   author_comment_count  1165439 non-null  int64 
 10  story_comment_count   1165439 non-null  int64 
dtypes: int64(6), object(5)
memory usage: 97.8+ MB
None


Unnamed: 0,story_id,story_time,story_url,story_text,story_author,comment_id,comment_text,comment_author,comment_ranking,author_comment_count,story_comment_count
407484,2682712,1308744269,http://markusjais.com/why-scala-seems-difficul...,,nuriaion,2683333,"Coincidentally, today is the first day I'm ser...",currywurst,6,52,12
538938,4247096,1342365640,http://steverandytantra.com/thoughts/three-mon...,Sublime Text 2 is a text editor with great res...,steverandy,4247290,having started my programming career with web ...,csulok,5,17,14
242880,7364034,1394246581,,,,7364114,Wow might need a filter there of some sort.,michaelrhansen,16,29,30
535233,8584637,1415638619,http://www.rubyraptor.org/how-we-made-raptor-u...,,triskweline,8588290,Does it seem weird to anyone else that they&#x...,bratsche,1,72,21
291186,7332402,1393829671,https://www.mtgox.com/?they_finally_provide_so...,,sillysaurus3,7332645,I&#x27;m troubled by the persistent societal t...,stevenh,3,29,16
768346,4216832,1341812011,,I'm about to start on a cross-platform desktop...,metaxyy,4216932,Qt is probably pretty hard to beat in maturity...,randomdata,15,28,27
692206,5015119,1357442477,http://www.jsonline.com/features/health/online...,,absconditus,5015371,I agree that it depends on the community. It's...,kafkaesque,1,49,13
603191,7107626,1390479282,,I am in Italy and I need help. I hope there is...,watermel0n,7109217,Can&#x27;t you go almost anywhere else in the EU?,debacle,58,1037,58
1072635,2255331,1298488549,http://blog.amirkhella.com/2011/02/23/what-i-w...,,amirkhella,2256124,He's right. The most important thing we tell ...,pg,0,3368,27
9387,1988926,1291925503,http://nathanmarz.com/blog/how-to-reject-a-job...,,nathanmarz,1990622,I think the most frequent complaint of mine is...,desigooner,5,149,22


In [None]:
# Defining test_comment: one raw comment used to walk through each cleaning step
test_comment = df['comment_text'][11]
print(test_comment)
print()

# Replace HTML tags with a space (not an empty string) so the words on either
# side of a tag such as <p> are not glued together
result_1 = re.sub(r'<.*?>', ' ', test_comment)

# Remove URLs. Raw strings keep the \( and \) escapes intact for the regex
# engine instead of relying on Python passing unknown escapes through.
result_2 = re.sub(r'http[s]?(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                  r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', result_1)

# Remove HTML character entities (&#39; &#x27; &amp; ...). The pattern only
# matches well-formed entities, so a plain '&' followed later by a ';' does
# not swallow the text in between.
result_3 = re.sub(
    r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);', '', result_2)

# Remove non-ASCII characters
result_4 = ''.join(char for char in result_3 if ord(char) < 128)

# Replace every remaining non-alphanumeric character with a space
result_5 = re.sub('[^a-zA-Z0-9]', ' ', result_4)

result_5


sharing.io<p>invoices.io<p>keys.io<p>fubar.io<p>tele.io<p>I've mentioned some of these before, but then decided I didn't want to part with some of them for "maybe one day projects."  Not anymore; I'm just ready to be rid of them honestly.  I'll take $100 for any of them (buy them all for $400 ;)).  That won't really even cover the reg fees for as long as I've had them, but it'll recover a bit of the investment at least.<p>Just e-mail me (HN username at GMail) and we'll do it. :)



'sharing ioinvoices iokeys iofubar iotele ioI ve mentioned some of these before  but then decided I didn t want to part with some of them for  maybe one day projects    Not anymore  I m just ready to be rid of them honestly   I ll take  100 for any of them  buy them all for  400       That won t really even cover the reg fees for as long as I ve had them  but it ll recover a bit of the investment at least Just e mail me  HN username at GMail  and we ll do it    '

In [None]:
# Data wrangling
# Patterns are compiled once, at definition time, instead of on every comment.
_HTML_TAG_RE = re.compile(r'<.*?>')
# The '$-_' range inside the character class spans '$' through '_' and so
# already covers ':' and '/', which is why no explicit '://' appears.
_URL_RE = re.compile(
    r'http[s]?(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# Only well-formed entities (&#39; &#x27; &amp;) match, so a literal '&'
# followed later by a ';' does not delete the text in between.
_HTML_ENTITY_RE = re.compile(
    r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);')
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9 ]')


def _clean_comment(comment):
    """Return a single comment string reduced to ASCII letters, digits, spaces."""
    # Tags become a space rather than nothing: '<p>' separates paragraphs, and
    # dropping it outright would fuse the last and first words around it.
    comment = _HTML_TAG_RE.sub(' ', comment)
    # URLs go before entities so entity-encoded URL characters are removed
    # together with the URL they belong to.
    comment = _URL_RE.sub('', comment)
    comment = _HTML_ENTITY_RE.sub('', comment)
    # Drop every non-ASCII character.
    comment = comment.encode('ascii', errors='ignore').decode('ascii')
    # Anything left that is not a letter, digit or space (punctuation,
    # newlines, tabs) becomes a space.
    return _NON_ALNUM_RE.sub(' ', comment)


def wrangle(df, subset=200000, col='comment_text', random_state=99):
    """
    Return a random subset of ``df`` with the text in ``col`` cleaned.

    Cleaning replaces HTML tags with a space, removes URLs and HTML
    character entities, drops non-ASCII characters, and replaces every
    remaining non-alphanumeric character (punctuation included, whitespace
    kept) with a space. Missing values in ``col`` are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    subset : int, default 200000
        Number of rows to sample. If the dataframe has fewer rows, all of
        them are returned (in shuffled order) instead of raising.
    col : str, default 'comment_text'
        Name of the text column to clean.
    random_state : int, default 99
        Seed passed to ``DataFrame.sample`` so the subset is reproducible.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, original index labels preserved, with ``col``
        replaced by its cleaned version.
    """
    # sample() builds a new dataframe, so the caller's data stays untouched.
    # Capping the size keeps the default usable on small dataframes.
    sample_size = min(subset, len(df))
    cleaned = df.sample(n=sample_size, random_state=random_state)

    # One pass over the column; na_action='ignore' skips missing values
    # instead of handing them to the regex functions.
    cleaned[col] = cleaned[col].map(_clean_comment, na_action='ignore')

    return cleaned


In [None]:
# Cleaning a random subset of the raw data (wrangle's defaults), then
# previewing 10 randomly chosen cleaned comments
df_cleaned = wrangle(df)
df_cleaned['comment_text'].sample(10)


1031141    Cucumbertown does this very well  They allow y...
54769      Over 50  of this article is a job posting    I...
359002     Google is located in an area where there aren ...
1070188    I really wish the article explicitly explained...
970692     I was having severe pain with this last month ...
229694      On this day  I highlighted her workstation an...
728108     We still have 2 client manager and 1 sales pos...
1110051    There are several reasons why I dislike sports...
391295     For those as confused as I was  it turns out y...
542387             They should hire him in the Android team 
Name: comment_text, dtype: object

In [None]:
# Saving the cleaned data as CSV and, when running on Colab, downloading it
output_csv = 'cleaned_comments_2.csv'

# Write the file first so it exists even where the Colab download helper is
# unavailable (the import used to run first and abort the cell outside Colab).
df_cleaned.to_csv(output_csv, index=False)

try:
    # google.colab is only importable inside a Colab runtime
    from google.colab import files
except ImportError:
    print(f'Saved {output_csv}; browser download skipped (not running on Colab).')
else:
    files.download(output_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>