# Data Cleaning

In [2]:
import pandas as pd

In [4]:
def read_in_data(topic, type_of_post):
    """
    Return a dataframe of a specific topic and type of post with columns "topic" and "text".

    Parameters
    ----------
    topic : str
        File-name prefix; the file read is '../datasets/<topic>_<type_of_post>.csv'.
    type_of_post : str
        'submissions' (text is taken from the 'title' column) or
        'comments' (text is taken from the 'body' column).

    Returns
    -------
    pandas.DataFrame
        Only the 'subreddit' column (renamed to 'topic') and the title/body
        column (renamed to 'text') are loaded from the CSV.

    Raises
    ------
    ValueError
        If type_of_post is neither 'submissions' nor 'comments'.
    """
    # The two post types differ only in which CSV column holds the text.
    if type_of_post == 'submissions':
        text_column = 'title'
    elif type_of_post == 'comments':
        text_column = 'body'
    else:
        # Fail loudly: silently returning None only breaks later, far from the cause.
        raise ValueError(
            "type_of_post must be 'submissions' or 'comments', got " + repr(type_of_post)
        )

    csv_path = '../datasets/' + topic + '_' + type_of_post + '.csv'
    df = pd.read_csv(csv_path, usecols=['subreddit', text_column])
    return df.rename(columns={'subreddit': 'topic', text_column: 'text'})

In [5]:
# Load submissions and comments for each subreddit (hiking first, then gardening)

hiking_sub, hiking_com = (
    read_in_data('hiking', post_type) for post_type in ('submissions', 'comments')
)
gardening_sub, gardening_com = (
    read_in_data('gardening', post_type) for post_type in ('submissions', 'comments')
)

In [6]:
# Check my dataframes

hiking_sub.head()

Unnamed: 0,topic,text
0,hiking,Boulder Flatiron Loop Hike
1,hiking,Washington state lakes to swim
2,hiking,"Here's a fun episode, demonstrating why a fly-..."
3,hiking,Picture I took of my friend at Angel’s Landing...
4,hiking,Hiking to Bertha Peak via Cougar Crest Trail a...


In [7]:
hiking_com.head()

Unnamed: 0,text,topic
0,Dang I tried a month ago the snow was super de...,hiking
1,\nI see you've posted an image. Thanks for yo...,hiking
2,I can't tell what is more impressive: the wild...,hiking
3,This isn't part of the Smokey Mountains. This...,hiking
4,I was just thinking about this the other day b...,hiking


In [8]:
gardening_sub.head()

Unnamed: 0,topic,text
0,gardening,What’s this in the middle of my bell pepper pl...
1,gardening,This weird lemon
2,gardening,My indoor zinnia bloomed!
3,gardening,Fungus? Dangerous?
4,gardening,Raise your hand if you're a chaotic gardener a...


In [9]:
gardening_com.head()

Unnamed: 0,text,topic
0,So cool!,gardening
1,"I have a few more containers, but this is the ...",gardening
2,Maybe needs repotting in new soil.,gardening
3,Oh no. You or your neighbor need to sever the ...,gardening
4,Gorgeous collection,gardening


In [10]:
# Stack the four frames (submissions first, then comments) under a fresh 0..n-1 index

postings = pd.concat(
    objs=[hiking_sub, gardening_sub, hiking_com, gardening_com],
    ignore_index=True,
)

In [11]:
postings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12500 entries, 0 to 12499
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   topic   12500 non-null  object
 1   text    12500 non-null  object
dtypes: object(2)
memory usage: 195.4+ KB


In [12]:
postings

Unnamed: 0,topic,text
0,hiking,Boulder Flatiron Loop Hike
1,hiking,Washington state lakes to swim
2,hiking,"Here's a fun episode, demonstrating why a fly-..."
3,hiking,Picture I took of my friend at Angel’s Landing...
4,hiking,Hiking to Bertha Peak via Cougar Crest Trail a...
...,...,...
12495,gardening,I'm transitioning to everbearing in strawberry...
12496,gardening,So pretty
12497,gardening,I think it’s 30% chance of frost after that date.
12498,gardening,My understanding is that the caterpillars eat ...


In [13]:
postings['topic'].value_counts()

hiking       7000
gardening    5500
Name: topic, dtype: int64

In [14]:
# Snapshot the combined, still-uncleaned postings to disk (row index not written)

postings.to_csv(path_or_buf='../datasets/postings.csv', index=False)

In [15]:
#Check for deleted posts

postings[postings['text'] == '[deleted]'].value_counts()

topic      text     
gardening  [deleted]    33
hiking     [deleted]    14
dtype: int64

In [16]:
# Drop, in place, the 47 rows whose text is exactly '[deleted]' (counted in the previous cell)

postings.drop(index=postings.index[postings['text'] == '[deleted]'], inplace=True)

In [17]:
postings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12453 entries, 0 to 12498
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   topic   12453 non-null  object
 1   text    12453 non-null  object
dtypes: object(2)
memory usage: 291.9+ KB


In [18]:
# Confirm that deleted posts were indeed dropped

postings[postings['text'] == '[deleted]'].value_counts()

Series([], dtype: int64)

In [19]:
postings['text'].value_counts()

Thank you!                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   19
r/hikersfriend                                                                                                                                                                                                                                                                                                                                                                                                                          

In [20]:
postings['topic'].value_counts()

hiking       6986
gardening    5467
Name: topic, dtype: int64

In [21]:
# Snapshot the postings after the '[deleted]' rows were removed (row index not written)

postings.to_csv(path_or_buf='../datasets/postings_minimally_cleaned.csv', index=False)

In [None]:
# Now into deep data cleaning...

In [22]:
# Keep only the first row for each distinct "text" value and renumber the index from 0
postings.drop_duplicates(subset=['text'], keep='first', inplace=True, ignore_index=True)

In [23]:
postings

Unnamed: 0,topic,text
0,hiking,Boulder Flatiron Loop Hike
1,hiking,Washington state lakes to swim
2,hiking,"Here's a fun episode, demonstrating why a fly-..."
3,hiking,Picture I took of my friend at Angel’s Landing...
4,hiking,Hiking to Bertha Peak via Cougar Crest Trail a...
...,...,...
11915,gardening,Nope haha Oklahoma so not too far away.
11916,gardening,I'm transitioning to everbearing in strawberry...
11917,gardening,So pretty
11918,gardening,I think it’s 30% chance of frost after that date.


In [24]:
#Remove URLs #inspired by https://stackoverflow.com/questions/51994254/removing-url-from-a-column-in-pandas-dataframe/51994366

# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave every URL in place.
# The pattern is a raw string so that \( and \) reach the regex engine without
# relying on Python's handling of invalid string escapes.
postings['text'] = postings['text'].str.replace(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    ' ',
    regex=True,
)

In [25]:
#Check to confirm removal

# regex=False: the URL is a literal to look for; as a regex its '.' characters
# would match any character.
postings['text'].str.contains('https://www.thespruce.com/free-seed-catalogs-1357756', regex=False).any()

False

In [26]:
#Remove other unnecessary character groupings:

# Each sequence is replaced by a single space, in this order. Order matters:
# longer sequences come before the shorter ones they contain
# (e.g. '&amp;' before '&amp').
unwanted_sequences = [
    '\n\n&amp;#x200B;\n\n',
    '\n',
    '&amp;',
    '&amp',
    '&gt;',
    '/r/',
    '#羊台叠翠',
]
for sequence in unwanted_sequences:
    # regex=False: these are literal strings, so do not depend on the
    # pandas-version-specific default for the regex flag.
    postings['text'] = postings['text'].str.replace(sequence, ' ', regex=False)

In [27]:
#Check to confirm removal

postings['text'].str.contains('#羊台叠翠').any()

False

In [28]:
#inspired by https://machinelearningmastery.com/clean-text-machine-learning-python/

In [29]:
from nltk.tokenize import word_tokenize

In [30]:
#Make text lowercase and remove punctuation marks

def lowercase_and_remove_punctuation(sentence):
    """
    Lowercase a string and keep only its purely alphabetic tokens.

    The lowercased text is split with nltk's word_tokenize; every token for
    which str.isalpha() is False is dropped. This removes punctuation tokens,
    and also any token containing digits or other non-letter characters.

    Parameters
    ----------
    sentence : str
        Text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; '' if none survive.
    """
    tokens = word_tokenize(sentence.lower())
    return ' '.join(token for token in tokens if token.isalpha())

In [31]:
# Clean every post's text element by element
postings['text'] = postings['text'].map(lowercase_and_remove_punctuation)

In [32]:
#Check for '' after cleaning

postings[postings['text']=='']

Unnamed: 0,topic,text
478,hiking,
583,hiking,
854,hiking,
957,hiking,
1011,hiking,
...,...,...
11396,gardening,
11436,gardening,
11837,gardening,
11847,gardening,


In [33]:
# Drop, in place, the rows listed above whose text became '' after cleaning

postings.drop(index=postings.index[postings['text'] == ''], inplace=True)

In [34]:
postings['topic'].value_counts()

hiking       6581
gardening    5276
Name: topic, dtype: int64

In [35]:
postings['topic'].value_counts().sum()

11857

In [135]:
# Write the fully cleaned postings to disk (row index not written)

postings.to_csv(path_or_buf='../datasets/postings_cleaned.csv', index=False)