# Clean Trump's tweets. 

Tall task, I know.

We'll remove retweets, URLs, @mentions, hashtags, special characters, and extra whitespace (but let the record show: he's good at generating whitespace); we'll also replace "&" with "and", and "w/" with "with".

#### But first:
Grab the emoji library:
```
pip install emoji
```

In [1]:
import calendar
from datetime import datetime
import json
import re
import emoji

In [2]:
# Path of the JSON file that the cleaned tweets are written to.
out_file_path = 'cleaned_tweets.json'

In [6]:
# Month abbreviation -> month number ('Jan' -> 1 ... 'Dec' -> 12).
# calendar.month_abbr starts with an empty string, so '' maps to 0.
month_to_int = {
    abbr: number for number, abbr in enumerate(calendar.month_abbr)
}

def remove_special_chars(s):
    """
    Strip everything except ASCII letters, digits, spaces and periods.

    Emoji, punctuation and every other character outside that set are
    dropped. Remaining runs of whitespace (including newlines) are then
    collapsed to single spaces and leading/trailing whitespace is
    trimmed.
    """
    # The character class keeps only ASCII alphanumerics, space, newline
    # and '.', so it already removes every emoji on its own. A separate
    # lookup in emoji.UNICODE_EMOJI is therefore unnecessary, and that
    # attribute is not available in current releases of the emoji
    # package.
    remove_nonalphanum = re.sub(r'[^a-zA-Z0-9 \n.]', '', s)
    remove_extra_whitespace = " ".join(remove_nonalphanum.split())
    return remove_extra_whitespace

def clean_tweet(tweet_text):
    """
    Clean a tweet's text.

    Remove links, @-mentions, hashtags and quote characters; replace
    the HTML entity "&amp;" with "and" and the abbreviation "w/" with
    "with". The remaining special characters and extra whitespace are
    then stripped via remove_special_chars, and a trailing period is
    appended if the text does not already end with one.

    Note that a tweet with nothing left after cleaning comes back as
    just ".".
    """
    no_links = re.sub(r'http\S+', '', tweet_text)
    no_tags = re.sub(r'@\S+', '', no_links)
    no_hashtags = re.sub(r'#\S+', '', no_tags)
    replace_ampersand = re.sub(r'&amp;', 'and', no_hashtags)
    # Expand only a standalone "w/": the word boundary keeps text such as
    # "now/then" intact, the lookahead leaves "w/o" (without) alone, and
    # the trailing space stops "w/the" from becoming "withthe". Any extra
    # whitespace is collapsed by remove_special_chars below.
    replace_with = re.sub(r'\bw/(?!o\b)', 'with ', replace_ampersand)

    # Continue from replace_with so the "w/" expansion is not discarded.
    replace_dquotes = replace_with.replace('"', '')
    replace_quotes = replace_dquotes.replace("'", '')

    cleaned_text = remove_special_chars(replace_quotes)

    if not cleaned_text.endswith('.'):
        cleaned_text += '.'

    return cleaned_text

def longdate_to_timestamp(longdate):
    """
    Convert a long-form date string to a POSIX timestamp (float).

    The string is split on whitespace and read positionally: month
    abbreviation at index 1, day of month at index 2, "HH:MM:SS" at
    index 3 and the year as the last token, e.g.
    "Wed Oct 10 20:19:24 +0000 2018".

    If the token after the time is a UTC offset such as "+0000" or
    "-0500", it is applied so the result does not depend on the
    timezone of the machine running the code. Without an offset the
    date is interpreted as local time.

    Raises ValueError for an unknown month abbreviation or
    non-numeric fields, and IndexError if tokens are missing.
    """
    split_date = longdate.split()
    split_time = split_date[3].split(':')
    hours = int(split_time[0])
    mins = int(split_time[1])
    secs = int(split_time[2])
    month = datetime.strptime(split_date[1], '%b').month

    # Only treat index 4 as an offset when it is a separate token in
    # front of the year and actually looks like +HHMM / -HHMM.
    tzinfo = None
    if len(split_date) > 5 and re.fullmatch(r'[+-]\d{4}', split_date[4]):
        tzinfo = datetime.strptime(split_date[4], '%z').tzinfo

    dt = datetime(int(split_date[-1]), month, int(split_date[2]),
                  hours, mins, secs, tzinfo=tzinfo)
    return float(dt.timestamp())

# Accumulates cleaned tweet text keyed by the tweet's timestamp.
cleaned_dictionary = {}

In [7]:
# Load the raw tweet archive. The context manager closes the file
# handle, and the explicit encoding keeps non-ASCII characters (emoji
# etc.) readable regardless of the platform's default encoding.
with open('trump_tweets.txt', encoding='utf-8') as tweets_file:
    tweets = json.load(tweets_file)

# Keep only the tweets that are not flagged as retweets.
not_retweets = [tweet for tweet in tweets if not tweet['is_retweet']]

In [8]:
# Clean every non-retweet and store the result under its timestamp.
# Tweets sharing the same timestamp overwrite one another, because the
# timestamp is the dictionary key.
for tweet in not_retweets:
    cleaned_tweet = clean_tweet(tweet['text'])
    ts = longdate_to_timestamp(tweet['created_at'])

    # clean_tweet always appends a period, so a length of 1 means
    # nothing was left of the tweet after cleaning; skip those.
    if len(cleaned_tweet) > 1:
        cleaned_dictionary[ts] = cleaned_tweet

# Write the result with a context manager so the file is flushed and
# closed deterministically.
with open(out_file_path, 'w') as out_file:
    json.dump(cleaned_dictionary, out_file, indent=4)