## Data Preprocessing

In [1]:
# import core libraries
import ast
import csv
import datetime
import itertools
import json
import os
import pathlib
import pprint
import re
from collections import Counter
from itertools import islice

# import third-party libraries
import numpy as np
import pandas as pd

# import visualizations
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# set directory path data
# The location can be overridden by exporting SYRIA_DATA_DIR before starting
# the kernel, so the notebook is not tied to a single machine's home folder.
# When the variable is unset, the original local path is used unchanged.
syria_data_dir = pathlib.Path(
    os.environ.get('SYRIA_DATA_DIR', '/Users/adamstueckrath/Desktop/syria_data/')
)

# tweets_no_rts_csv file path
tweets_no_rts_csv = syria_data_dir / 'tweets_no_retweets' / 'tweets_no_retweets.csv'


## Tweet Data Processing

In [3]:
def string_to_datetime(tweet_date):
    """
    Parse a timestamp string into a naive Python datetime object.

    Example:
        '2017-07-06T18:34:37.000Z' -> datetime.datetime(2017, 7, 6, 18, 34, 37)

    The trailing 'Z' is matched as a literal character, so the returned
    object carries no timezone information. strptime raises ValueError
    when the string does not follow this layout.
    """
    timestamp_layout = "%Y-%m-%dT%H:%M:%S.%fZ"
    parsed = datetime.datetime.strptime(tweet_date, timestamp_layout)
    return parsed


In [4]:
# load tweets into dataframe from csv file
# Only the columns needed downstream are read; the timestamp column is left
# as text here and converted in the next statement.
tweets_no_rts_df = pd.read_csv(tweets_no_rts_csv, header=0,
                               usecols=['tweet_id', 'tweet_id_str',
                                        'tweet_lang', 'tweet_created_at',
                                        'user_name', 'user_id_str', 'tweet_text'])

# Convert 'tweet_created_at' in a single vectorized pass with an explicit
# format (e.g. '2017-07-06T18:34:37.000Z'). This replaces the read_csv
# `date_parser=` argument, which is deprecated in pandas 2.0+ and ends up
# calling a Python-level strptime once per row. The 'Z' in the format is a
# literal, so the resulting column is timezone-naive datetime64.
# NOTE(review): missing timestamps become NaT here rather than raising.
tweets_no_rts_df['tweet_created_at'] = pd.to_datetime(
    tweets_no_rts_df['tweet_created_at'],
    format="%Y-%m-%dT%H:%M:%S.%fZ",
)


### Filter tweets for English only

In [5]:
# (rows, columns) of the frame as loaded, before any language filtering
tweets_no_rts_df.shape

(1160088, 7)

In [6]:
# keep only the rows whose 'tweet_lang' is 'en', then renumber the index
# from 0 so positional and label-based lookups agree again
tweets_no_rts_df = (
    tweets_no_rts_df
    .loc[lambda frame: frame['tweet_lang'] == 'en']
    .reset_index(drop=True)
)


In [7]:
# (rows, columns) of the frame after keeping only tweet_lang == 'en'
tweets_no_rts_df.shape

(638161, 7)

### Clean tweet text

In [8]:
def clean_text(text):
    r'''
    Utility function to clean the text in a tweet by removing
    @mentions, links and special characters using regex.

    The text is lower-cased first. Every match of the pattern is replaced
    by a space and runs of whitespace are then collapsed, so the result is
    a single-space-separated string of ASCII letters and digits.

    Parameters:
        text: the raw tweet text. Anything that is not a str (for example
            the float NaN pandas uses for a missing value) is treated as
            empty instead of raising on ``.lower()``.

    Returns:
        The cleaned string, or '' for non-string input.

    use this for removing digits -> return re.sub(r'\d+', '', input_text)
    '''
    if not isinstance(text, str):
        return ''

    # Alternatives, tried in this order at each position:
    #   1. @mentions - '_' is included so '@john_doe' is removed as a whole
    #      rather than leaving 'doe' behind
    #   2. any single character that is not an ASCII letter, digit, space or
    #      tab (punctuation, newlines, emoji, non-ASCII letters)
    #   3. scheme://... links
    # Raw string: '\w' and '\S' are regex classes, not string escapes.
    pattern = r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)"
    return ' '.join(re.sub(pattern, " ", text.lower()).split())


In [9]:
# store a cleaned version of each tweet in a new column; the original
# 'tweet_text' column is left as it is
tweets_no_rts_df['tweet_text_clean'] = (
    tweets_no_rts_df['tweet_text'].apply(clean_text)
)


## Tokenization

In [10]:
from nltk.tokenize import word_tokenize

def tokenize_text(text):
    """
    Tokenize a piece of text.

    Thin wrapper that hands `text` to NLTK's word_tokenize and returns
    whatever that call produces.
    """
    tokens = word_tokenize(text)
    return tokens


In [11]:
# tokenize the cleaned text of every tweet and keep the result alongside it
tweets_no_rts_df['tweet_text_tokenize'] = (
    tweets_no_rts_df['tweet_text_clean'].apply(tokenize_text)
)
