# Investigate tweets for question


In [1]:
# import core libraries
import csv
import datetime
import itertools
import os
import pathlib
import re
from collections import Counter

# import third-party libraries
import pandas

# nltk library
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
# set directory path data
# The SYRIA_DATA_DIR environment variable overrides the location so the
# notebook can run on another machine; when it is unset the original local
# path is used, so existing behaviour is unchanged.
syria_data_dir = pathlib.Path(
    os.environ.get('SYRIA_DATA_DIR', '/Users/adamstueckrath/Desktop/syria_data/')
)

# tweets_no_rts_csv file path
tweets_no_rts_csv = syria_data_dir / 'tweets_no_retweets' / 'tweets_no_retweets.csv'


In [3]:
def string_to_datetime(tweet_date):
    """
    Parse a tweet timestamp string into a naive Python datetime.

    Example:
    '2017-07-06T18:34:37.000Z' -> datetime.datetime(2017, 7, 6, 18, 34, 37)

    The trailing 'Z' is matched as a literal character, so the result
    carries no timezone. A string that does not follow this layout makes
    strptime raise ValueError.
    """
    timestamp_layout = "%Y-%m-%dT%H:%M:%S.%fZ"
    parsed = datetime.datetime.strptime(tweet_date, timestamp_layout)
    return parsed


In [4]:
# load tweets into dataframe from csv file
tweets_no_rts_df = pandas.read_csv(tweets_no_rts_csv, header=0)

# Parse the timestamp column with one vectorized call instead of passing
# `date_parser=` to read_csv: that keyword is deprecated in pandas 2.x and,
# with a scalar parser, ends up calling the Python function row by row.
# The format is the same one string_to_datetime parses, so the column still
# holds naive datetimes.
tweets_no_rts_df['tweet_created_at'] = pandas.to_datetime(
    tweets_no_rts_df['tweet_created_at'],
    format="%Y-%m-%dT%H:%M:%S.%fZ",
)


In [5]:
# show the dataframe dimensions as (rows, columns)
tweets_no_rts_df.shape


(1160088, 29)

In [6]:
# set event date (hard-coded here; per the original note it is taken from
# the events dataset)
event_timestamp = pandas.to_datetime('2017-08-04')
event_date = event_timestamp.date()
print(event_date)


2017-08-04


In [7]:
# filter function for finding tweets during event date
def event_date_filter(tweet_date, target_date=None):
    """
    Tell whether a timestamp falls on the event's calendar day.

    tweet_date: any object exposing .date(), e.g. datetime.datetime or
    pandas.Timestamp.
    target_date: optional datetime.date to compare against. When omitted,
    the notebook-level `event_date` is used, which is the original
    behaviour.

    Returns True when tweet_date.date() equals the target day, else False.
    """
    if target_date is None:
        # resolved at call time, so re-running the event_date cell is honoured
        target_date = event_date
    return bool(tweet_date.date() == target_date)

# Vectorized form of applying event_date_filter to every row: truncate each
# timestamp to midnight and compare it with the event day in a single pass.
# NOTE(review): relies on tweet_created_at being a datetime64 column, which
# the date parsing in the load cell is expected to produce.
tweets_no_rts_df['tweet_event_date'] = (
    tweets_no_rts_df["tweet_created_at"].dt.normalize() == pandas.Timestamp(event_date)
)


In [8]:
# count tweets on the event date (True) versus every other day (False)
tweets_no_rts_df['tweet_event_date'].value_counts()


False    1134431
True       25657
Name: tweet_event_date, dtype: int64

In [9]:
# keep only the tweets posted on the event date
# .eq(True) has the same semantics as the `== True` comparison it replaces.
# .copy() detaches the result from the unfiltered frame: later cells add
# columns to it, and writing to an explicit copy gives pandas no reason to
# emit SettingWithCopyWarning.
is_event_day = tweets_no_rts_df['tweet_event_date'].eq(True)
tweets_no_rts_df = tweets_no_rts_df[is_event_day].copy()


## Finding tweets that match an event

In [10]:
# set event string
# (event description used as the matching reference; written as adjacent
# literals only to keep the lines short, the resulting value is unchanged)
event_test = (
    'Violent clashes took place in the village of Ghanim al-Ali in Ar-Raqqa '
    'countryside between the Syrian army and its allies on one side and the '
    'Islamic State on other, the clashes were accompanied with airstrikes on '
    'the area of conflict. Pro-Syrian regime forces fully controlled the '
    'village. No fatalities reported.'
)


In [11]:
# set stop words and word net
# NOTE(review): presumably requires the NLTK 'stopwords' and 'wordnet' corpora
# to be downloaded beforehand; confirm for a fresh environment.
# English stop words held in a set, so the `not in stop_words` test in
# normalizer() is a hash lookup.
stop_words = set(stopwords.words('english'))
# one lemmatizer instance, reused by normalizer() for every token
wordnet_lemmatizer = WordNetLemmatizer()

def normalizer(tweet):
    """
    Turn raw text into a list of lower-cased, lemmatized content words.

    Steps: replace every character that is not an ASCII letter with a
    space, tokenize with NLTK, lower-case the tokens, drop English stop
    words, then lemmatize what is left.

    tweet: the text to normalize. Anything that is not a str (for example
    the NaN pandas uses for a missing text cell, or None) yields an empty
    list instead of raising inside re.sub.

    Returns a list of str; token order and duplicates are preserved.
    """
    if not isinstance(tweet, str):
        # missing text has no words to match on
        return []
    only_letters = re.sub("[^a-zA-Z]", " ", tweet)
    lower_case = (token.lower() for token in nltk.word_tokenize(only_letters))
    return [
        wordnet_lemmatizer.lemmatize(token)
        for token in lower_case
        if token not in stop_words
    ]


In [12]:
# normalize the event description with normalizer() (letters only,
# lower-cased, stop words removed, lemmatized) into the word list that
# tweets are matched against
event_list = normalizer(event_test)
print(event_list)


['violent', 'clash', 'took', 'place', 'village', 'ghanim', 'al', 'ali', 'ar', 'raqqa', 'countryside', 'syrian', 'army', 'ally', 'one', 'side', 'islamic', 'state', 'clash', 'accompanied', 'airstrikes', 'area', 'conflict', 'pro', 'syrian', 'regime', 'force', 'fully', 'controlled', 'village', 'fatality', 'reported']


In [13]:
# add a column holding each tweet's text normalized into a word list
tweets_no_rts_df['tweet_text_normalize'] = tweets_no_rts_df["tweet_text"].apply(normalizer)


In [14]:
# check if tweet matches test event
def is_words_match(tweet, event_words=None):
    """
    Return one boolean per token of `tweet`, True where the token is an
    event word.

    tweet: iterable of normalized tokens.
    event_words: optional collection of words to match against. When
    omitted, the notebook-level `event_list` is used, which is the
    original behaviour.
    """
    if event_words is None:
        event_words = event_list
    return [token in event_words for token in tweet]

def check(tweet, event_words=None):
    """
    Return True when at least one token of `tweet` is an event word.

    An empty token list gives False. `event_words` has the same meaning
    and default as in is_words_match.
    """
    return any(is_words_match(tweet, event_words))


In [15]:
# flag tweets whose normalized words include at least one word from the event description
tweets_no_rts_df['tweet_event_check'] = tweets_no_rts_df["tweet_text_normalize"].apply(check)


In [16]:
# keep only the tweets flagged by the event check
matches_event = tweets_no_rts_df['tweet_event_check'].eq(True)
tweets_no_rts_df = tweets_no_rts_df[matches_event]


In [18]:
# show the dimensions (rows, columns) of the filtered dataframe
tweets_no_rts_df.shape

(3909, 32)

In [19]:
# write dataframe to csv
# Build the destination from syria_data_dir instead of repeating the absolute
# path, so the input and output locations are configured in one place.
question_test_csv = syria_data_dir / 'model' / 'question_test.csv'
tweets_no_rts_df.to_csv(question_test_csv)
