In [9]:
# import core libraries 
import ast
import csv
import datetime
import itertools
import json
import os
import pathlib
import pprint
import re
from collections import Counter
from itertools import islice

# import third-party libraries
import numpy as np
import pandas as pd
from dateutil import parser

# import visualizations
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# set directory path data
# The location can be overridden with the SYRIA_DATA_DIR environment variable
# so the notebook is not tied to one machine; when the variable is not set,
# the original local path is used unchanged.
syria_data_dir = pathlib.Path(
    os.environ.get('SYRIA_DATA_DIR', '/Users/adamstueckrath/Desktop/syria_data/')
)

# syria_events_csv file path (raw events export, inside the data directory)
syria_events_csv = syria_data_dir / 'events' / 'syria_event_2017_present.csv'


In [11]:
def string_to_datetime(event_date):
    """
    Convert an event date into a Python ``datetime.date``.

    Turns a datetime string like this:
    '4-Aug-17'
    into a Python date object like this -> 2017-08-04

    Values that are already ``datetime.datetime`` or ``datetime.date``
    objects are accepted too, so applying this function a second time to an
    already converted column (e.g. when a cell is re-run) does not raise.

    Parameters
    ----------
    event_date : str, datetime.datetime or datetime.date
        The value to convert. Strings are parsed with ``dateutil``.

    Returns
    -------
    datetime.date
    """
    # datetime has to be tested first because it is a subclass of date
    if isinstance(event_date, datetime.datetime):
        return event_date.date()
    if isinstance(event_date, datetime.date):
        return event_date
    return parser.parse(event_date).date()

In [12]:
# columns of the events csv that are kept for pre-processing
event_columns = [
    'event_id_no_cnty', 'event_date', 'event_type',
    'location', 'latitude', 'longitude',
    'actor1', 'assoc_actor_1',
    'actor2', 'assoc_actor_2', 'notes',
]

# load events into dataframe from csv file (the first row holds the header)
syria_events_df = pd.read_csv(syria_events_csv, usecols=event_columns, header=0)


In [13]:
# transform date column: parse every raw date value with string_to_datetime
# and write the result back over the original column
raw_event_dates = syria_events_df['event_date']
syria_events_df['event_date'] = raw_event_dates.apply(string_to_datetime)


In [14]:
# rename columns: original csv column name -> name used in the rest of the notebook
column_renames = {
    'event_id_no_cnty': 'event_id',
    'actor1': 'actor_1',
    'actor2': 'actor_2',
    'notes': 'event_text',
}
syria_events_df = syria_events_df.rename(columns=column_renames)


In [15]:
# Spans that clean_text replaces with a space: @mentions, any single character
# that is not an ASCII letter, digit, space or tab, and URLs (scheme://...).
# Written as a raw string so the regex escapes are not read as (invalid)
# Python string escapes; compiled once because the function runs per row.
_CLEAN_TEXT_PATTERN = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)")


def clean_text(text):
    r'''
    Utility function to clean the text by lower-casing it and removing
    links, @mentions and special characters using regex.
    use this for removing digits -> return re.sub(r'\d+', '', input_text)

    Every removed span is replaced by a space and runs of whitespace are
    collapsed, so the result is a single-space separated string.
    Missing values (None or a float NaN, which is how pandas represents an
    empty csv cell) are returned as an empty string instead of raising.
    '''
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = text.lower()
    return ' '.join(_CLEAN_TEXT_PATTERN.sub(" ", text).split())


In [16]:
# clean every raw event description and store the result in a new column
raw_event_text = syria_events_df['event_text']
syria_events_df['event_text_clean'] = raw_event_text.apply(clean_text)


## Tokenization

In [17]:
from nltk.tokenize import word_tokenize

def tokenize_text(text):
    """
    Split a piece of text into word tokens with NLTK's ``word_tokenize``
    and return the tokens.
    """
    tokens = word_tokenize(text)
    return tokens


In [18]:
# tokenize every cleaned event description and store the tokens in a new column
cleaned_event_text = syria_events_df['event_text_clean']
syria_events_df['event_text_tokenize'] = cleaned_event_text.apply(tokenize_text)


## Lemmatization and removing stop words

In [19]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

def normalizer(text):
    """
    Normalize a sequence of tokens.

    Tokens found in the English stop word set are dropped; every remaining
    token is passed through the WordNet lemmatizer (called without a
    part-of-speech argument). The lemmas are returned as a list, in the
    original token order.
    """
    return [
        wordnet_lemmatizer.lemmatize(token)
        for token in text
        if token not in stop_words
    ]


In [20]:
# drop stop words and lemmatize each token list, storing the result in a new column
tokenized_event_text = syria_events_df['event_text_tokenize']
syria_events_df['event_text_normalize'] = tokenized_event_text.apply(normalizer)


In [21]:
# preview the first five rows, including the derived text columns
syria_events_df.head(n=5)


Unnamed: 0,event_id,event_date,event_type,actor_1,assoc_actor_1,actor_2,assoc_actor_2,location,latitude,longitude,event_text,event_text_clean,event_text_tokenize,event_text_normalize
0,10317,2017-08-04,Remote violence,Unidentified Military Forces,,,,Thiban,35.0114,40.5047,Unknown warplanes targeted the village of Thib...,unknown warplanes targeted the village of thib...,"[unknown, warplanes, targeted, the, village, o...","[unknown, warplane, targeted, village, thiban,..."
1,10300,2017-08-04,Battle-No change of territory,AAS: Ahrar al Sham,,Opposition Rebels (Syria),Jund al Aqsa,Maar Shamarin,35.6159,36.7188,Clashes between Ahrar al-Sham militia and mili...,clashes between ahrar al sham militia and mili...,"[clashes, between, ahrar, al, sham, militia, a...","[clash, ahrar, al, sham, militia, militant, pr..."
2,10283,2017-08-04,Remote violence,Islamist Rebels (Syria),,Military Forces of Syria (2000-),,Bashkwi,36.3278,37.1203,The Islamic rebel troops targeted Syrian army ...,the islamic rebel troops targeted syrian army ...,"[the, islamic, rebel, troops, targeted, syrian...","[islamic, rebel, troop, targeted, syrian, army..."
3,10318,2017-08-04,Remote violence,Military Forces of Syria (2000-),,,,Um Hartein,35.3872,36.8608,"The Syrian army shelled the villages of Murak,...",the syrian army shelled the villages of murak ...,"[the, syrian, army, shelled, the, villages, of...","[syrian, army, shelled, village, murak, lahaya..."
4,10319,2017-08-04,Remote violence,Unidentified Armed Group (Syria),,HTS: Hayat Tahrir al Sham,Civilians (Syria),Urum al-Kubra,36.1482,36.9478,Two HTS members and 2 civilians were killed in...,two hts members and 2 civilians were killed in...,"[two, hts, members, and, 2, civilians, were, k...","[two, hts, member, 2, civilian, killed, car, e..."


In [22]:
# spot-check one event (row position 7): the raw description
syria_events_df['event_text'].iloc[7]


'The Syrian army shelled Talbiseh village in the Northern countryside of Homs which led to the killing ofing one militant from an Islamic rebel troop.'

In [23]:
# same event (row position 7): tokens after stop word removal and lemmatization
syria_events_df['event_text_normalize'].iloc[7]

['syrian',
 'army',
 'shelled',
 'talbiseh',
 'village',
 'northern',
 'countryside',
 'homs',
 'led',
 'killing',
 'ofing',
 'one',
 'militant',
 'islamic',
 'rebel',
 'troop']

In [24]:
# same event (row position 7): the description after clean_text
syria_events_df['event_text_clean'].iloc[7]

'the syrian army shelled talbiseh village in the northern countryside of homs which led to the killing ofing one militant from an islamic rebel troop'

## Save data for training and testing 

In [25]:
# pre-processed events csv file path (output of this notebook)
events_pre_processed_csv = syria_data_dir / 'model' / 'model_data' /'events_pre_processed.csv'

# make sure the output directory exists; to_csv does not create missing directories
events_pre_processed_csv.parent.mkdir(parents=True, exist_ok=True)

# write events to csv, without the dataframe index
syria_events_df.to_csv(events_pre_processed_csv, index=False)
