In [1]:
# import core libraries 
import datetime
import json
import re
import csv
import ast
import pathlib
import itertools
from collections import Counter
from itertools import islice

# import third-party libraries
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from pandas import ExcelWriter


In [2]:
# set directory path data
twitter_data_dir = pathlib.Path('/Users/adamstueckrath/Desktop/syria_data/')

# tweets_no_rts_csv file path
tweets_no_rts_csv = twitter_data_dir / 'tweets_no_retweets' / 'tweets_no_retweets.csv'


In [3]:
def string_to_datetime(tweet_date):
    """
    Turns a datetime string like this: 
    '2017-07-06T18:34:37.000Z' 
    to a Python datetime object like this -> 2017-07-06 18:34:41
    """
    return datetime.datetime.strptime(tweet_date, "%Y-%m-%dT%H:%M:%S.%fZ")



In [4]:
# load tweets into dataframe from csv file
tweets_no_rts_df = pd.read_csv(tweets_no_rts_csv, header=0,
                               parse_dates=['tweet_created_at'], 
                               date_parser=string_to_datetime)


In [5]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
analyzer = SentimentIntensityAnalyzer()
def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing 
    links and special characters using regex.
    '''
    tweet = tweet.lower()
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def analyze_sentiment(tweet):
    '''
    Utility function to classify the polarity of a tweet
    using nltk. analysis variable returns the following dict: 
    {'neg': 0.122, 'neu': 0.641, 'pos': 0.237, 'compound': 0.4215}
    The compound value here conveys the overall positive or negative user experience.
    Examples: 
    https://www.programcreek.com/python/example/100005/nltk.sentiment.vader.SentimentIntensityAnalyzer
    https://opensourceforu.com/2016/12/analysing-sentiments-nltk/
    '''
    analysis = analyzer.polarity_scores(clean_tweet(tweet))
    if analysis['compound'] > 0.1:
        return 1
    elif analysis['compound'] == 0:
        return 0
    else:
        return -1
    



In [None]:
test = 'Assad loses 3 generals during the first week of July #Syria #RevolutionWins #militia #Free_Syrian_Army\nhttps://t.co/5x8UbVOnH9'

testing = clean_tweet(test)
print(testing)


In [6]:
tweets_no_rts_df_en = tweets_no_rts_df.copy()
tweets_no_rts_df_en = tweets_no_rts_df_en[tweets_no_rts_df_en['tweet_lang'] =='en']


In [8]:
tweets_no_rts_df_en['tweet_text_clean'] = tweets_no_rts_df_en['tweet_text'].apply(clean_tweet)
tweets_no_rts_df_en['tweet_text_sentiment'] = tweets_no_rts_df_en['tweet_text_clean'].apply(analyze_sentiment)


In [None]:
t = tweets_no_rts_df_en['tweet_text_sentiment'].value_counts().to_dict()
print(t)


In [9]:
tweet_text = tweets_no_rts_df_en['tweet_text_clean']
tweet_text_list = tweet_text.tolist()
print(len(tweet_text_list))


638161


## SVM Model

### What characterizes text of different sentiments?
While we still haven't decided what classification method to use, it's useful to get an idea of how the different texts look. This might be an "old school" approach in the age of deep learning, but lets indulge ourselves nevertheless.

To explore the data we apply some crude preprocessing. We will tokenize and lemmatize using Python NLTK, and transform to lower case. As words mostly matter in context we'll look at bi-grams instead of just individual tokens.

As a way to simplify later inspection of results we will store all processing of data together with it's original form. This means we will extend the Pandas dataframe into which we imported the raw data with new columns as we go along.

In [None]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

def normalizer(tweet):
    only_letters = re.sub("[^a-zA-Z]", " ", tweet) 
    tokens = nltk.word_tokenize(only_letters)
    lower_case = [l.lower() for l in tokens]
    filtered_result = list(filter(lambda l: l not in stop_words, lower_case))
    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]
    return lemmas


In [None]:
tweets_no_rts_df_en['tweet_text_normalized'] = tweets_no_rts_df_en['tweet_text_clean'].apply(normalizer)

In [None]:
from nltk import ngrams
def ngrams(input_list):
    #onegrams = input_list
    bigrams = [' '.join(t) for t in list(zip(input_list, input_list[1:]))]
    trigrams = [' '.join(t) for t in list(zip(input_list, input_list[1:], input_list[2:]))]
    return bigrams+trigrams


In [None]:
tweets_no_rts_df_en['tweet_text_grams'] = tweets_no_rts_df_en['tweet_text_normalized'].apply(ngrams)
tweets_no_rts_df_en[['tweet_text_grams']].head()


In [None]:
def count_words(input):
    cnt = Counter()
    for row in input:
        for word in row:
            cnt[word] += 1
    return cnt


In [None]:
tweets_no_rts_df_en[(tweets_no_rts_df_en.tweet_text_sentiment == 1)][['tweet_text_grams']].apply(count_words)['tweet_text_grams'].most_common(20)


In [None]:
tweets_no_rts_df_en[(tweets_no_rts_df_en.tweet_text_sentiment == -1)][['tweet_text_grams']].apply(count_words)['tweet_text_grams'].most_common(20)


### Linear SVM classifier
We will build a simple, linear Support-Vector-Machine (SVM) classifier. The classifier will take into account each unique word present in the sentence, as well as all consecutive words. To make this representation useful for our SVM classifier we transform each sentence into a vector. The vector is of the same length as our vocabulary, i.e. the list of all words observed in our training data, with each word representing an entry in the vector. If a particular word is present, that entry in the vector is 1, otherwise 0.

To create these vectors we use the CountVectorizer from sklearn.

In [None]:
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(ngram_range=(1,2))

In [None]:
test = tweets_no_rts_df_en.copy()[:50000]
test.shape

In [None]:
vectorized_data = count_vectorizer.fit_transform(test.tweet_text)
indexed_data = hstack((np.array(range(0,vectorized_data.shape[0]))[:,None], vectorized_data))

In [None]:
def sentiment2target(sentiment):
    return {
        -1: 0,
        0: 1,
        1: 2
    }[sentiment]
targets = test.tweet_text_sentiment.apply(sentiment2target)

In [None]:
from sklearn.model_selection import train_test_split
data_train, data_test, targets_train, targets_test = train_test_split(indexed_data, targets, test_size=0.4, random_state=0)
data_train_index = data_train[:,0]
data_train = data_train[:,1:]
data_test_index = data_test[:,0]
data_test = data_test[:,1:]


In [None]:
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
clf = OneVsRestClassifier(svm.SVC(gamma=0.01, C=100., probability=True, class_weight='balanced', kernel='linear'))
clf_output = clf.fit(data_train, targets_train)


In [None]:
clf.score(data_test, targets_test)


### Filtering tweet words (for feature vector)
Stop words - a, is, the, with etc. The full list of stop words can be found at Stop Word List. These words don't indicate any sentiment and can be removed.

Repeating letters - if you look at the tweets, sometimes people repeat letters to stress the emotion. E.g. hunggrryyy, huuuuuuungry for 'hungry'. We can look for 2 or more repetitive letters in words and replace them by 2 of the same.

Punctuation - we can remove punctuation such as comma, single/double quote, question marks at the start and end of each word. E.g. beautiful!!!!!! replaced with beautiful

Words must start with an alphabet - For simplicity sake, we can remove all those words which don't start with an alphabet. E.g. 15th, 5.34am


## Kmeans clustering

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score


In [15]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(tweet_text_list)

true_k = 25
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=25, n_init=1, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [16]:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print


Top terms per cluster:
Cluster 0:
 syria
 russia
 trump
 iran
 israel
 assad
 like
 just
 people
 good
Cluster 1:
 killed
 civilians
 fighting
 syria
 coalition
 isis
 600
 strikes
 iraq
 british
Cluster 2:
 asimah
 kuwait
 st
 city
 al
 damascus
 alen56
 bk8
 q8
 218
Cluster 3:
 isis
 syria
 fight
 lgbt
 fighting
 women
 unit
 burqas
 queer
 created
Cluster 4:
 al
 qaeda
 syria
 idlib
 sham
 cnn
 propagandist
 hired
 qaida
 documentary
Cluster 5:
 usa
 russia
 syria
 uk
 politics
 nato
 israel
 china
 eu
 trump
Cluster 6:
 putin
 trump
 syria
 ceasefire
 russia
 meeting
 deal
 signs
 cease
 assad
Cluster 7:
 damascus
 knife
 custom
 steel
 hunting
 handmade
 syrian
 near
 handle
 strike
Cluster 8:
 war
 syria
 civil
 world
 crimes
 russia
 israel
 trump
 torn
 iran
Cluster 9:
 says
 syria
 russia
 official
 trump
 war
 state
 military
 russian
 coalition
Cluster 10:
 lebanon
 border
 hezbollah
 syria
 operation
 launches
 thousands
 lebanese
 offensive
 launch
Cluster 11:
 cease
 syri

In [14]:
print("\n")
print("Prediction")

Y = vectorizer.transform(["Trump 3 generals dead during the first week of July is toast Trump"])
prediction = model.predict(Y)
print(prediction)

Y = vectorizer.transform(["Help trump is president"])
prediction = model.predict(Y)
print(prediction)
 



Prediction
[7]
[7]
