In [15]:
# import core libraries 
import ast
import csv
import datetime
import itertools
import json
import os
import pathlib
import pprint
import re
from collections import Counter
from itertools import islice

# import third-party libraries
import numpy as np
import pandas as pd

# import visualizations
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
# set directory path data
# The data root can be overridden with the SYRIA_DATA_DIR environment variable so the
# notebook is not tied to one machine; the original location is kept as the default.
syria_data_dir = pathlib.Path(
    os.environ.get('SYRIA_DATA_DIR', '/Users/adamstueckrath/Desktop/syria_data/')
)

# both pre-processed inputs live in the same model/model_data folder
model_data_dir = syria_data_dir / 'model' / 'model_data'

# syria_events_csv file path
events_pre_processed_csv = model_data_dir / 'events_pre_processed.csv'

# tweets_no_rts_csv file path
tweets_pre_processed_csv = model_data_dir / 'tweets_pre_processed.csv'


In [17]:
# Load both pre-processed tables (tweets first, then events); the first row of
# each CSV is used as the column header.
tweets_df, events_df = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (tweets_pre_processed_csv, events_pre_processed_csv)
)


In [12]:
# Keep only the tweets whose normalized-token column is populated.
# NOTE(review): `t` is not referenced by the later cells visible here, which
# start again from `tweets_df` - confirm whether this filter was meant to be used.
t = tweets_df[tweets_df['tweet_text_normalize'].notna()]

In [None]:
# preview the first five tweet rows
tweets_df.head(n=5)


In [18]:
# preview the first five event rows
events_df.head(n=5)

Unnamed: 0,event_id,event_date,event_type,actor_1,assoc_actor_1,actor_2,assoc_actor_2,location,event_text,event_text_clean,event_text_tokenize,event_text_normalize
0,10317,2017-08-04,Remote violence,Unidentified Military Forces,,,,Thiban,Unknown warplanes targeted the village of Thib...,unknown warplanes targeted the village of thib...,"['unknown', 'warplanes', 'targeted', 'the', 'v...","['unknown', 'warplane', 'targeted', 'village',..."
1,10300,2017-08-04,Battle-No change of territory,AAS: Ahrar al Sham,,Opposition Rebels (Syria),Jund al Aqsa,Maar Shamarin,Clashes between Ahrar al-Sham militia and mili...,clashes between ahrar al sham militia and mili...,"['clashes', 'between', 'ahrar', 'al', 'sham', ...","['clash', 'ahrar', 'al', 'sham', 'militia', 'm..."
2,10283,2017-08-04,Remote violence,Islamist Rebels (Syria),,Military Forces of Syria (2000-),,Bashkwi,The Islamic rebel troops targeted Syrian army ...,the islamic rebel troops targeted syrian army ...,"['the', 'islamic', 'rebel', 'troops', 'targete...","['islamic', 'rebel', 'troop', 'targeted', 'syr..."
3,10318,2017-08-04,Remote violence,Military Forces of Syria (2000-),,,,Um Hartein,"The Syrian army shelled the villages of Murak,...",the syrian army shelled the villages of murak ...,"['the', 'syrian', 'army', 'shelled', 'the', 'v...","['syrian', 'army', 'shelled', 'village', 'mura..."
4,10319,2017-08-04,Remote violence,Unidentified Armed Group (Syria),,HTS: Hayat Tahrir al Sham,Civilians (Syria),Urum al-Kubra,Two HTS members and 2 civilians were killed in...,two hts members and 2 civilians were killed in...,"['two', 'hts', 'members', 'and', '2', 'civilia...","['two', 'hts', 'member', '2', 'civilian', 'kil..."


## What is Tf-idf?
Tf-idf is a very common technique for determining roughly what each document in a set of documents is “about”. It cleverly accomplishes this by looking at two simple metrics: tf (term frequency) and idf (inverse document frequency). Term frequency is the proportion of occurrences of a specific term to total number of terms in a document. Inverse document frequency is the inverse of the proportion of documents that contain that word/phrase. Simple, right!? The general idea is that if a specific phrase appears a lot of times in a given document, but it doesn’t appear in many other documents, then we have a good idea that the phrase is important in distinguishing that document from all the others. Let’s think about it a bit more concretely:

If the word "nails" shows up 5 times in the document we're looking at, it matters a great deal whether that document has 100 words in total or 10,000. The latter document mentions nails but doesn't seem to be significantly about nails (this is why term frequency is a proportion instead of a raw count).
If the word "nails" shows up in 1% of all documents, that's pretty different than if it shows up in 80% of all documents. In the latter case, it's less unique to the document we're looking at.

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer

Tf-idf is a transformation you apply to texts to get real-valued vectors, one per text. You can then obtain the cosine similarity of any pair of vectors by taking their dot product and dividing that by the product of their norms: $\cos\theta = \frac{a \cdot b}{\lVert a \rVert \, \lVert b \rVert}$. That yields the cosine of the angle between the vectors.

### What is the difference between TfidfVectorizer and CountVectorizer?

TfidfVectorizer combines all options of CountVectorizer and TfidfTransformer in a single model.

CountVectorizer just counts the word frequencies. Simple as that.

With the TfidfVectorizer the value increases proportionally to the count, but is offset by the frequency of the word in the corpus. This offset is the IDF (inverse document frequency) part.

In [None]:
# Working sample: the first 5,000 tweets that actually have normalized tokens.
# - Rows with a missing 'tweet_text_normalize' are dropped first, because the
#   parsing step applied to this column (ast.literal_eval) rejects NaN values.
# - .copy() is taken *after* slicing so that tweet_test is an independent frame;
#   assigning columns to a slice of a frame can raise SettingWithCopyWarning.
tweet_test = (
    tweets_df
    .dropna(subset=['tweet_text_normalize'])
    .iloc[:5000]
    .copy()
)

In [None]:
# Independent copy of the complete events table (no sub-sampling is applied).
event_test = events_df.copy(deep=True)

In [None]:
def _parse_token_list(value):
    """Turn a stringified token list (as stored in the CSV) back into a list.

    Values that are not strings are returned unchanged, so this cell can be
    re-run after the column has already been parsed without raising.
    """
    if isinstance(value, str):
        # literal_eval only evaluates Python literals; it does not execute code
        return ast.literal_eval(value)
    return value


tweet_test['tweet_text_normalize'] = tweet_test['tweet_text_normalize'].apply(_parse_token_list)
event_test['event_text_normalize'] = event_test['event_text_normalize'].apply(_parse_token_list)


In [None]:
# pull the normalized-token columns out of the frames as plain Python lists
tweet_test_vec = tweet_test['tweet_text_normalize'].tolist()
event_test_vec = event_test['event_text_normalize'].tolist()


In [None]:
# Collapse every token list into one space-separated string (the input format the
# vectorizers below are fed). Entries that are already strings are left as they
# are: re-running this cell would otherwise join their individual characters.
tweet_test_vec = [doc if isinstance(doc, str) else ' '.join(doc) for doc in tweet_test_vec]
event_test_vec = [doc if isinstance(doc, str) else ' '.join(doc) for doc in event_test_vec]
print(tweet_test_vec[:10])
print(event_test_vec[:10])


### The difference between fit_transform(), fit(), and transform()
Every sklearn transformer's fit() just calculates the parameters (e.g. μ and σ in the case of StandardScaler) and saves them as the object's internal state. Afterwards, you can call its transform() method to apply the transformation to a particular set of examples.

fit_transform() joins these two steps and is used for the initial fitting of parameters on the training set x, but it also returns a transformed x′. Internally, it just calls first fit() and then transform() on the same data.


In layman's terms, fit_transform means to do some calculation and then do transformation (say calculating the means of columns from some data and then replacing the missing values). So for training set, you need to both calculate and do transformation.

But for the testing set (here, the event set), machine learning applies what was learned from the training set, so nothing needs to be recalculated; it just performs the transformation.

### Explanation #2
So by fit the imputer calculates the means of columns from some data, and by transform it applies those means to some data (which is just replacing missing values with the means). If both these data are the same (i.e. the data for calculating the means and the data that means are applied to) you can use fit_transform which is basically a fit followed by a transform.

Now your questions:

Why we might need to transform data?

"For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with scikit-learn estimators which assume that all values in an array are numerical" (source)

What does it mean fitting model on training data and transforming to test data?

The fit of an imputer has nothing to do with fit used in model fitting. So using imputer's fit on training data just calculates means of each column of training data. Using transform on test data then replaces missing values of test data with means that were calculated from training data.



## Test 1

In [None]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer

# shared Porter stemmer instance used by stem_tokens()
stemmer = nltk.stem.PorterStemmer()
# str.translate table: every ASCII punctuation code point maps to None (deleted)
remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))

def stem_tokens(tokens):
    """Return a new list holding ``stemmer.stem(token)`` for each token, in order."""
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

def normalize(text):
    """Remove punctuation, lowercase, stem.

    The text is lowercased, stripped of the characters listed in
    ``remove_punctuation_map``, tokenized with ``nltk.word_tokenize`` and the
    resulting tokens are passed through ``stem_tokens``.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(remove_punctuation_map)
    words = nltk.word_tokenize(without_punctuation)
    return stem_tokens(words)

# Tf-idf vectorizer that tokenizes with normalize() and uses the 'english' stop-word option.
# NOTE(review): later cells rebind the name `vectorizer`, and cosine_sim() looks
# it up at call time - re-run this cell before calling cosine_sim().
vectorizer = TfidfVectorizer(
    stop_words='english',
    tokenizer=normalize,
)

def cosine_sim(text1, text2):
    """Return the similarity score between ``text1`` and ``text2``.

    The module-level ``vectorizer`` is fitted on just these two texts and the
    score is the dot product of their two row vectors.
    NOTE(review): that dot product is a cosine only if the vectorizer
    L2-normalizes its rows (the documented TfidfVectorizer default) - confirm
    if the vectorizer settings change.

    Parameters
    ----------
    text1, text2 : str
        The two documents to compare.

    Returns
    -------
    float
        Entry [0, 1] of the 2x2 pairwise product matrix.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # `@` is a matrix product for both sparse matrices and sparse arrays, and
    # `.toarray()` replaces the `.A` shorthand that newer SciPy no longer provides.
    pairwise = (tfidf @ tfidf.T).toarray()
    return pairwise[0, 1]



## Test 2

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
tweet_set = tweet_test_vec #Tweets
event_set = event_test_vec #Event Query

vectorizer = TfidfVectorizer()
# Fit the vocabulary and idf weights on the tweets only ...
trainVectorizerArray = vectorizer.fit_transform(tweet_set).toarray()
# ... and only *transform* the events. Calling fit_transform here would refit the
# vectorizer on the events, giving the two arrays different columns, so their
# rows could no longer be compared with cosine_similarity.
testVectorizerArray = vectorizer.transform(event_set).toarray()


# print ("cosine scores ==> ")
# cosine_similarity(trainVectorizerArray[0:1], testVectorizerArray)  #here the first element of tfidf_matrix_train is matched with other three elements



## Test 3

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as LA

# tweets act as the fitting corpus, events as the query corpus
tweet_set, event_set = tweet_test_vec, event_test_vec

vectorizer = CountVectorizer()
transformer = TfidfTransformer()

# the vocabulary is learned from the tweets; events are counted against that same vocabulary
trainVectorizerArray = vectorizer.fit_transform(tweet_set).toarray()
testVectorizerArray = vectorizer.transform(event_set).toarray()

def cx(a, b):
    """Cosine similarity of vectors ``a`` and ``b``, rounded to 3 decimals.

    Returns 0.0 when either vector has zero norm (e.g. a document sharing no
    vocabulary with the fitted corpus) instead of dividing 0 by 0 and
    producing NaN.
    """
    denominator = LA.norm(a) * LA.norm(b)
    if denominator == 0:
        return 0.0
    return round(np.inner(a, b) / denominator, 3)
# for train_vector in trainVectorizerArray:
#     print('Tweet')
#     for test_vector in testVectorizerArray:
        
#         cosine = cx(train_vector, test_vector)
#         print (cosine)

# transformer.fit(trainVectorizerArray)
# print (transformer.transform(trainVectorizerArray).toarray())

# transformer.fit(testVectorizerArray)
# tfidf = transformer.transform(testVectorizerArray)
# print (tfidf.todense())


## Test 4

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy.linalg as LA

# tweets act as the fitting corpus, events as the query corpus
tweet_set, event_set = tweet_test_vec, event_test_vec

vectorizer = TfidfVectorizer()

# fit on the tweets, then project the events onto the same fitted vectorizer
tweetVectorizerArray = vectorizer.fit_transform(tweet_set).toarray()
eventVectorizerArray = vectorizer.transform(event_set).toarray()

def cx(a, b):
    """Cosine similarity of vectors ``a`` and ``b``, rounded to 3 decimals.

    Returns 0.0 when either vector has zero norm (e.g. a document sharing no
    vocabulary with the fitted corpus) instead of dividing 0 by 0 and
    producing NaN.
    """
    denominator = LA.norm(a) * LA.norm(b)
    if denominator == 0:
        return 0.0
    return round(np.inner(a, b) / denominator, 3)
# For every tweet, keep its best (maximum) cosine similarity against all events.
# One matrix product replaces the nested Python loop over every tweet/event pair.
tweet_norms = LA.norm(tweetVectorizerArray, axis=1)
event_norms = LA.norm(eventVectorizerArray, axis=1)
# An all-zero row has zero dot products with everything, so swapping its norm for 1
# leaves its similarities at 0 instead of 0/0 = NaN (NaN would make max() and the
# later sort order-dependent).
tweet_norms[tweet_norms == 0] = 1.0
event_norms[event_norms == 0] = 1.0

similarity = tweetVectorizerArray @ eventVectorizerArray.T
similarity /= tweet_norms[:, np.newaxis]
similarity /= event_norms[np.newaxis, :]

# same 3-decimal rounding as cx(); rounding is monotonic, so rounding the row
# maximum matches taking the maximum of the rounded values
test = np.round(similarity.max(axis=1), 3).tolist()



In [None]:
# order the per-tweet best scores from highest to lowest (in place) and show the top ten
test.sort(reverse=True)
print(list(islice(test, 10)))