In [None]:
!pip install xgboost
!pip install tabulate
!pip install vaderSentiment
!pip install --upgrade pandas

In [1]:
import xgboost
import warnings
import textblob

import numpy as np
import pandas as pd
import keras.utils as U
import keras.layers as L
import keras.models as M
import keras.optimizers as opt
import matplotlib.pyplot as plt

from utils import *
from doc_utils import *
from tabulate import tabulate
from sklearn.ensemble import *
from collections import Counter
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import RidgeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from nlp_utils import get_features, make_predictions
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import precision_score, recall_score, f1_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  from numpy.core.umath_tests import inner1d


In [None]:
# Default canvas size for every figure drawn in this notebook
plt.rcParams['figure.figsize'] = [20, 8]

# The tweets are stored in two files (one CSV, one pickle): load both, stack
# them, discard rows in which every field is missing and remove the
# 'Unnamed: 0' column.
# NOTE: unpickling can execute arbitrary code — only load trusted .pkl files.
df = (
    pd.concat([
        pd.read_csv("../../data/Tweets3WeeksLocations.csv"),
        pd.read_pickle("../../data/Tweets3Weeks_2Locations.pkl"),
    ])
    .dropna(how='all')
    .drop(columns=['Unnamed: 0'])
)

In [None]:
# Both values are forwarded unchanged to process_acled_csv below
lead_days, days_window = 2, 5

# Date range handed to process_acled_csv as `start` / `end`
start_date, end_date = map(pd.to_datetime, ("23-feb-2019", "13-apr-2019"))

# Build the per-location labels from the ACLED export
labels = process_acled_csv(
    "../../data/1900-01-01-2019-04-15-India.csv",
    top_locations=-1,
    start=start_date,
    end=end_date,
    lead_days=lead_days,
    days_window=days_window,
)

In [None]:
# Pass the tweets through the project's clean_df helper.
# Running this cell again feeds the already-cleaned frame to clean_df a second
# time; re-run the loading cell first if a fresh clean is needed.
df = clean_df(df)
clean = True  # flag recording that `df` has been through clean_df

In [None]:
# Peek at the first five rows (5 is the default for head())
df.head()

In [None]:
# Replace `df` with the result of the project's get_tweet_sentiment helper
# (presumably the same tweets with sentiment columns added — confirm in utils).
df = get_tweet_sentiment(df)

In [None]:
# Plot the distribution of the tweets' 'lang' column with the project's
# plot_counter helper (num_elements=10 presumably keeps the 10 most frequent
# languages — TODO confirm in utils).
plot_counter(df['lang'], num_elements=10, xlabel="Language", ylabel="Number of Tweets", title="Language Distribution")

In [None]:
# Parse the creation timestamps, then order the tweets chronologically
df = (
    df.assign(created_at=pd.to_datetime(df['created_at']))
      .sort_values(by=['created_at'])
)

In [None]:
print(start_date, end_date)

# Silence warnings only while the helper runs. catch_warnings() puts the
# previous filter list back on exit (even if the call raises), instead of
# leaving blanket 'ignore' / 'default' filters installed for the rest of the
# session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    location_date_dict = interleave_location_and_date(df, start_date, end_date)

### Current Features
   * Number of tweets each day
   * Average positive, negative, neutral and compound sentiment scores
   * Number of tweets with negative sentiment

### TODO
   * Hate speech
   * Violent speech

In [None]:
# Compute the feature matrix of every location, reporting progress as we go
location_features_dict = {}
for location, tweets_by_date in location_date_dict.items():
    location_features_dict[location] = get_features(tweets_by_date)
    print(location, "done")

In [None]:
# Run the make_predictions helper imported from nlp_utils on the per-location
# features and labels, with permute=False (what the flag does is defined in
# nlp_utils — confirm there).
make_predictions(location_features_dict, labels, permute=False)

In [2]:
def generate_LSTM_model(history, num_features=6, learning_rate=0.01):
    """ Generates a compiled LSTM model with two output classes

    Layers: input of shape (history, num_features) -> LSTM(10) ->
    Dense(20, relu) -> Dense(2) -> Softmax. The model is compiled with the
    categorical cross-entropy loss and the Adam optimizer.

    Input -
            history - Number of history points (time steps) per sample
            num_features - Number of features per time step; defaults to 6,
                           the value that used to be hard-coded
            learning_rate - Learning rate handed to Adam; defaults to 0.01
    Returns - The compiled model
    """
    model = M.Sequential()
    model.add(L.InputLayer(input_shape=(history, num_features)))
    model.add(L.LSTM(10))
    model.add(L.Dense(20, activation='relu'))
    # Two logits followed by a softmax: one probability per class
    model.add(L.Dense(2))
    model.add(L.Softmax())

    model.compile(loss="categorical_crossentropy", optimizer=opt.Adam(learning_rate))
    return model

def generate_CNN_model(history, num_features=6, learning_rate=0.01):
    """ Generate a compiled 1D CNN model with two output classes

    Layers: input of shape (history, num_features) -> Conv1D(32, kernel 3) ->
    Conv1D(16, kernel 1) -> Flatten -> Dense(20, relu) -> Dense(2) -> Softmax.
    The model is compiled with the categorical cross-entropy loss and the
    Adam optimizer.

    Input -
            history - Number of history points (time steps) per sample; must
                      be at least 3 because the first convolution spans 3
                      time steps
            num_features - Number of features per time step; defaults to 6,
                           the value that used to be hard-coded
            learning_rate - Learning rate handed to Adam; defaults to 0.01
    Returns - The compiled model
    Raises - ValueError if history is smaller than 3
    """
    # Fail early with a readable message instead of a shape error from the
    # first convolution, which has nothing to slide over when history < 3.
    if history < 3:
        raise ValueError(
            "generate_CNN_model needs history >= 3 (kernel_size=3), got %r" % (history,))

    model = M.Sequential()
    model.add(L.InputLayer(input_shape=(history, num_features)))
    # NOTE(review): no activation is passed to either Conv1D layer — confirm
    # that stacking them without a non-linearity in between is intended.
    model.add(L.Conv1D(32, kernel_size=3))
    model.add(L.Conv1D(16, kernel_size=1))
    model.add(L.Flatten())
    model.add(L.Dense(20, activation='relu'))
    # Two logits followed by a softmax: one probability per class
    model.add(L.Dense(2))
    model.add(L.Softmax())

    model.compile(loss="categorical_crossentropy", optimizer=opt.Adam(learning_rate))
    return model

def _make_history_windows(X, y, history):
    """ Turns a daily series into (window, target) pairs

    Sample i holds the `history` consecutive rows X[i: i + history] and is
    paired with the label y[i + history + 1].
    Returns - (windows, targets) as numpy arrays; both empty when the series
              is too short to yield a single pair
    """
    # NOTE(review): the window ends at row i + history - 1, so the target at
    # i + history + 1 skips row i + history — confirm this offset is intended.
    num_samples = max(len(X) - history - 1, 0)
    windows = [X[i: i + history] for i in range(num_samples)]
    targets = [y[i + history + 1] for i in range(num_samples)]
    return np.array(windows), np.array(targets)


def make_deep_predictions(location_features_dict, labels, model=None, permute=False, lead_days=2, days_window=5, history=3, epochs=100):
    """
    Trains and evaluates one deep model per location and prints a table with
    train/test accuracy, precision, recall, F1 and the share of positive
    labels, followed by an "Average" row.

    Input - 
            location_features_dict - The dict mapping from location to features
            labels - Label dict generated from process_acled_csv(..)
            model - Function that takes `history` and returns a freshly
                    compiled model exposing fit/predict (for instance
                    generate_CNN_model). A new model is built for every
                    location. Defaults to generate_LSTM_model
            permute - Permute the data before train-test split
            lead_days, days_window - Their sum is the number of trailing
                    feature rows dropped so the features line up with labels
            history - The number of data points for contextualization
            epochs - Number of training epochs per location
    Returns - None
    Raises - TypeError if `model` is not a model-building function
    """
    # Resolve the model factory once. Previously the `model` argument was
    # silently overwritten by generate_LSTM_model(history).
    if model is None:
        build_model = generate_LSTM_model
    elif not callable(model) or (hasattr(model, "fit") and not isinstance(model, type)):
        # An already-built model would carry its weights from one location
        # to the next, so only factories are accepted.
        raise TypeError("model must be a function mapping history to a compiled model, "
                        "e.g. generate_CNN_model")
    else:
        build_model = model

    # Table for presenting on tabulate
    result_table = []

    # Locations present on both dicts, sorted for clarity
    common_locations = sorted(set(location_features_dict) & set(labels))

    for common_location in common_locations:
        # Get data and labels
        X = np.array(location_features_dict[common_location])
        y = np.array(labels[common_location])

        # Eliminate last days to match labels.shape. An explicit end index
        # keeps every row when lead_days + days_window == 0 (X[:-0] is empty).
        X = X[:max(len(X) - (lead_days + days_window), 0)]

        # Generate data for LSTM/CNN
        X, y = _make_history_windows(X, y, history)

        # Both the train and the test split need at least one sample
        if len(X) < 2:
            print(common_location, "skipped: not enough data points")
            continue

        # Permute randomly if specified
        if permute:
            p = np.random.permutation(len(X))
            X, y = X[p], y[p]

        # Split data into train & test - 75% & 25%
        split = int(0.75 * len(X))
        xtrain, xtest = X[:split], X[split:]
        ytrain = U.to_categorical(y[:split], num_classes=2)
        ytest = U.to_categorical(y[split:], num_classes=2)

        # Fit a fresh model on the train data
        net = build_model(history)
        net.fit(xtrain, ytrain, epochs=epochs, verbose=0)

        # Make predictions and turn one-hot rows / probabilities into class ids
        ypred = np.argmax(net.predict(xtest), axis=1)
        ytrain_pred = np.argmax(net.predict(xtrain), axis=1)
        ytrain, ytest = np.argmax(ytrain, axis=1), np.argmax(ytest, axis=1)

        # Compute metrics. labels=np.unique(ypred) limits the weighted scores
        # to the classes that were actually predicted.
        predicted_classes = np.unique(ypred)
        train_acc = np.mean(ytrain_pred == ytrain)
        test_acc = np.mean(ytest == ypred)
        precision = precision_score(ytest, ypred, average='weighted', labels=predicted_classes)
        recall = recall_score(ytest, ypred, average='weighted', labels=predicted_classes)
        f1 = f1_score(ytest, ypred, average='weighted', labels=predicted_classes)

        # Add row to result_table
        result_table.append([common_location,
                             np.round(train_acc, 2), np.round(test_acc, 2),
                             np.round(precision, 2), np.round(recall, 2),
                             np.round(f1, 2),
                             np.round(np.sum(y) / len(y), 2)])

    # Nothing to average or print
    if not result_table:
        print("No location had enough data to evaluate")
        return

    # Average stats over the numeric columns (everything but the location)
    stats = np.array([row[1:] for row in result_table], dtype=np.float32)
    averages = np.round(np.mean(stats, axis=0), 2)

    # Best test accuracy first, averages last
    result_table.sort(key=lambda row: row[2], reverse=True)
    result_table.append(["Average"] + averages.tolist())

    # Header for table
    header = ["Location", "Train Accuracy", "Test Accuracy",
              "Precision", "Recall", "F1 Score", "+'s in data"]

    # Print tabulated result
    print(tabulate(result_table,
                   tablefmt="pipe",
                   stralign="center",
                   headers=header))
    
    

In [None]:
# Forward the notebook-level lead_days / days_window so the feature trimming
# inside make_deep_predictions stays in step with the values `labels` were
# built with, instead of relying on the function defaults happening to match.
make_deep_predictions(location_features_dict, labels, permute=False,
                      lead_days=lead_days, days_window=days_window)