In [62]:
import numpy as np
import pandas as pd
import keras
import gensim
import os
import pickle
import matplotlib.pyplot as plt
from LDA_helpers import process_tweets
from helpers import *
import re
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv1D, MaxPooling1D, GlobalAveragePooling1D, Flatten


In [2]:
# Resolve the raw-data and generated-data folders, then list what each one
# currently contains. `access_folder` is not defined in this notebook;
# presumably it comes from `from helpers import *` (TODO confirm).
DATA_PATH = access_folder()
GENERATED_DATA_PATH = access_folder('generated')
for folder in (DATA_PATH, GENERATED_DATA_PATH):
    print(os.listdir(folder))

['IRAhandle_tweets_4.csv', 'IRAhandle_tweets_5.csv', 'rus_troll_tweet_stats.csv', 'IRAhandle_tweets_7.csv', 'IRAhandle_tweets_6.csv', 'rus_troll_user.csv', 'IRAhandle_tweets_2.csv', '.DS_Store', 'IRAhandle_tweets_3.csv', 'IRAhandle_tweets_1.csv', 'line_to_author.pickle', 'iran_troll_tweet_stats.csv', 'rus_troll_tweet_text.csv', '.gitignore', 'iran_troll_tweet_text.csv', 'tweet_author_df.pickle', 'iran_troll_user.csv', 'hashtags.txt', 'rus_troll_tweet_metadata.csv', 'iran_troll_tweet_metadata.csv', 'IRAhandle_tweets_8.csv', 'IRAhandle_tweets_9.csv']
['Unknown_tweets.txt', 'test_indices.npy', '.DS_Store', 'cleaned_RightTroll_tweets.txt', 'cleaned_Unknown_tweets.txt', 'cleaned_HashtagGamer_tweets.txt', 'benja.pkl', 'LeftTroll_tweets.txt', 'NonEnglish_tweets.txt', 'NewsFeed_tweets.txt', 'cleaned_Fearmonger_tweets.txt', 'benja.csv', 'cleaned_NewsFeed_tweets.txt', '.gitignore', 'training_indices.npy', 'cleaned_NonEnglish_tweets.txt', 'Fearmonger_tweets.txt', 'HashtagGamer_tweets.txt', 'weath

In [3]:
# Load the tweet/hashtag table; the first CSV column becomes the row index.
tweets_csv = GENERATED_DATA_PATH + 'tweets_n_hashtags.csv'
data_df = pd.read_csv(tweets_csv, index_col=0)

# Keep the topic as a categorical column and expose its integer category
# code as 'Topic Ids' (this is what the classifier is trained on).
topic_as_category = data_df['Topic'].astype('category')
data_df['Topic'] = topic_as_category
data_df['Topic Ids'] = topic_as_category.cat.codes
data_df

Unnamed: 0,Topic,tweet_text,hashtag,Topic Ids
33,News,"['john', 'carroll', 'university', 'get', 'gift']",news,8
34,News,"['spring', 'cook', 'book', 'healthy', 'chocola...",local,8
37,News,"['rocky', 'river', 'prepare', 'more', 'floodin...",news,8
39,News,"['forecast', 'cooler', 'mid', 'week', 'cooler'...",news,8
40,News,"['brother', 'farook', 'decorate', 'veteran', '...",TopNews,8
42,News,"['arrest', 'connection', 'art', 'festival', 't...",news,8
44,News,"['director', 'craven', 'die', 'scream', 'filmm...",news,8
45,Sports,"['secretariat', 'dominate', 'american', 'pharo...",sports,10
46,News,"['black', 'box', 'miss', 'german', 'airline', ...",local,8
48,Sports,"['dale', 'earnhardt', 'get', 'engage', 'girlfr...",sports,10


In [4]:
# Index values reserved for testing, read back from the generated-data folder.
test_indices_file = GENERATED_DATA_PATH + 'test_indices.npy'
test_indices = np.load(test_indices_file)
test_indices

array([ 524589, 1795012, 1642773, ...,  811295, 1303909,  778210])

In [5]:
from collections import Counter

# Each tweet_text cell holds the string form of a token list, e.g.
# "['john', 'carroll', 'university']". Turn it back into a list of words:
#   1. drop the surrounding brackets,
#   2. drop a bytes-literal prefix (b'...') in front of a token,
#   3. drop the remaining quotes,
#   4. split on the list separator.
# The bytes prefix is matched with a leading word boundary (\b). The previous
# plain "b'" pattern also matched the last letter of every word ending in
# "b" followed by its closing quote, so 'club' was turned into 'clu'.
raw_text = data_df.tweet_text.apply(lambda x: x[1:-1])
raw_text = raw_text.replace(r"\bb'", '', regex=True).replace("'", '', regex=True)
data_df.tweet_text = raw_text.apply(lambda x: x.split(', '))

# Topic names. NOTE: the position of a name in this list is NOT its
# 'Topic Ids' code (the loaded table shows News -> 8 and Sports -> 10).
categories = ['News', 'Sports', 'Crime', 'Fukushima', 'Entertainment', 
              'Anti-Trump', 'Patriot', 'Trump Support',
              'Foreign Countries', 'Health',
              'Black Support', 'Anti-Islam']

# Case-insensitive word frequencies over the whole corpus; every tweet is
# already a list of words at this point.
vocab = Counter(word.lower() for text in data_df.tweet_text for word in text)

total_words = len(vocab)

def get_word_2_index(vocab):
    """Map every word of ``vocab`` to a dense, zero-based integer index.

    Parameters
    ----------
    vocab : iterable of str
        Words to index (e.g. a ``Counter`` or a list). Words are lower-cased
        before being used as keys.

    Returns
    -------
    dict
        ``{lower-cased word: index}`` with indices ``0 .. len(result) - 1`` in
        order of first appearance.

    Notes
    -----
    Indices are taken from the size of the mapping rather than from the
    position in ``vocab``: if two entries collapse to the same lower-cased key
    the indices stay contiguous, so every index is a valid position in a
    vector of length ``len(result)``.
    """
    word2index = {}
    for word in vocab:
        # setdefault keeps the index of the first occurrence of a key.
        word2index.setdefault(word.lower(), len(word2index))

    return word2index

# Word -> vector position lookup; passed to train() as the encoding dictionary.
word2index = get_word_2_index(vocab)

In [65]:
def data_2_1_hot_encoding(tweets, dictionary):
    """Encode tokenised tweets as binary bag-of-words vectors.

    Parameters
    ----------
    tweets : iterable of iterable of str
        One sequence of words per tweet.
    dictionary : dict
        Maps a word to its position in the output vector. Positions must lie
        in ``0 .. len(dictionary) - 1``.

    Returns
    -------
    list of numpy.ndarray
        One float vector of length ``len(dictionary)`` per tweet, holding 1 at
        the position of every word that occurs in the tweet and 0 elsewhere.

    Raises
    ------
    KeyError
        If a word is found in ``dictionary`` neither as-is nor lower-cased.
    """
    vocab_size = len(dictionary)  # loop-invariant, computed once
    x_data = []
    for tweet in tweets:
        line = np.zeros(vocab_size)
        for word in tweet:
            # The vocabulary is keyed by lower-cased words, so fall back to
            # the lower-cased form when the token itself is not a key.
            if word not in dictionary and isinstance(word, str):
                word = word.lower()
            line[dictionary[word]] = 1
        x_data.append(line)
    return x_data

def neural_net(shape):
    """Build and compile the 1-D convolutional topic classifier.

    Parameters
    ----------
    shape : int
        Length of one input vector; the network takes inputs of shape
        ``(shape, 1)``.

    Returns
    -------
    keras.models.Sequential
        The compiled model (categorical cross-entropy loss, rmsprop optimizer,
        accuracy metric). Its summary is printed as a side effect.
    """
    layer_stack = [
        Conv1D(64, 3, activation='relu', input_shape=(shape, 1)),
        Conv1D(64, 3, activation='relu'),
        MaxPooling1D(13),
        Flatten(),
        # Earlier dense variant, kept for reference:
        #   Dense(512, input_shape=(shape, ), activation='relu')
        Dropout(0.5),
        # 12 output units with softmax: one probability per topic class.
        Dense(12, activation='softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'],
    )
    model.summary()
    return model


def get_training_indices(data_indexes, test_indexes):
    """Return the entries of ``data_indexes`` that are not in ``test_indexes``.

    Parameters
    ----------
    data_indexes : array-like
        All candidate index values.
    test_indexes : array-like
        Index values reserved for testing.

    Returns
    -------
    numpy.ndarray
        The values of ``data_indexes`` absent from ``test_indexes``, in their
        original order.

    Notes
    -----
    The membership test is vectorised with ``np.isin``. Testing
    ``index not in test_indexes`` element by element scans the whole test
    array for every index, which is quadratic on large inputs.
    """
    data_indexes = np.asarray(data_indexes)
    is_test = np.isin(data_indexes, test_indexes)
    return data_indexes[~is_test]

def train(model , training_indices, data, class_weight, dictionary,
          n_splits=50, epochs=10, batch_size=32, num_classes=12):
    """Fit ``model`` chunk by chunk on the training rows, then save it.

    The training indices are cut into ``n_splits`` chunks; each chunk is
    encoded with ``data_2_1_hot_encoding`` and passed to ``model.fit`` in turn,
    so only one chunk of encoded vectors exists at a time.

    Parameters
    ----------
    model : keras model
        Compiled model taking inputs of shape ``(len(dictionary), 1)``.
    training_indices : array-like
        Index *labels* of the rows of ``data`` to train on.
    data : pandas.DataFrame
        Must provide the columns ``tweet_text`` (list of words per row) and
        ``Topic Ids`` (integer class id per row).
    class_weight : dict
        Passed through to ``model.fit`` as ``class_weight``.
    dictionary : dict
        Word -> vector position mapping used for the encoding.
    n_splits : int, optional
        Number of chunks the training indices are divided into.
    epochs : int, optional
        Epochs run on every chunk.
    batch_size : int, optional
        Batch size passed to ``model.fit``.
    num_classes : int, optional
        Number of target classes for the one-hot label encoding.

    Side effects
    ------------
    Prints progress and writes the model to
    ``GENERATED_DATA_PATH + 'model.h5'``.
    """
    splits = np.array_split(training_indices, n_splits)
    for split_i, split in enumerate(splits, start=1):
        print('split: {}/{}'.format(split_i, len(splits)))
        if len(split) == 0:
            # np.array_split yields empty chunks when there are fewer indices
            # than splits; there is nothing to fit on.
            continue

        # Features and labels are both selected by index *label* from the
        # same rows. The labels used to be read with data.iloc[label], which
        # is positional: with a non-contiguous index it returned the class of
        # an unrelated row (or went out of bounds).
        chunk = data.loc[split, ['tweet_text', 'Topic Ids']]
        X_train = np.array(data_2_1_hot_encoding(chunk['tweet_text'], dictionary))
        y_train = chunk['Topic Ids'].values

        # Sanity check: number of distinct vocabulary words in the first tweet.
        print(X_train[0,:].sum())

        one_hot_labels = keras.utils.to_categorical(y_train, num_classes=num_classes)
        # Conv1D expects a trailing channel axis: (samples, vocabulary, 1).
        model.fit(X_train.reshape(X_train.shape[0], X_train.shape[1], 1), one_hot_labels,
                  epochs=epochs, batch_size=batch_size, verbose=2, validation_split=0.2,
                  class_weight = class_weight)
    model.save(GENERATED_DATA_PATH + 'model.h5')

In [7]:
# Disabled: encoding the whole frame in one go. train() encodes each split
# on the fly instead (presumably to bound memory use - TODO confirm).
#data_df['X'] = data_2_1_hot_encoding(data_df.tweet_text, word2index)



In [8]:
# Every index value of data_df that is not listed in test_indices.
training_indices = get_training_indices(data_df.index.values, test_indices)
training_indices

array([     33,      34,      37, ..., 2150827, 2150834, 2150857])

In [9]:
# Persist the training indices in the generated-data folder.
np.save(GENERATED_DATA_PATH + 'training_indices.npy',training_indices)

In [None]:
model = neural_net(len(word2index))

# Balanced class weights: n_samples / (n_classes * class_count), so that
# under-represented topics weigh more in the loss. The previous expression
# divided the class counts by the *sum of the topic id codes*, which is not a
# normaliser, and it gave the largest weights to the most frequent topics.
topic_counts = data_df['Topic Ids'].value_counts()
s = len(data_df) / (len(topic_counts) * topic_counts)
train(model , training_indices, data_df, s.to_dict(), word2index)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_32 (Conv1D)           (None, 66216, 64)         256       
_________________________________________________________________
conv1d_33 (Conv1D)           (None, 66214, 64)         12352     
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 5093, 64)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 325952)            0         
_________________________________________________________________
dropout_18 (Dropout)         (None, 325952)            0         
_________________________________________________________________
dense_56 (Dense)             (None, 12)                3911436   
Total params: 3,924,044
Trainable params: 3,924,044
Non-trainable params: 0
_________________________________________________________________
