In [None]:
import numpy as np 
import pandas as pd
import re
import string
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score
import nltk
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import SimpleRNN, GRU, LSTM, Embedding, Dense
from tensorflow.keras import Sequential

# Load the training tweets from train.csv in the current working directory.
trainData = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Show the freshly loaded DataFrame as this cell's output.
trainData

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [None]:
# Swap every missing (NaN) entry, in any column, for a single-space string,
# then show the resulting frame as the cell output.
trainData = trainData.fillna(value=' ')
trainData

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [None]:
#pad all text entries out to 280 chars, the max limit for twitter
# Series.str.ljust right-pads every tweet with spaces in a single vectorised
# call, so no per-row iterrows()/.at write-back is needed.
# Entries that are already 280 characters or longer are left unchanged.
trainData['text'] = trainData['text'].str.ljust(280, ' ')

In [None]:
#encode every tweet as a fixed-width row of character codes
TWEET_WIDTH = 280        # maximum tweet length; each row holds exactly this many characters
SPACE_CODE = ord(" ")    # code used for padding and for the two framing columns

# Each text is truncated/padded to TWEET_WIDTH so rows can never be ragged,
# then every character is mapped through ord(), i.e. its Unicode code point
# (identical to the ASCII value for ASCII characters).
# Each row is framed by one extra space code on both sides, giving
# TWEET_WIDTH + 2 = 282 columns.
#ensure data is structured properly for the model (float32 matrix)
features = np.array(
    [
        [SPACE_CODE, *map(ord, text[:TWEET_WIDTH].ljust(TWEET_WIDTH)), SPACE_CODE]
        for text in trainData.text
    ],
    dtype='float32',
)
features

array([[ 32.,  79., 117., ...,  32.,  32.,  32.],
       [ 32.,  70., 111., ...,  32.,  32.,  32.],
       [ 32.,  65., 108., ...,  32.,  32.,  32.],
       ...,
       [ 32.,  77.,  49., ...,  32.,  32.,  32.],
       [ 32.,  80., 111., ...,  32.,  32.,  32.],
       [ 32.,  84., 104., ...,  32.,  32.,  32.]], dtype=float32)

In [None]:
def remove_URL(text):
    """Return ``text`` with every ``http://``/``https://`` or ``www.`` URL removed.

    A URL is taken to run up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)
def remove_html(text):
    """Return ``text`` with each ``<...>`` tag removed.

    The match is non-greedy, so every tag is removed on its own and the text
    between tags is kept.
    """
    return re.sub(r'<.*?>', '', text)
def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only blocks that actually hold emoji/symbols are matched. A single span
    from U+24C2 to U+1F251 would cover most of the Basic Multilingual Plane
    and therefore delete ordinary CJK, Hangul, etc. text as well, so that
    span is broken up into the specific symbol ranges listed below.
    """
    emoji_pattern = re.compile("["
                           "\U0001F600-\U0001F64F"  # emoticons
                           "\U0001F300-\U0001F5FF"  # symbols & pictographs
                           "\U0001F680-\U0001F6FF"  # transport & map symbols
                           "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
                           "\U0001F100-\U0001F251"  # enclosed alphanumeric / ideographic supplements
                           "\U00002600-\U000026FF"  # miscellaneous symbols
                           "\U00002702-\U000027B0"  # dingbats
                           "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                           "\U000024C2"             # circled latin capital M
                           "\U00003030\U0000303D"   # wavy dash, part alternation mark
                           "\U00003297\U00003299"   # circled ideograph congratulation / secret
                           "\U0000FE0F"             # variation selector-16 (emoji presentation)
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table that maps each punctuation character to None deletes it.
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(drop_table)

In [None]:
# Show the encoded feature matrix again as this cell's output.
features

array([[ 32.,  79., 117., ...,  32.,  32.,  32.],
       [ 32.,  70., 111., ...,  32.,  32.,  32.],
       [ 32.,  65., 108., ...,  32.,  32.,  32.],
       ...,
       [ 32.,  77.,  49., ...,  32.,  32.,  32.],
       [ 32.,  80., 111., ...,  32.,  32.,  32.],
       [ 32.,  84., 104., ...,  32.,  32.,  32.]], dtype=float32)

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
# Simple RNN network, think these give great results but not familiar enough to fine tune them
# Architecture: embedding -> one SimpleRNN layer -> 2-unit dense head.
# SimpleRNN (imported at the top of the notebook) is the recurrent layer that
# takes a unit count plus dropout / recurrent_dropout; the bare name `RNN`
# is not imported anywhere and would raise NameError.
# NOTE(review): the head uses 2 sigmoid units; if the model is trained with
# sparse_categorical_crossentropy a softmax head is the usual pairing — confirm.
# NOTE(review): Embedding(20000, ...) only accepts ids below 20000 — verify the
# input character codes stay in that range.
layers_RNN = [
    Embedding(20000, 64),
    SimpleRNN(64, dropout = 0.2, recurrent_dropout = 0.2),
    Dense(2, activation = 'sigmoid')
]
rnn_simple = Sequential(layers_RNN)

In [None]:
# Baseline LSTM classifier: embedding -> one LSTM layer -> 2-unit dense head.
# Author's note: these are thought to give great results, but the
# hyper-parameters below have not been fine-tuned.
layers_LSTM = [
    Embedding(input_dim=20000, output_dim=64),
    LSTM(units=64, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=2, activation='sigmoid'),
]
LSTM_simple = Sequential(layers_LSTM)

In [None]:
# Baseline GRU classifier: embedding -> one GRU layer -> 2-unit dense head.
# Author's note: these are thought to give great results, but the
# hyper-parameters below have not been fine-tuned.
layers_GRU = [
    Embedding(input_dim=20000, output_dim=64),
    GRU(units=64, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=2, activation='sigmoid'),
]
GRU_simple = Sequential(layers_GRU)

In [None]:
from keras import losses
# Train the simple RNN model and let the cell output track the results.
# Labels are the `target` column of the training frame.
target = trainData.target
#model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])
rnn_simple.compile(
    optimizer="Adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
rnn_simple.fit(x=features, y=target, epochs=3)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

KeyboardInterrupt: ignored