In [1]:
import numpy as np
import pandas as pd

In [3]:
train_tweets = pd.read_csv('train_tweets.csv')
# Shuffle every row (frac = 1). The seed is fixed so the row order -- and with it
# the train/test split and the tokenizer vocabulary built later -- is identical on
# every Restart & Run All; with an unseeded shuffle the seeded split further down
# still changed from run to run.
train_tweets = train_tweets.sample(frac = 1, random_state = 42)
train_tweets.head()

Unnamed: 0,id,label,tweet
19654,19655,0,â #cny expected to remain under pressure â...
2035,2036,0,i am terrific. #i_am #positive #affirmation
21334,21335,0,why do i fear a post-apocalyptic world? find o...
13735,13736,0,@user is wishing all you proud fathers out the...
23423,23424,1,@user a #nation should not be #judged by how i...


In [5]:
from keras.preprocessing.text import text_to_word_sequence
from nltk.corpus import stopwords
# Materialise the English stop-word list once, as a set, so the `in stop_words`
# checks performed per token by the cleaning functions are hash lookups.
stop_words = set(stopwords.words('english'))
def train_tweets_cleaning(doc):
    """Derive the ``clean_text`` and ``review_length`` columns of ``train_tweets``.

    Each entry of ``doc`` is tokenised with ``text_to_word_sequence`` and re-joined
    with single spaces. Tokens are then discarded when they appear in
    ``stop_words``, are a single character long, or are not purely alphabetic.
    ``review_length`` is the number of tokens that survive.

    Parameters
    ----------
    doc : pandas.Series
        Raw tweet strings (the caller passes ``train_tweets['tweet']``).

    Returns
    -------
    None
        The results are written to the module-level ``train_tweets`` frame.
    """
    def tokenise(raw_text):
        # Keras tokenisation, flattened back into a space-separated string.
        return ' '.join(text_to_word_sequence(raw_text))

    def keep_informative(text):
        # One pass applying the stop-word, length and alphabetic filters together.
        kept = [
            token for token in text.split()
            if token not in stop_words and len(token) > 1 and token.isalpha()
        ]
        return ' '.join(kept)

    def count_tokens(text):
        return len(text.split())

    train_tweets['clean_text'] = doc.apply(tokenise)
    train_tweets['clean_text'] = train_tweets['clean_text'].apply(keep_informative)
    train_tweets['review_length'] = train_tweets['clean_text'].apply(count_tokens)

# Adds the 'clean_text' and 'review_length' columns to train_tweets in place.
train_tweets_cleaning(train_tweets['tweet'])
train_tweets.head()

Unnamed: 0,id,label,tweet,clean_text,review_length
19654,19655,0,â #cny expected to remain under pressure â...,cny expected remain pressure rabobank blog sil...,9
2035,2036,0,i am terrific. #i_am #positive #affirmation,terrific positive affirmation,3
21334,21335,0,why do i fear a post-apocalyptic world? find o...,fear post apocalyptic world find feeling emoti...,12
13735,13736,0,@user is wishing all you proud fathers out the...,user wishing proud fathers fathers day,6
23423,23424,1,@user a #nation should not be #judged by how i...,user nation judged treats highest citizens low...,10


In [6]:
# Model input is the cleaned tweet text; the target is the `label` column.
X_independent = train_tweets['clean_text']
Y_target = train_tweets['label']

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; random_state pins the split itself.
X_train, X_test, y_train, y_test = train_test_split(X_independent, Y_target, test_size=0.33, random_state=42)

In [8]:
# Longest cleaned tweet, in tokens; reused as the padding width and the model's
# input length. Cast to a plain int so downstream code receives a Python integer.
length = int(train_tweets['review_length'].max())
print(length)

37


In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def create_tokenizer(X_train_lines):
    """Return a Keras ``Tokenizer`` fitted on the given texts.

    Parameters
    ----------
    X_train_lines : iterable of str
        Texts from which the word index is built.

    Returns
    -------
    Tokenizer
        A default-configured tokenizer after ``fit_on_texts(X_train_lines)``.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(X_train_lines)
    return fitted
# encode a list of lines
def encode_text(tokenizer,X_train_lines,length):
    """Convert texts into a post-padded matrix of integer word indices.

    Parameters
    ----------
    tokenizer : Tokenizer
        An already fitted tokenizer; its ``texts_to_sequences`` does the encoding.
    X_train_lines : iterable of str
        Texts to encode.
    length : int
        Passed to ``pad_sequences`` as ``maxlen``; padding is added at the end
        of each sequence (``padding = 'post'``).

    Returns
    -------
    The array produced by ``pad_sequences``, one row per input text.
    """
    sequences = tokenizer.texts_to_sequences(X_train_lines)
    return pad_sequences(sequences, maxlen = length, padding = 'post')

# The vocabulary is fitted on the training split only; that split is then encoded
# as post-padded integer sequences of width `length`.
tokenizer = create_tokenizer(X_train)
X_train_padded = encode_text(tokenizer,X_train,length)
print(X_train_padded)

[[ 570 1091 1524 ...    0    0    0]
 [4397    1    4 ...    0    0    0]
 [   1  258  643 ...    0    0    0]
 ...
 [   1  515   46 ...    0    0    0]
 [  40   20 1733 ...    0    0    0]
 [   1   21  352 ...    0    0    0]]


In [11]:
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

def _build_channel(length, vocab_size, kernel_size, embedding_dim, filters, dropout_rate):
    """Build one Embedding -> Conv1D -> Dropout -> MaxPooling1D -> Flatten branch.

    Returns a ``(input_tensor, flattened_output_tensor)`` pair for the branch.
    """
    channel_input = Input(shape = (length,))
    embedded = Embedding(vocab_size, embedding_dim)(channel_input)
    convolved = Conv1D(filters = filters, kernel_size = kernel_size, activation = 'relu')(embedded)
    dropped = Dropout(dropout_rate)(convolved)
    pooled = MaxPooling1D(pool_size = 2)(dropped)
    return channel_input, Flatten()(pooled)


# define the model
def define_model(length, vocab_size, kernel_sizes = (4, 4, 4), embedding_dim = 100,
                 filters = 32, dropout_rate = 0.5):
    """Build, compile and summarise a multi-channel 1D-CNN binary text classifier.

    One convolutional branch is created per entry in ``kernel_sizes``. The
    flattened branch outputs are concatenated and passed through a 10-unit ReLU
    layer and a single sigmoid unit.

    The defaults reproduce the original configuration exactly: three channels,
    all with kernel size 4, 100-dimensional embeddings, 32 filters and 0.5
    dropout. Note that identical kernel sizes make the channels structurally
    the same; pass distinct sizes (for example ``(4, 6, 8)``) if each channel
    should convolve over a different window width.

    Parameters
    ----------
    length : int
        Number of tokens in every padded input sequence.
    vocab_size : int
        Number of rows in each embedding table.
    kernel_sizes : sequence of int, optional
        Convolution window size of each channel; its length sets the number
        of channels and therefore the number of model inputs.
    embedding_dim : int, optional
        Width of each embedding vector.
    filters : int, optional
        Number of convolution filters per channel.
    dropout_rate : float, optional
        Dropout rate applied after each convolution.

    Returns
    -------
    Model
        Compiled with binary cross-entropy, Adam and an accuracy metric. It
        takes a list of ``len(kernel_sizes)`` inputs, each of width ``length``.

    Raises
    ------
    ValueError
        If ``kernel_sizes`` is empty.
    """
    kernel_sizes = tuple(kernel_sizes)
    if not kernel_sizes:
        raise ValueError('kernel_sizes must contain at least one kernel size')

    # Channels are built one after another so layers are created in the same
    # order as the hand-written channel 1 / 2 / 3 version.
    channels = [
        _build_channel(length, vocab_size, kernel_size, embedding_dim, filters, dropout_rate)
        for kernel_size in kernel_sizes
    ]
    channel_inputs = [channel_input for channel_input, _ in channels]
    flattened = [flat for _, flat in channels]

    # merge (a single channel has nothing to concatenate with)
    merged = concatenate(flattened) if len(flattened) > 1 else flattened[0]
    # interpretation
    dense1 = Dense(10,activation = 'relu')(merged)
    outputs = Dense(1, activation = 'sigmoid')(dense1)
    model = Model(inputs = channel_inputs, outputs = outputs)
    # compile
    model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics = ['accuracy'])
    # summarize
    model.summary()
    return model

# One embedding row per indexed word plus one extra: the padded matrices use 0 as
# the fill value, so index 0 must also be a valid embedding row.
# NOTE(review): assumes word_index values start at 1 (Keras Tokenizer) -- confirm.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# define model
model = define_model(length,vocab_size)

# fit model
# The same padded matrix is supplied to each of the model's three inputs.
model.fit([X_train_padded,X_train_padded,X_train_padded],y_train, epochs = 5, verbose = 2)

Vocabulary Size: 27780






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 37)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 37)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 37)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 37, 100)      2778000     input_1[0][0]                    
____________________________________________________________________________________________




Epoch 1/5
 - 96s - loss: 0.1835 - accuracy: 0.9398
Epoch 2/5
 - 96s - loss: 0.0803 - accuracy: 0.9788
Epoch 3/5
 - 94s - loss: 0.0462 - accuracy: 0.9910
Epoch 4/5
 - 93s - loss: 0.0294 - accuracy: 0.9945
Epoch 5/5
 - 93s - loss: 0.0124 - accuracy: 0.9964


<keras.callbacks.callbacks.History at 0x1c253bdc888>

In [12]:
# Evaluate the model on both the training split and the held-out split.
# The held-out texts are encoded with the tokenizer fitted on X_train and padded
# to the same width used for training.
X_test_padded = encode_text(tokenizer,X_test,length)

# model.evaluate returns (loss, accuracy) here; the loss is discarded.
_,acc = model.evaluate([X_train_padded,X_train_padded,X_train_padded],y_train,verbose = 0)
print('Train Accuracy: %.2f' %(acc*100))
_,acc = model.evaluate([X_test_padded,X_test_padded,X_test_padded],y_test,verbose = 0)
print('Test Accuracy: %.2f' %(acc*100))

Train Accuracy: 99.91
Test Accuracy: 96.00


# Predictions On Test Data

In [13]:
# Tweets to score for the submission; this file has no `label` column.
test_tweets = pd.read_csv('test_tweets.csv')
test_tweets.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [14]:
def test_tweets_cleaning(doc_test_data):
    """Derive the ``clean_text`` and ``review_length`` columns of ``test_tweets``.

    Applies the same steps as the training-set cleaning: tokenise each entry with
    ``text_to_word_sequence``, re-join with single spaces, then drop tokens that
    are in ``stop_words``, are one character long, or are not purely alphabetic.
    ``review_length`` is the count of remaining tokens.

    Parameters
    ----------
    doc_test_data : pandas.Series
        Raw tweet strings (the caller passes ``test_tweets['tweet']``).

    Returns
    -------
    None
        The results are written to the module-level ``test_tweets`` frame.
    """
    def tokenise(raw_text):
        # Keras tokenisation, flattened back into a space-separated string.
        return ' '.join(text_to_word_sequence(raw_text))

    def keep_informative(text):
        # One pass applying the stop-word, length and alphabetic filters together.
        kept = [
            token for token in text.split()
            if token not in stop_words and len(token) > 1 and token.isalpha()
        ]
        return ' '.join(kept)

    def count_tokens(text):
        return len(text.split())

    test_tweets['clean_text'] = doc_test_data.apply(tokenise)
    test_tweets['clean_text'] = test_tweets['clean_text'].apply(keep_informative)
    test_tweets['review_length'] = test_tweets['clean_text'].apply(count_tokens)

# Adds the 'clean_text' and 'review_length' columns to test_tweets in place.
test_tweets_cleaning(test_tweets['tweet'])
test_tweets.head()

Unnamed: 0,id,tweet,clean_text,review_length
0,31963,#studiolife #aislife #requires #passion #dedic...,studiolife aislife requires passion dedication...,7
1,31964,@user #white #supremacists want everyone to s...,user white supremacists want everyone see new ...,8
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways heal acne altwaystoheal healthy healing,7
3,31966,is the hp and the cursed child book up for res...,hp cursed child book reservations already yes ...,10
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",bihday amazing hilarious nephew eli ahmir uncl...,9


In [15]:
# Reuse the tokenizer fitted on the training split and the training pad width so
# the encoding matches what the model was trained on.
test_padded = encode_text(tokenizer,test_tweets['clean_text'],length)
print(test_padded)

[[21528  1345 10152 ...     0     0     0]
 [    1   157 18963 ...     0     0     0]
 [  567   723   606 ...     0     0     0]
 ...
 [  658     9 10373 ...     0     0     0]
 [    5    22   319 ...     0     0     0]
 [  337   729   126 ...     0     0     0]]


In [16]:
# One sigmoid score per test tweet; the same matrix feeds all three model inputs.
yhat = model.predict([test_padded,test_padded,test_padded],verbose = 0)
# Inspect the raw score of the first tweet. Despite the variable name this is the
# sigmoid output in [0, 1], not a percentage, and no label is derived here.
percent_pos = yhat[0,0]
print(percent_pos)

0.006575942


In [20]:
# Display the raw prediction array (one score per row).
yhat

array([[6.5759420e-03],
       [9.4162893e-01],
       [1.1920929e-07],
       ...,
       [5.2416325e-01],
       [3.2782555e-07],
       [2.2216837e-04]], dtype=float32)

In [32]:
# Turn each sigmoid score into a 0.0 / 1.0 label by rounding the first (only)
# output column in a single vectorised call rather than a per-row Python loop.
# np.round rounds halves to the nearest even value, so a score of exactly 0.5
# becomes 0 -- the same result the element-wise np.round gave.
labels = np.round(yhat[:, 0]).tolist()

In [34]:
# Attach the rounded predictions, then cast them from float to integer labels.
test_tweets['label'] = labels
test_tweets['label'] = test_tweets['label'].astype('int')
test_tweets.head()

Unnamed: 0,id,tweet,clean_text,review_length,label
0,31963,#studiolife #aislife #requires #passion #dedic...,studiolife aislife requires passion dedication...,7,0
1,31964,@user #white #supremacists want everyone to s...,user white supremacists want everyone see new ...,8,1
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways heal acne altwaystoheal healthy healing,7,0
3,31966,is the hp and the cursed child book up for res...,hp cursed child book reservations already yes ...,10,0
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",bihday amazing hilarious nephew eli ahmir uncl...,9,0


In [35]:
# Keep only the two columns written to the submission file.
sample_submission = test_tweets[['id','label']]
sample_submission.head()

Unnamed: 0,id,label
0,31963,0
1,31964,1
2,31965,0
3,31966,0
4,31967,0


In [37]:
# index = False keeps the DataFrame index out of the CSV, leaving just id,label.
sample_submission.to_csv('submission.csv',index = False)