In [1]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, GRU
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from nltk.corpus import words, stopwords
from keras.preprocessing.text import Tokenizer
import numpy as np
import csv
import string
import re

import time
from contextlib import contextmanager
import gc
gc.collect()

@contextmanager
def timer(name):
	t0 = time.time()
	yield
	print("\n\n" + name + ' done in ' + str(round(time.time() - t0)) + 's \n')

print("\n\nStarting\n\n")
cachedStopWords = stopwords.words("english")
allEnglishWords = words.words()
allEnglishWords[:] = [x.lower() for x in allEnglishWords]
vocabSize = len(allEnglishWords)
tokenizer = Tokenizer(num_words=vocabSize)
tokenised = tokenizer.fit_on_texts(allEnglishWords)

def createPredictorModel():
	vocabSize = len(allEnglishWords)
	tokenizer = Tokenizer(num_words= vocabSize)
	tokenised = tokenizer.fit_on_texts(allEnglishWords)
	model = Sequential()
	
	with timer("Making embedding index dict"):
		embeddings_index = dict()
		f = open('glove.twitter.27B/glove.twitter.27B.100d.txt', encoding="utf8")
		for line in f:
			values = line.split()
			word = values[0]
			coefs = np.asarray(values[1:], dtype='float32')
			embeddings_index[word] = coefs
		f.close()
		print('Loaded %s word vectors.' % len(embeddings_index))


	with timer("Making Embedding matrix"):
		embedding_matrix = np.zeros((vocabSize, 100))
		for word, index in tokenizer.word_index.items():
			if index > vocabSize - 1:
				break
			else:
				embedding_vector = embeddings_index.get(word)
				if embedding_vector is not None:
					embedding_matrix[index] = embedding_vector

	with timer("Creating predictor model"): 
		model.add(Embedding(vocabSize, 100, input_length=180, weights=[embedding_matrix]))
		model.add(Dropout(0.2))
		model.add(Conv1D(64, 5, activation='relu'))
		model.add(MaxPooling1D(pool_size=4))
		model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
		model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
		model.add(LSTM(16, dropout=0.2, recurrent_dropout=0.2))
		model.add(Dense(3, activation='sigmoid'))
		model.compile(loss='categorical_crossentropy', optimizer='adam',    metrics=['accuracy'])

	return model

model = createPredictorModel()
print(model)


def clean(s):
	transalator = str.maketrans("", "", string.punctuation)
	return s.translate(transalator)


def preprocess(text):
	text = text.split(",")[-1]
	text = clean(text).lower()
	text = text.lower()
	text = ' '.join([word for word in text.split()
									if word not in cachedStopWords])
	text = ' '.join([word for word in text.split() if(not word.startswith(
		"@") and not word.startswith("http") and not word.startswith("\\"))])
	text = ' '.join([word for word in text.split()
									if word in allEnglishWords])
	#text =  re.sub("[_]","",text)
	#remove tags
	text = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", text)
	# remove special characters and digits
	text = re.sub("(\\d|\\W)+", " ", text)
	if(text.startswith("rt ") or text.startswith(" rt")):
		text = text[3:]
	if(text == "rt"):
		text = ""
	while(text != "" and text[0] == ' '):
		text = text[1:]
	return text


with timer("Reading data"):
		x = []
		y = []
		radical = []
		radicalOne = 0
		radicalZero = 0
		radicalTwo = 0
		with open("input.csv", 'r', encoding="utf8") as csvFile:
			reader = csv.reader(csvFile)
			p = 0
			for row in reader:
				if(p == 0):
					p = p + 1
					continue
				if(len(row) >= 2):
					s = row[0]
					x.append(preprocess(s))
					if(row[2] != '0.0'):
						# radicalOne += 1
						if(row[2] != '1.0' and row[2] != '2.0'):
							print("Chutiya annotator tha : ", row[2], " row : ", p)
							# radicalOne -= 1
					s = 0
					if(row[2] == '1.0'):
						radicalOne += 1
						s = 1
					if(row[2] == '2.0'):
						radicalTwo += 1
						s = 2
					if(s == 0):
						radicalZero += 1
					radical.append(s)
				p = p + 1
		csvFile.close

X = []
for t in x:
		t = re.sub(r'[^\w\s]', ' ', t)
		t = ' '.join([word for word in t.split() if word != " "])
		t = t.lower()
		t = ' '.join([word for word in t.split()
									if word not in cachedStopWords])
		X.append(t)

tokenisedTest = tokenizer.texts_to_sequences(X)
X_Test = sequence.pad_sequences(
	tokenisedTest, maxlen=180, padding='post')

with timer("Making label vector"):
		Y = np.zeros([len(radical), 3], dtype=int)
		for x in range(0, len(radical)):
				Y[x, radical[x]] = 1

radical = np.array(Y)
with timer('Fitting the model'):
	epochs = 1
	print("epochs : ", epochs)
	fitHistory = model.fit(
		X_Test, radical, epochs=1, batch_size=32)
	trainingAccuracy = fitHistory.history['acc']
	while(trainingAccuracy[0] < 0.99 and epochs < 100):
		epochs += 1
		print("epochs : ", epochs)
		fitHistory = model.fit(
			X_Test, radical, epochs=1, batch_size=32)
		trainingAccuracy = fitHistory.history['acc']


with timer("Reading unlabelled input"):
	unlabelledTweets = []
	with open('input2.csv', 'r', encoding = 'utf8') as csvFile:
		reader = csv.reader(csvFile)
		for row in reader:
			unlabelledTweets.append(preprocess(row[0]))

with timer("Predicting"):
	tokenisedForPredicting = tokenizer.texts_to_sequences(unlabelledTweets)
	X_prediction = sequence.pad_sequences(tokenisedForPredicting, maxlen = 180, padding = 'post')
	predScores = model.predict(X_prediction, verbose = 1)

# zeroes = 0
# ones = 0
# twos = 0

# for x in predScores:
# 	if(x == 0):
# 		zeroes += 1
# 	elif(x == 1):
# 		ones += 1
# 	else:
# 		twos += 1

# print("Total unlabelled tweets : ", len(predScores), '\nzeroes : ', zeroes, "\nones : ", ones, "\ntwos : ", twos)


print(predScores)

print("\n\nWeights are :\n\n")
print(model.get_weights())

Using TensorFlow backend.




Starting


Loaded 1193514 word vectors.


Making embedding index dict done in 155s 



Making Embedding matrix done in 0s 



Creating predictor model done in 201s 

<keras.engine.sequential.Sequential object at 0x00000204322F5358>
Chutiya annotator tha :    row :  136
Chutiya annotator tha :    row :  155
Chutiya annotator tha :    row :  159
Chutiya annotator tha :    row :  322
Chutiya annotator tha :    row :  445
Chutiya annotator tha :    row :  471
Chutiya annotator tha :    row :  483
Chutiya annotator tha :    row :  942


Reading data done in 65s 



Making label vector done in 0s 

epochs :  1
Epoch 1/1
epochs :  2
Epoch 1/1
epochs :  3
Epoch 1/1
epochs :  4
Epoch 1/1
epochs :  5
Epoch 1/1
epochs :  6
Epoch 1/1
epochs :  7
Epoch 1/1
epochs :  8
Epoch 1/1
epochs :  9
Epoch 1/1
epochs :  10
Epoch 1/1
epochs :  11
Epoch 1/1
epochs :  12
Epoch 1/1
epochs :  13
Epoch 1/1
epochs :  14
Epoch 1/1
epochs :  15
Epoch 1/1
epochs :  16
Epoch 1/1
epochs :  17
Epoch 1/1
epochs :  18
Epo

epochs :  70
Epoch 1/1
epochs :  71
Epoch 1/1
epochs :  72
Epoch 1/1
epochs :  73
Epoch 1/1
epochs :  74
Epoch 1/1
epochs :  75
Epoch 1/1
epochs :  76
Epoch 1/1
epochs :  77
Epoch 1/1
epochs :  78
Epoch 1/1
epochs :  79
Epoch 1/1
epochs :  80
Epoch 1/1
epochs :  81
Epoch 1/1
epochs :  82
Epoch 1/1
epochs :  83
Epoch 1/1
epochs :  84
Epoch 1/1
epochs :  85
Epoch 1/1
epochs :  86
Epoch 1/1
epochs :  87
Epoch 1/1
epochs :  88
Epoch 1/1
epochs :  89
Epoch 1/1
epochs :  90
Epoch 1/1
epochs :  91
Epoch 1/1
epochs :  92
Epoch 1/1
epochs :  93
Epoch 1/1
epochs :  94
Epoch 1/1
epochs :  95
Epoch 1/1
epochs :  96
Epoch 1/1
epochs :  97
Epoch 1/1
epochs :  98
Epoch 1/1
epochs :  99
Epoch 1/1
epochs :  100
Epoch 1/1


Fitting the model done in 2593s 



Reading unlabelled input done in 2060s 



Predicting done in 936s 

[[0.20029399 0.90274733 0.07877063]
 [0.20029406 0.90274715 0.07877064]
 [0.20029399 0.90274733 0.07877063]
 ...
 [0.20029402 0.90274745 0.07877063]
 [0.20029399 0.90274733 0.0787

In [2]:

print(type(model.get_weights()))

<class 'list'>


In [None]:
import numpy
test = numpy.asarray(model.get_weights())
print(type(test))
print(test.shape)