<a href="https://colab.research.google.com/github/ayush-09/Convolutional-Neural-Network/blob/master/Text_to_array.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import math
import re
from google.colab import drive
import time
from bs4 import BeautifulSoup 

In [None]:
try:
  %tensorflow_version 2.x
except:
  pass
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tds


In [None]:
# Attach Google Drive so the CSV files under "My Drive" can be read below.
drive.mount(mountpoint="/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# The CSV files have no header row, so the column names are supplied here.
cols = ["sentiments", "id", "date", "query", "user", "text"]

# Options shared by both reads.
csv_options = dict(header=None, names=cols, engine="python", encoding="latin1")

train_data = pd.read_csv("/content/drive/My Drive/CNN_NLP/data/train.csv", **csv_options)
# NOTE(review): test_data is not referenced again in the visible cells; the
# evaluation split is carved out of train_data further down.
test_data = pd.read_csv("/content/drive/My Drive/CNN_NLP/data/test.csv", **csv_options)

In [None]:
# Peek at the first rows of the raw frame.
train_data.head(n=5)

Unnamed: 0,sentiments,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Keep only the label and the tweet text; the other columns are not used by
# any later cell.
# Rebinding (instead of inplace=True) together with errors="ignore" makes this
# cell safe to re-run: the in-place version raised KeyError on a second
# execution because the columns were already gone.
train_data = train_data.drop(columns=["id", "user", "query", "date"], errors="ignore")

In [None]:
# Peek at the first rows again to confirm which columns remain.
train_data.head(n=5)

Unnamed: 0,sentiments,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [None]:
def clean_tweet(tweet):
  """Normalise one raw tweet into plain text ready for tokenisation.

  Steps, in order:
    1. parse the text with BeautifulSoup (lxml) and keep only its text,
    2. replace @mentions with a space,
    3. replace http(s) URLs with a space,
    4. replace every character that is not an ASCII letter or one of
       . ! ? ' with a space,
    5. collapse runs of spaces into a single space.

  Args:
    tweet: raw tweet text (str).

  Returns:
    The cleaned string. A single leading/trailing space may remain.
  """
  tweet = BeautifulSoup(tweet, "lxml").get_text()
  # Handles may contain underscores; without "_" in the class, "@foo_bar"
  # was only partly removed and left "_bar" behind.
  tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
  tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
  # The range must be A-Z: the previous A-z also spans [ \ ] ^ _ ` in ASCII,
  # so those punctuation characters were never stripped.
  tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
  tweet = re.sub(r" +", " ", tweet)
  return tweet

In [None]:
# Run the cleaner over every tweet, keeping the original row order.
data_clean = list(map(clean_tweet, train_data.text))

In [None]:
# Work on an explicit copy of the label column: writing through `.values`
# would silently modify train_data as well (or fail on a read-only view when
# pandas copy-on-write is active).
data_labels = train_data.sentiments.to_numpy(copy=True)
# Remap label value 4 to 1 so the labels become {0, 1}.
data_labels[data_labels == 4] = 1

In [None]:
# Sanity check: show the distinct label values that remain.
set(np.unique(data_labels))

{0, 1}

In [None]:
# Newer tensorflow_datasets releases expose the subword encoder under
# `deprecated.text`; older ones only have `features.text`. Try the new
# location first and fall back, so the cell runs on either.
try:
  text_encoders = tds.deprecated.text
except AttributeError:
  text_encoders = tds.features.text

# Build a subword vocabulary (target size 2**16) from the cleaned tweets.
tokenizer = text_encoders.SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size=2**16
)

# Encode every cleaned tweet as a list of integer subword ids.
data_input = [tokenizer.encode(sentence) for sentence in data_clean]

In [None]:
# Longest encoded tweet; every sequence is padded up to this length.
MAX_LEN = max(map(len, data_input))

# Append zeros after each sequence so all rows have MAX_LEN entries.
data_input = tf.keras.preprocessing.sequence.pad_sequences(
    data_input,
    maxlen=MAX_LEN,
    padding="post",
    value=0,
)


In [None]:
# Pick 8000 row positions in [0, 800000) and mirror them into
# [800000, 1600000), giving 16000 held-out rows in total.
# NOTE(review): this assumes the data holds 1.6M rows with one class in each
# half - confirm against the CSV.
HALF_SIZE = 800000
TEST_PER_HALF = 8000

# Sample WITHOUT replacement: randint() could return the same index several
# times, which duplicated test rows and removed fewer rows than intended from
# the training set. A fixed seed on a private generator keeps the split
# reproducible across kernel restarts without touching the global numpy RNG.
_split_rng = np.random.RandomState(42)
test_idx = _split_rng.choice(HALF_SIZE, size=TEST_PER_HALF, replace=False)
test_idx = np.concatenate((test_idx, test_idx + HALF_SIZE))

In [None]:
# Held-out rows: gather inputs and labels at the sampled positions.
test_inputs, test_labels = data_input[test_idx], data_labels[test_idx]

# Training rows: everything that was not selected for the test set.
train_input, train_labels = (
    np.delete(array, test_idx, axis=0) for array in (data_input, data_labels)
)

In [None]:
class DCNN(tf.keras.Model):
  """Text classifier built from three parallel 1-D convolutions.

  The token ids are embedded, passed through Conv1D branches with kernel
  sizes 2, 3 and 4, each branch is global-max-pooled, and the pooled vectors
  are concatenated and fed to a dense layer, dropout and an output layer.

  Args:
    vocab_size: number of rows of the embedding table.
    emb_dim: embedding dimension.
    nb_filters: number of filters in each convolution branch.
    FFN_units: width of the hidden dense layer.
    nb_classes: 2 gives a single sigmoid unit, anything else gives a softmax
      over nb_classes units.
    dropout_rate: rate of the dropout applied after the hidden dense layer.
    training: accepted for backward compatibility; not used by the
      constructor.
    name: Keras model name.
  """
  def __init__(self,vocab_size,emb_dim=130,nb_filters=50,FFN_units=512,nb_classes=2,dropout_rate=0.1,training=False,name="dcnn"):
    super(DCNN,self).__init__(name=name)
    self.embedding=layers.Embedding(vocab_size,emb_dim)
    # One convolution + pooling pair per n-gram width.
    self.bigram=layers.Conv1D(filters=nb_filters,kernel_size=2,padding="valid",activation="relu")
    self.pool_1=layers.GlobalMaxPool1D()
    self.trigram=layers.Conv1D(filters=nb_filters,kernel_size=3,padding="valid",activation="relu")
    self.pool_2=layers.GlobalMaxPool1D()
    self.fourgram=layers.Conv1D(filters=nb_filters,kernel_size=4,padding="valid",activation="relu")
    self.pool_3=layers.GlobalMaxPool1D()
    self.dense1=layers.Dense(units=FFN_units,activation="relu")
    self.dropout=layers.Dropout(rate=dropout_rate)
    if nb_classes==2:
      self.last_dense=layers.Dense(units=1,activation="sigmoid")
    else:
      self.last_dense=layers.Dense(units=nb_classes,activation="softmax")

  def call(self,inputs,training=None):
    """Run the forward pass.

    Args:
      inputs: batch of token-id sequences fed to the embedding layer.
      training: forwarded to the dropout layer. Defaults to None so the
        model can also be called without the argument.

    Returns:
      The output of the final dense layer.
    """
    x=self.embedding(inputs)
    # Each branch uses its own pooling layer. Previously pool_1 was reused
    # for all three and pool_2 / pool_3 were never called; the pooling
    # layers hold no weights, so the outputs are unchanged.
    x_1=self.bigram(x)
    x_1=self.pool_1(x_1)
    x_2=self.trigram(x)
    x_2=self.pool_2(x_2)
    x_3=self.fourgram(x)
    x_3=self.pool_3(x_3)

    merged=tf.concat([x_1,x_2,x_3],axis=-1) # (batch_size, 3*nb_filters)
    merged=self.dense1(merged)
    merged=self.dropout(merged,training=training)
    output=self.last_dense(merged)
    return output




In [None]:
# Values derived from the data prepared above.
VOCAB_SIZE = tokenizer.vocab_size
NB_CLASSES = len(np.unique(train_labels))

# Model hyper-parameters.
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
DROPOUT_RATE = 0.2

# Training hyper-parameters.
BATCH_SIZE = 32
NB_EPOCHS = 5

In [None]:
# Instantiate the network with the hyper-parameters chosen above.
Dcnn = DCNN(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

In [None]:
# Choose loss and metric to match the output layer: a single sigmoid unit for
# two classes, a softmax over NB_CLASSES units otherwise.
if NB_CLASSES == 2:
  Dcnn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
else:
  # `metrics` is passed as a list here too, for consistency with the binary
  # branch and with the documented Keras signature.
  Dcnn.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["sparse_categorical_accuracy"])


In [None]:
# NOTE(review): this folder is spelled "CNN_NPL" while the data is read from
# "CNN_NLP", and the path is relative to the working directory - confirm both
# are intended before changing, since existing checkpoints live here.
checkpoint_path = "./drive/My Drive/CNN_NPL/ckpt/"

# Track the model in a checkpoint and keep only the most recent save.
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

# Resume from the newest checkpoint when one exists.
latest_checkpoint = ckpt_manager.latest_checkpoint
if latest_checkpoint:
  ckpt.restore(latest_checkpoint)
  print("Latest checkpoint restored!!")

In [None]:
# Train on the training split, then persist the weights through the manager.
Dcnn.fit(
    x=train_input,
    y=train_labels,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCHS,
)
ckpt_manager.save()

In [None]:
# Score the held-out rows; `results` holds the compiled loss followed by the
# compiled metric.
results = Dcnn.evaluate(
    x=test_inputs,
    y=test_labels,
    batch_size=BATCH_SIZE,
)
print(results)

[0.3843754529953003, 0.8284375071525574]


In [None]:
# Encode one hand-written sentence, wrap it as a batch of one, and run the
# model in inference mode.
sample_ids = tokenizer.encode("i love you so much")
sample_batch = np.array([sample_ids])
Dcnn(sample_batch, training=False).numpy()

array([[0.914371]], dtype=float32)