In [None]:
# Replace whatever Keras / TensorFlow the image ships with by the exact
# versions pinned below (Keras 2.2.4 on TensorFlow 1.14.0).
!pip uninstall keras -y
!pip install keras==2.2.4

!pip uninstall tensorflow -y
!pip install tensorflow==1.14.0

# keras-contrib supplies the CRF layer imported further down. It is installed
# straight from the repository's default branch, i.e. NOT pinned to a commit.
# NOTE(review): consider pinning a known-good commit for reproducibility.
!pip install git+https://www.github.com/keras-team/keras-contrib.git

In [None]:
import numpy as np
import pandas as pd
import os

import nltk
from nltk.tokenize import WordPunctTokenizer
nltk.download('punkt')

from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the annotated corpus and forward-fill every missing cell.
# (Presumably only the first row of each sentence carries its "Sentence #"
# label -- verify against the CSV.)
df_1 = pd.read_csv(
    "/kaggle/input/entity-annotated-corpus/ner_dataset.csv",
    encoding="latin1",
).ffill()

# Keep only the integer that follows ": " in each sentence label.
df_1['Sentence #'] = df_1['Sentence #'].map(lambda label: label.split(': ')[1]).astype(int)

# Only the sentence id and the raw word are needed from here on.
df_1 = df_1.drop(columns=['POS', 'Tag'])
df_1.columns = ['Sentence', 'Word']

df_1.head()

In [None]:
def annotater(arg):
    """Classify the casing pattern of a word.

    Only ASCII letters count as letters; every other character (digits,
    punctuation, accented letters, ...) is treated as "miscellaneous".

    Returns one of:
        's' -- every character is a lowercase ASCII letter (also for '')
        'c' -- every character is an uppercase ASCII letter
        'f' -- first character uppercase and the rest lowercase, or any
               other all-letter mix of cases
        'm' -- at least one non-letter character and none of the above
    """
    total = len(arg)
    upper_count = sum(1 for ch in arg if 'A' <= ch <= 'Z')
    lower_count = sum(1 for ch in arg if 'a' <= ch <= 'z')
    other_count = total - upper_count - lower_count
    starts_upper = total > 0 and 'A' <= arg[0] <= 'Z'

    if lower_count == total:
        return 's'
    if upper_count == total:
        return 'c'
    if starts_upper and lower_count == total - 1:
        return 'f'
    # Mixed input: anything containing a non-letter is 'm'; a pure-letter
    # mix of cases (e.g. "McDonald") falls back to 'f'.
    return 'm' if other_count > 0 else 'f'
    
# Label every corpus word with its casing class before the text is lowercased.
df_1['Target'] = df_1['Word'].map(annotater)
df_1.head()

In [None]:
def reader(filename):
    """Return the full contents of a text file as one string.

    The file is opened in text mode with the platform default encoding and
    is closed before returning, even if reading fails.

    Args:
        filename: path of the file to read.

    Returns:
        The file's text, newlines included.
    """
    with open(filename) as file:
        return file.read()

# Extra plain-text corpus; it is cut into rough "sentences" at every full stop.
english_path = "/kaggle/input/english/english.txt"
english = reader(english_path)
english_sentences = english.split('.')

In [None]:
def tokenizer(arg):
    """Tokenize one sentence fragment with a full stop appended to it.

    The '.' is added before tokenizing, so the returned token list always
    reflects ``arg + '.'`` rather than ``arg`` itself.
    """
    return WordPunctTokenizer().tokenize(arg + '.')

In [None]:
# Flatten the extra corpus into two parallel lists:
#   sentences[k] -> k-th token, numbers[k] -> id of the sentence it belongs to.
sentences = []
numbers = []

# Continue numbering right after the last sentence id already used in df_1
# instead of relying on a hard-coded offset (the original used 47960), so the
# ids stay contiguous and collision-free if the base corpus changes.
first_english_id = int(df_1['Sentence'].max()) + 1

for offset, segment in enumerate(english_sentences):
    tokens = tokenizer(segment)
    sentences.extend(tokens)
    numbers.extend([first_english_id + offset] * len(tokens))

In [None]:
# Frame for the extra corpus, with the same three columns as df_1.
df2 = (
    pd.DataFrame(numbers, columns=['Sentence'])
    .assign(Word=sentences)
    .assign(Target=lambda frame: frame['Word'].map(annotater))
)

# Stack both corpora. The casing labels were computed above, so the words
# themselves can now be lowercased to form the model input.
df = pd.concat([df_1, df2], axis=0)
df['Word'] = df['Word'].str.lower()
df.head()

In [None]:
# Vocabulary: ids 0 and 1 are reserved for padding and unknown words, every
# distinct (lowercased) word gets the next free id starting at 2.
word2idx = {word: index for index, word in enumerate(df['Word'].unique(), start=2)}
word2idx["PAD"] = 0
word2idx["UNK"] = 1

# Reverse lookup: id -> word.
idx2word = {index: word for word, index in word2idx.items()}

In [None]:
# Casing tag -> class id. "P" (id 0) is the padding class.
text2tag = {
    "P": 0,
    'f': 1,
    'c': 2,
    's': 3,
    'm': 4,
}

# Class id -> casing tag (exact inverse of text2tag).
tag2text = {index: label for label, index in text2tag.items()}

In [None]:
def encode(arg):
    """Map a casing tag to its integer class id.

    'f' -> 1, 'c' -> 2, 's' -> 3, 'm' -> 4. Any other value (including the
    padding tag 'P') yields None.

    NOTE: a second function named ``encode`` is defined later in this
    notebook (text -> word ids) and replaces this one once its cell has run.
    """
    tag_ids = {'f': 1, 'c': 2, 's': 3, 'm': 4}
    return tag_ids.get(arg)

# Swap the casing letters for their integer class ids.
# Not idempotent: running this twice feeds integers to encode(), which maps
# anything that is not one of the four letters to None.
df['Target'] = df['Target'].map(encode)

In [None]:
def get_next(n):
    """Return ``(words, targets)`` arrays for the sentence with id ``n``.

    Performs a boolean scan over the whole of ``df``; fine for looking at a
    single sentence, too slow to call once per sentence (see the loop below).
    """
    values = df[df['Sentence']==n][['Word','Target']].values
    return values[:,0],values[:,1]

X = []
y = []

# Build one id sequence (X) and one tag sequence (y) per sentence.
# A single groupby pass replaces the per-sentence full-frame scan: groups come
# out in ascending sentence id and rows keep their original order inside each
# group. Sentence ids that do not occur in df are simply not visited.
for sentence_id, rows in tqdm(df.groupby('Sentence', sort=True)):
    # Sentence 22480 was explicitly excluded in the original loop; the reason
    # is not recorded in this notebook, so the exclusion is kept as-is.
    if sentence_id == 22480:
        continue
    X.append(np.array([word2idx[word] for word in rows['Word'].values]))
    y.append(np.array(rows['Target'].values))

In [None]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

In [None]:
# Hyper-parameters
MAX_LEN = 80        # tokens per (padded / truncated) sentence
EMBEDDING = 300     # embedding width
BATCH_SIZE = 32
EPOCHS = 5
n_words = len(word2idx)

# Bring every sequence to MAX_LEN, filling at the end with 0
# (the PAD word id for X, the "P" tag id for y).
X = pad_sequences(X, maxlen=MAX_LEN, padding="post", value=0)
y = pad_sequences(y, maxlen=MAX_LEN, padding="post", value=0)

# One-hot encode the tags over the 5 classes (4 casing tags + padding).
# Not re-runnable: a second run would one-hot encode already one-hot rows.
y = [to_categorical(tag_row, num_classes=5) for tag_row in y]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

In [None]:
# Embedding -> bidirectional LSTM -> per-timestep dense layer -> CRF.
inputs = Input(shape=(MAX_LEN,))

# input_length follows MAX_LEN so it cannot drift from the Input shape above.
# mask_zero=True marks word id 0 (the padding id) as masked.
embedded = Embedding(input_dim=n_words,
                     output_dim=EMBEDDING,
                     input_length=MAX_LEN,
                     mask_zero=True)(inputs)

recurrent = Bidirectional(LSTM(units=50,
                               return_sequences=True,
                               recurrent_dropout=0.2))(embedded)

hidden = TimeDistributed(Dense(50, activation="relu"))(recurrent)

# 5 output classes: the four casing tags plus the padding tag.
crf = CRF(5)
out = crf(hidden)

model = Model(inputs, out)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

In [None]:
# Train; the last 10% of the training split is held out for validation.
history = model.fit(
    X_train,
    np.array(y_train),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Training curves. The second line in each figure comes from the
# validation_split hold-out used during fit(), not from the test set, so it
# is labelled "Validation".
for metric, title, y_label, legend_options in [
    ('crf_viterbi_accuracy', 'Model accuracy', 'Accuracy', {}),
    ('loss', 'Model loss', 'Loss', {'loc': 'upper left'}),
]:
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], **legend_options)
    plt.show()

In [None]:
# Highest-scoring class id per token, for predictions and ground truth alike.
pred_ids = np.argmax(model.predict(X_test), axis=-1)
true_ids = np.argmax(y_test, -1)

# Convert ids back to tag letters. Padded positions are not filtered out, so
# they appear in both lists as "P".
y_true = [[tag2text[tag_id] for tag_id in row] for row in true_ids]
pred = [[tag2text[tag_id] for tag_id in row] for row in pred_ids]

In [None]:
# Needed only for flat_classification_report in the evaluation cell below.
# NOTE(review): version is not pinned.
!pip install sklearn-crfsuite

In [None]:
from sklearn_crfsuite.metrics import flat_classification_report

# Per-tag precision / recall / F1 over all token positions of the test set.
report = flat_classification_report(y_true=y_true, y_pred=pred)
print(report)

In [None]:
def decoder(labels,tokens):
    """Re-apply predicted casing to tokens and join them into a string.

    Args:
        labels: indexable whose first element (``labels[0]``) holds one class
            id per token position; ids are translated through ``tag2text``.
        tokens: list of token strings, in order.

    Returns:
        The re-cased text. Tag 'f' capitalizes the first character, 'c'
        uppercases the whole token, every other tag leaves the token as it
        is. Tokens that have no label (more tokens than labels) are also left
        as they are. Each token is followed by a space, except that '(' gets
        no trailing space and a punctuation token is glued to the token
        before it.
    """
    predicted = labels[0]

    decoded = []
    for position, token in enumerate(tokens):
        # Tokens beyond the labelled window keep their casing instead of
        # raising IndexError.
        tag = tag2text[predicted[position]] if position < len(predicted) else 's'
        if tag == 'f':
            decoded.append(token[:1].upper() + token[1:])
        elif tag == 'c':
            decoded.append(token.upper())
        else:
            decoded.append(token)

    # This is a substring test against the string below: single punctuation
    # characters always match, multi-character tokens only when they occur
    # contiguously in it.
    punctuation = '!"#$%&\'*)+-./:,;<=>?@[\\]^_`{|}~'

    text = []
    for token in decoded:
        if token=='(':
            text.append(token)
        elif token not in punctuation:
            text.append(token + ' ')
        else:
            # Drop the space after the previous piece so the punctuation
            # attaches to it; nothing to trim when this is the first piece.
            if text and text[-1].endswith(' '):
                text[-1] = text[-1][:-1]
            text.append(token + ' ')

    return ''.join(text)

def encode(text, max_len=80):
    """Turn raw text into a fixed-length list of word ids for the model.

    NOTE: this redefines ``encode``; an earlier cell defines a tag -> id
    function with the same name, which is no longer reachable after this
    cell has run.

    Args:
        text: input string. Vocabulary keys are lowercased words, so callers
            should lowercase the text first.
        max_len: length of the returned id list. Defaults to 80, the
            sequence length used when the model input is reshaped.

    Returns:
        ``(encoded, tokens)`` where ``encoded`` has exactly ``max_len`` ids
        (1 for words missing from ``word2idx``, 0 for padding) and ``tokens``
        are the matching token strings. Input longer than ``max_len`` tokens
        is truncated in both lists, so ``tokens`` never outruns ``encoded``.
    """
    tokens = WordPunctTokenizer().tokenize(text)[:max_len]

    # 1 is the id reserved for unknown words.
    encoded = [word2idx.get(token, 1) for token in tokens]

    # Right-pad with the padding id 0 up to max_len.
    encoded += [0] * (max_len - len(encoded))

    return encoded,tokens

In [None]:
# Demo 1: badly cased input, lowercased before it reaches the model.
# (For interactive use, swap the literal for
#  input("Enter the Text for case conversion: ").lower())
text = 'uSIng mATPLotlib and SeaBORN LIBraries is one OF THE best IDea to do Eda.'.lower()

encoded, tokens = encode(text)

# The model expects a batch, so wrap the single sequence as shape (1, 80).
batch = np.array(encoded).reshape(1,80)
labels = np.argmax(model.predict(batch), axis=-1)

text = decoder(labels, tokens)
print()
print("The converted text is: ",text)

In [None]:
# Demo 2: input that is already all lowercase.
# (For interactive use, swap the literal for
#  input("Enter the Text for case conversion: ").lower())
text = 'people in india started using english as thier primary language next to hindi.'

encoded, tokens = encode(text)

# Single sequence -> batch of one with shape (1, 80).
model_input = np.array(encoded).reshape(1,80)
scores = model.predict(model_input)
labels = np.argmax(scores, axis=-1)

text = decoder(labels, tokens)
print()
print("The converted text is: ",text)