In [None]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import re
!pip install pandarallel
from pandarallel import pandarallel
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Initialization: set up pandarallel once, before any pandas `parallel_apply` call is made.
pandarallel.initialize()

# *Helper functions*

In [None]:
def data_analysis(data):
  """Print a short textual summary of a labelled data set.

  Writes the first rows, the number of rows per label, the count of non-null
  sentences and the mean number of whitespace-separated words per sentence.

  Args:
    data: DataFrame with a text column ``data`` and a ``label`` column.

  Returns:
    None; all output goes to stdout.
  """
  print("\n\ndata head\n\n")
  print(data.head())

  per_label = data['label'].value_counts()
  print("\n\nlabel count \n\n")
  print(per_label)

  sentences = data["data"]
  print("Number of sentence in data set : ", sentences.count())

  # Word count per row = number of whitespace-separated pieces.
  words_per_sentence = sentences.str.split().str.len()
  print("Average words per sentence in data set:", words_per_sentence.mean())




In [None]:
def data_preprocess(data_set):
  """Add a normalised ``clean_text`` column derived from the ``data`` column.

  Steps, in order: remove the ``URL`` and ``@user`` placeholders, remove
  single-character tokens, lower-case, drop every non-ASCII character, remove
  digit runs that follow a space, and strip punctuation.

  NOTE(review): the ASCII round-trip is meant to remove emojis, but it removes
  *all* non-ASCII characters, so native-script text is erased too and only
  romanised text survives in ``clean_text``. Confirm this is intended.

  Args:
    data_set: DataFrame with a text column ``data``. Missing values are treated
      as empty strings instead of raising.

  Returns:
    The same DataFrame object (modified in place) with ``clean_text`` added.
  """
  punctuation = '!!"$%&()*+-/:;<=>?[\\]^_{|}~.#'
  # Built once; maps every punctuation character to "delete".
  strip_punctuation = str.maketrans('', '', punctuation)

  text = data_set['data'].fillna("").astype(str)

  # Dataset placeholders for links and user mentions.
  text = text.str.replace("URL", "", regex=False)
  text = text.str.replace("@user", "", regex=False)

  # Remove single characters standing alone between spaces / string edges.
  text = text.apply(lambda x: re.sub(r'(?:^| )\w(?:$| )', ' ', x).strip())

  text = text.str.lower()

  # Drops every character outside ASCII (see NOTE in the docstring).
  text = text.apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))

  # Raw string: "\d" in a normal string literal is an invalid escape sequence.
  text = text.apply(lambda x: re.sub(r" \d+", " ", x))

  text = text.str.translate(strip_punctuation)

  data_set['clean_text'] = text
  return data_set


In [None]:
def TFIDF_char_logistic(train,dev,test):
  """Char TF-IDF + one-vs-rest logistic regression: fit on train+dev, label test.

  The vectorizer (character 1- to 3-grams) and the classifier are both fitted
  on the concatenation of ``train`` and ``dev``. A classification report on
  that same fitting data is printed.

  Args:
    train, dev: DataFrames with ``clean_text`` and ``label`` columns.
    test: DataFrame with a ``clean_text`` column.

  Returns:
    Predicted labels for the rows of ``test``.
  """
  train_dev_text = list(train['clean_text'].values) + list(dev['clean_text'].values)
  y_train_dev = list(train['label'].values) + list(dev['label'].values)

  vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')
  x_train_dev = vectorizer.fit_transform(train_dev_text)
  x_test = vectorizer.transform(test['clean_text'])

  ovr = OneVsRestClassifier(LogisticRegression(max_iter=5000))
  ovr.fit(x_train_dev, y_train_dev)

  # The model is logistic regression, so the banner says so.
  print("Logistic Regression Result char on Train")
  print(classification_report(y_train_dev, ovr.predict(x_train_dev)))

  return ovr.predict(x_test)


# The per-language prediction cells call this helper as `TFIDF_char_logistic_test`
# (the `_test` suffix used by the other predict-on-test helpers), so expose that name.
TFIDF_char_logistic_test = TFIDF_char_logistic


In [None]:
def TFIDF_char_svm(train,dev,test):
  """Char TF-IDF + linear-kernel SVM: fit on train+dev, label test.

  The vectorizer (character 1- to 3-grams) and the SVM are both fitted on the
  concatenation of ``train`` and ``dev``. A classification report on that same
  fitting data is printed.

  Args:
    train, dev: DataFrames with ``clean_text`` and ``label`` columns.
    test: DataFrame with a ``clean_text`` column.

  Returns:
    Predicted labels for the rows of ``test``.
  """
  train_dev_text = list(train['clean_text'].values) + list(dev['clean_text'].values)
  y_train_dev = list(train['label'].values) + list(dev['label'].values)

  vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')
  x_train_dev = vectorizer.fit_transform(train_dev_text)
  x_test = vectorizer.transform(test['clean_text'])

  svm = SVC(kernel='linear', max_iter=5000)
  svm.fit(x_train_dev, y_train_dev)

  print("svm Result char on Train")
  print(classification_report(y_train_dev, svm.predict(x_train_dev)))

  return svm.predict(x_test)


# The per-language prediction cells call this helper as `TFIDF_char_svm_test`
# (the `_test` suffix used by the other predict-on-test helpers), so expose that name.
TFIDF_char_svm_test = TFIDF_char_svm




In [None]:
def TFIDF_char_randamforest_test(train,dev,test):
  """Char TF-IDF + random forest: fit on train+dev, label test.

  Character 1- to 3-gram TF-IDF features are fitted on train and dev together;
  a 200-tree forest (fixed ``random_state``) is trained on them and a
  classification report on that same fitting data is printed.

  Args:
    train, dev: DataFrames with ``clean_text`` and ``label`` columns.
    test: DataFrame with a ``clean_text`` column.

  Returns:
    Predicted labels for the rows of ``test``.
  """
  tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
  fit_text = [*train['clean_text'].values, *dev['clean_text'].values]

  fit_features = tfidf.fit_transform(fit_text)
  test_features = tfidf.transform(test['clean_text'])

  fit_labels = [*train['label'].values, *dev['label'].values]

  forest = RandomForestClassifier(n_estimators=200, random_state=0)
  forest.fit(fit_features, fit_labels)

  print("RandomForest Result Char on Train")
  fit_predictions = forest.predict(fit_features)
  print(classification_report(fit_labels, fit_predictions))

  return forest.predict(test_features)




In [None]:
!pip install bert-for-tf2
!pip install tensorflow-text

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tensorflow_text as text
from bert import bert_tokenization
import numpy as np
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def get_model(model_url, max_seq_length):
  """Wrap a TF-Hub encoder in a Keras model that returns its pooled output.

  Args:
    model_url: hub handle passed to ``hub.KerasLayer``.
    max_seq_length: length of each of the three int32 input sequences.

  Returns:
    ``(model, hub_layer)``: a ``tf.keras.Model`` mapping the three inputs to the
    layer's ``pooled_output``, and the (trainable) hub layer itself.

  Raises:
    AssertionError: if the layer output lacks one of the expected keys.
  """
  def _int_sequence_input():
    return tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32)

  inputs = {
    "input_word_ids": _int_sequence_input(),
    "input_mask": _int_sequence_input(),
    "input_type_ids": _int_sequence_input(),
  }

  muril_layer = hub.KerasLayer(model_url, trainable=True)
  outputs = muril_layer(inputs)

  # Sanity-check the output dictionary before wiring the model.
  for expected_key in ('sequence_output', 'pooled_output', 'encoder_outputs', 'default'):
    assert expected_key in outputs

  pooled_model = tf.keras.Model(inputs=inputs, outputs=outputs["pooled_output"])
  return pooled_model, muril_layer

In [None]:
# Every input sequence is padded or truncated to this many token ids.
max_seq_length = 100
muril_model, muril_layer = get_model(
    max_seq_length=max_seq_length,
    model_url="https://tfhub.dev/google/MuRIL/1",
)

In [None]:
# Build the tokenizer from the vocabulary file and lower-casing flag stored on the hub layer.
vocab_file = muril_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = muril_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file=vocab_file,
                                            do_lower_case=do_lower_case)

In [None]:
def create_input(input_strings, tokenizer, max_seq_length):
  """Turn raw strings into fixed-length BERT-style input arrays.

  Each string is tokenized, wrapped in ``[CLS]`` ... ``[SEP]``, converted to
  ids and right-padded with 0 up to ``max_seq_length``. Over-long inputs are
  truncated *before* the special tokens are added, so the closing ``[SEP]`` is
  kept instead of being cut off.

  Args:
    input_strings: iterable of str.
    tokenizer: object exposing ``tokenize(str)`` and
      ``convert_tokens_to_ids(list)`` (both returning lists).
    max_seq_length: length of every output row.

  Returns:
    ``(input_ids, input_mask, input_type_ids)``: three numpy arrays of shape
    ``(number of strings, max_seq_length)``. The mask is 1 on real tokens and 0
    on padding; the type ids are all 0. Empty input yields arrays with 0 rows.
  """
  # Room left for real tokens once [CLS] and [SEP] are accounted for.
  max_tokens = max(max_seq_length - 2, 0)

  ids_rows, mask_rows, type_rows = [], [], []
  for input_string in input_strings:
    tokens = ["[CLS]"] + tokenizer.tokenize(input_string)[:max_tokens] + ["[SEP]"]
    # The extra slice only matters when max_seq_length < 2.
    input_ids = tokenizer.convert_tokens_to_ids(tokens)[:max_seq_length]

    n_real = len(input_ids)
    n_pad = max_seq_length - n_real

    ids_rows.append(input_ids + [0] * n_pad)
    mask_rows.append([1] * n_real + [0] * n_pad)
    type_rows.append([0] * max_seq_length)

  # Explicit reshape keeps the result 2-D even when there are no rows.
  shape = (len(ids_rows), max_seq_length)
  return (np.array(ids_rows).reshape(shape),
          np.array(mask_rows).reshape(shape),
          np.array(type_rows).reshape(shape))

In [None]:
def encode(input_text):
  """Run the module-level MuRIL model on a batch of strings.

  Uses the notebook-level ``tokenizer``, ``max_seq_length`` and ``muril_model``.

  Args:
    input_text: iterable of str to embed.

  Returns:
    Whatever ``muril_model`` returns for the batch.
  """
  word_ids, mask, type_ids = create_input(input_text, tokenizer, max_seq_length)
  model_inputs = {
    "input_word_ids": word_ids,
    "input_mask": mask,
    "input_type_ids": type_ids,
  }
  return muril_model(model_inputs)

In [None]:
def MURIL_logist_regression(train_embeddings,y_train, dev_embeddings,y_dev):
  """Fit one-vs-rest logistic regression on embeddings and print two reports.

  Args:
    train_embeddings, y_train: features and labels used for fitting.
    dev_embeddings, y_dev: held-out features and labels used for the second
      report.

  Returns:
    None; the classification reports are printed.
  """
  ovr = OneVsRestClassifier(LogisticRegression(max_iter=5000))
  ovr.fit(train_embeddings, y_train)

  # The model is logistic regression, so the banners say so.
  print("Logistic Regression Result MuRIL on Train")
  print(classification_report(y_train, ovr.predict(train_embeddings)))

  print("Logistic Regression Result MuRIL on Test")
  print(classification_report(y_dev, ovr.predict(dev_embeddings)))


In [None]:
def MURIL_logist_regression_test(train_dev_embeddings,y_train_dev, test_embeddings):
  """Fit one-vs-rest logistic regression on embeddings and label the test set.

  Args:
    train_dev_embeddings: features used for fitting.
    y_train_dev: labels aligned with ``train_dev_embeddings``.
    test_embeddings: features to predict.

  Returns:
    Predicted labels for ``test_embeddings``.
  """
  base_estimator = LogisticRegression(max_iter=5000)
  classifier = OneVsRestClassifier(base_estimator)
  classifier.fit(train_dev_embeddings, y_train_dev)

  test_predictions = classifier.predict(test_embeddings)
  return test_predictions


In [None]:
def MURIL_SVM(train_embeddings,y_train, dev_embeddings,y_dev):
  """Fit a linear-kernel SVM on embeddings and print two classification reports.

  Args:
    train_embeddings, y_train: features and labels used for fitting.
    dev_embeddings, y_dev: held-out features and labels used for the second
      report.

  Returns:
    None; the classification reports are printed.
  """
  svm = SVC(kernel='linear', max_iter=5000)
  svm.fit(train_embeddings, y_train)

  # classification_report takes the true labels and the predictions as two
  # separate arguments.
  print("SVM Result MuRIL on Train")
  print(classification_report(y_train, svm.predict(train_embeddings)))

  print("SVM Result MuRIL on Test")
  print(classification_report(y_dev, svm.predict(dev_embeddings)))


In [None]:
def MURIL_SVM_test(train_dev_embeddings,y_train_dev, test_embeddings):
  """Fit a linear-kernel SVM on embeddings and label the test set.

  Args:
    train_dev_embeddings: features used for fitting.
    y_train_dev: labels aligned with ``train_dev_embeddings``.
    test_embeddings: features to predict.

  Returns:
    Predicted labels for ``test_embeddings``.
  """
  svm = SVC(kernel='linear', max_iter=5000)
  svm.fit(train_dev_embeddings, y_train_dev)
  return svm.predict(test_embeddings)


def MURIL_randamforest_test(train_dev_embeddings,y_train_dev, test_embeddings):
  """Fit a random forest on embeddings and label the test set.

  The per-language prediction cells call this name but the notebook defined no
  such function. It mirrors ``MURIL_SVM_test`` and uses the same forest
  settings as ``TFIDF_char_randamforest_test``.

  Args:
    train_dev_embeddings: features used for fitting.
    y_train_dev: labels aligned with ``train_dev_embeddings``.
    test_embeddings: features to predict.

  Returns:
    Predicted labels for ``test_embeddings``.
  """
  forest = RandomForestClassifier(n_estimators=200, random_state=0)
  forest.fit(train_dev_embeddings, y_train_dev)
  return forest.predict(test_embeddings)


In [None]:
def MuRIL_embeddings(train):
  """Embed every row of ``train['data']`` with ``encode``, batch by batch.

  The rows are split into at most 50 consecutive, non-empty batches that
  together cover the whole column, so no row is skipped and ``encode`` is never
  called on an empty batch, whatever the number of rows.

  Args:
    train: DataFrame with a text column ``data`` (the raw text, not
      ``clean_text``).

  Returns:
    List of the items yielded by ``encode`` for each batch, in row order
    (presumably one embedding per row; depends on ``encode``).
  """
  texts = train['data']
  total = len(texts)

  # ceil(total / 50), but never 0 so that range() below gets a valid step.
  batch_size = max(1, -(-total // 50))

  embeddings = []
  for start in range(0, total, batch_size):
    # .iloc keeps the slicing positional regardless of the frame's index.
    embeddings.extend(encode(texts.iloc[start:start + batch_size]))

  print(len(embeddings))
  return embeddings


# Kannada

In [None]:
# Kannada splits: tab-separated files read with explicit column names.
kannada_train = pd.read_csv("kannada_offensive_train.csv", sep="\t", names=["data", "label"])
kannada_dev = pd.read_csv("kannada_offensive_dev.csv", sep="\t", names=["data", "label"])
kannada_test = pd.read_csv("kannada_offensive_test.csv", sep="\t", names=["data"])


In [None]:
# Add the `clean_text` column that the TF-IDF models read.
kannada_train = data_preprocess(kannada_train)
kannada_dev = data_preprocess(kannada_dev)
kannada_test = data_preprocess(kannada_test)


In [None]:
# MuRIL embeddings per split, computed from the raw `data` column.
kannada_train_embeddings = MuRIL_embeddings(kannada_train)
kannada_dev_embeddings = MuRIL_embeddings(kannada_dev)
kannada_test_embeddings = MuRIL_embeddings(kannada_test)


In [None]:
# Stack train and dev embeddings row-wise (train first, then dev).
kannada_train_dev_embeddings = np.concatenate(
    [kannada_train_embeddings, kannada_dev_embeddings], axis=0)
len(kannada_train_dev_embeddings)

In [None]:
# Train followed by dev, in the same order as the stacked embeddings.
frames = [kannada_train, kannada_dev]
kannada_train_dev = pd.concat(frames)
len(kannada_train_dev)

In [None]:
# Char TF-IDF + logistic regression predictions; overwrites any earlier `label`.
kannada_test['label'] = TFIDF_char_logistic_test(kannada_train, kannada_dev, kannada_test)


In [None]:
# Char TF-IDF + SVM predictions; overwrites any earlier `label`.
kannada_test['label'] = TFIDF_char_svm_test(kannada_train, kannada_dev, kannada_test)


In [None]:
# Char TF-IDF + random forest predictions; overwrites any earlier `label`.
kannada_test['label'] = TFIDF_char_randamforest_test(kannada_train, kannada_dev, kannada_test)


In [None]:
# MuRIL embeddings + logistic regression predictions; overwrites any earlier `label`.
kannada_test['label'] = MURIL_logist_regression_test(
    kannada_train_dev_embeddings, kannada_train_dev['label'], kannada_test_embeddings)


In [None]:
# MuRIL embeddings + SVM predictions; overwrites any earlier `label`.
kannada_test['label'] = MURIL_SVM_test(
    kannada_train_dev_embeddings, kannada_train_dev['label'], kannada_test_embeddings)


In [None]:
# MuRIL embeddings + random forest predictions; overwrites any earlier `label`.
kannada_test['label'] = MURIL_randamforest_test(
    kannada_train_dev_embeddings, kannada_train_dev['label'], kannada_test_embeddings)


In [None]:
# Drop the helper column before export. `errors='ignore'` keeps this cell
# re-runnable: the frame is reassigned, so a second run no longer has the column.
kannada_test = kannada_test.drop(columns=['clean_text'], errors='ignore')
# Number the rows from 1.
kannada_test.index = np.arange(1, len(kannada_test) + 1)
kannada_test.head()


In [None]:
# Tab-separated export without a header row; the index is written as the first column.
kannada_test.to_csv('IRNLP_DAIICT_kannada.tsv', header=False, sep='\t')

# Malayalam

In [None]:
# Malayalam splits: tab-separated; train/dev are read with a third `noise` column name.
mal_train = pd.read_csv("mal_full_offensive_train.csv", sep="\t", names=["data", "label", "noise"])
mal_dev = pd.read_csv("mal_full_offensive_dev.csv", sep="\t", names=["data", "label", "noise"])
mal_test = pd.read_csv("mal_full_offensive_test.csv", sep="\t", names=["data"])



In [None]:
# MuRIL embeddings per split, computed from the raw `data` column.
mal_train_embeddings = MuRIL_embeddings(mal_train)
mal_dev_embeddings = MuRIL_embeddings(mal_dev)
mal_test_embeddings = MuRIL_embeddings(mal_test)


In [None]:
# Stack train and dev embeddings row-wise (train first, then dev).
mal_train_dev_embeddings = np.concatenate(
    [mal_train_embeddings, mal_dev_embeddings], axis=0)
len(mal_train_dev_embeddings)

In [None]:
# Train followed by dev, in the same order as the stacked embeddings.
frames = [mal_train, mal_dev]
mal_train_dev = pd.concat(frames)
len(mal_train_dev)

In [None]:
# Add the `clean_text` column that the TF-IDF models read.
mal_train = data_preprocess(mal_train)
mal_dev = data_preprocess(mal_dev)
mal_test = data_preprocess(mal_test)


In [None]:
# Char TF-IDF + logistic regression predictions; overwrites any earlier `label`.
mal_test['label'] = TFIDF_char_logistic_test(mal_train, mal_dev, mal_test)


In [None]:
# Char TF-IDF + SVM predictions; overwrites any earlier `label`.
mal_test['label'] = TFIDF_char_svm_test(mal_train, mal_dev, mal_test)


In [None]:
# Char TF-IDF + random forest predictions; overwrites any earlier `label`.
mal_test['label'] = TFIDF_char_randamforest_test(mal_train, mal_dev, mal_test)


In [None]:

# MuRIL embeddings + logistic regression predictions; overwrites any earlier `label`.
mal_test['label'] = MURIL_logist_regression_test(
    mal_train_dev_embeddings, mal_train_dev['label'], mal_test_embeddings)


In [None]:
# MuRIL embeddings + SVM predictions; overwrites any earlier `label`.
mal_test['label'] = MURIL_SVM_test(
    mal_train_dev_embeddings, mal_train_dev['label'], mal_test_embeddings)


In [None]:
# MuRIL embeddings + random forest predictions; overwrites any earlier `label`.
mal_test['label'] = MURIL_randamforest_test(
    mal_train_dev_embeddings, mal_train_dev['label'], mal_test_embeddings)


In [None]:
# Drop the helper column before export. `errors='ignore'` keeps this cell
# re-runnable: the frame is reassigned, so a second run no longer has the column.
mal_test = mal_test.drop(columns=['clean_text'], errors='ignore')
# Number the rows from 1.
mal_test.index = np.arange(1, len(mal_test) + 1)
mal_test.head()


In [None]:
# Tab-separated export without a header row; the index is written as the first column.
mal_test.to_csv('IRNLP_DAIICT_malayalam.tsv', header=False, sep='\t')

# Tamil

In [None]:
# Tamil splits: tab-separated; train/dev are read with a third `noise` column name.
tamil_train = pd.read_csv("tamil_offensive_full_train.csv", names=["data", "label", "noise"], sep="\t")
tamil_dev = pd.read_csv("tamil_offensive_full_dev.csv", names=["data", "label", "noise"], sep="\t")
tamil_test = pd.read_csv("tamil_offensive_full_test.csv", names=["data"], sep="\t")

# The TF-IDF models and the final drop(columns=['clean_text']) both need the
# `clean_text` column, and this section has no separate preprocessing cell, so
# create it here. The raw `data` column is left untouched for the MuRIL embeddings.
tamil_train = data_preprocess(tamil_train)
tamil_dev = data_preprocess(tamil_dev)
tamil_test = data_preprocess(tamil_test)


In [None]:
# MuRIL embeddings per split, computed from the raw `data` column.
tamil_train_embeddings = MuRIL_embeddings(tamil_train)
tamil_dev_embeddings = MuRIL_embeddings(tamil_dev)
tamil_test_embeddings = MuRIL_embeddings(tamil_test)


In [None]:
# Stack train and dev embeddings row-wise (train first, then dev).
tamil_train_dev_embeddings = np.concatenate(
    [tamil_train_embeddings, tamil_dev_embeddings], axis=0)
len(tamil_train_dev_embeddings)

In [None]:
# Train followed by dev, in the same order as the stacked embeddings.
frames = [tamil_train, tamil_dev]
tamil_train_dev = pd.concat(frames)
len(tamil_train_dev)

In [None]:
# Char TF-IDF + logistic regression predictions; overwrites any earlier `label`.
tamil_test['label'] = TFIDF_char_logistic_test(tamil_train, tamil_dev, tamil_test)


In [None]:
# Char TF-IDF + SVM predictions; overwrites any earlier `label`.
tamil_test['label'] = TFIDF_char_svm_test(tamil_train, tamil_dev, tamil_test)


In [None]:
# Char TF-IDF + random forest predictions; overwrites any earlier `label`.
tamil_test['label'] = TFIDF_char_randamforest_test(tamil_train, tamil_dev, tamil_test)


In [None]:
# MuRIL embeddings + SVM predictions; overwrites any earlier `label`.
tamil_test['label'] = MURIL_SVM_test(
    tamil_train_dev_embeddings, tamil_train_dev['label'], tamil_test_embeddings)


In [None]:
# MuRIL embeddings + random forest predictions; overwrites any earlier `label`.
tamil_test['label'] = MURIL_randamforest_test(
    tamil_train_dev_embeddings, tamil_train_dev['label'], tamil_test_embeddings)


In [None]:
# MuRIL embeddings + logistic regression predictions; overwrites any earlier `label`.
tamil_test['label'] = MURIL_logist_regression_test(
    tamil_train_dev_embeddings, tamil_train_dev['label'], tamil_test_embeddings)


In [None]:
# Drop the helper column before export. `errors='ignore'` covers the case where
# `clean_text` is absent (it was never created, or this cell already ran once).
tamil_test = tamil_test.drop(columns=['clean_text'], errors='ignore')
# Number the rows from 1.
tamil_test.index = np.arange(1, len(tamil_test) + 1)
tamil_test.head()


In [None]:
# Tab-separated export without a header row; the index is written as the first column.
tamil_test.to_csv('IRNLP_DAIICT_tamil.tsv', header=False, sep='\t')