In [1]:
!pip install googletrans==3.1.0a0 transformers sentencepiece gdown -q

[0m

In [2]:
!gdown --id 1VDwcsXYmh1uq8X7-8va25CP1JqL4xdv0

Downloading...
From: https://drive.google.com/uc?id=1VDwcsXYmh1uq8X7-8va25CP1JqL4xdv0
To: /kaggle/working/mahasent_train_new_part1.csv
100%|██████████████████████████████████████| 3.23M/3.23M [00:00<00:00, 32.2MB/s]


In [3]:
import pandas as pd
import numpy as np
import re
from googletrans import Translator
translator = Translator()

In [4]:
from transformers import MBartForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import AlbertTokenizer, AutoTokenizer



In [5]:
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/MultiIndicParaphraseGeneration")
model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/MultiIndicParaphraseGeneration")

# Some initial mapping
bos_id = tokenizer._convert_token_to_id_with_added_voc("<s>")
eos_id = tokenizer._convert_token_to_id_with_added_voc("</s>")
pad_id = tokenizer._convert_token_to_id_with_added_voc("<pad>")

In [6]:
inp = tokenizer("दिल्ली यूनिवर्सिटी देश की प्रसिद्ध यूनिवर्सिटी में से एक है. </s> <2hi>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids 

model_output=model.generate(inp, use_cache=True,no_repeat_ngram_size=3,encoder_no_repeat_ngram_size=3, num_beams=4, max_length=40, min_length=1, early_stopping=True, pad_token_id=pad_id, bos_token_id=bos_id, eos_token_id=eos_id, decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc("<2hi>"))

# Decode to get output strings
decoded_output=tokenizer.decode(model_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(decoded_output) # दिल्ली विश्वविद्यालय देश की प्रमुख विश्वविद्यालयों में शामिल है।

दिल्ली विश्वविद्यालय देश की प्रमुख विश्वविद्यालयों में शामिल है।


In [7]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [8]:
df = pd.read_csv('mahasent_train_new_part1.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,label
0,0,ज्येष्ठ पत्रकार अनंत दीक्षित यांच्या निधनाचे...,-1
1,1,सर्वोच्च न्यायालयाचे निर्देश डावलून पुणे पोल...,-1
2,2,उद्धव ठाकरेंनी भाजपासोबत युती करून शिवसैनिका...,-1
3,3,आपला समाज खूप मोठा आहे. त्यात अनेक घटक अंतर्...,1
4,4,बलात्काराचा बदला बलात्काराने घेतला पाहिजे हे म...,-1


In [9]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [10]:
df.head()

Unnamed: 0,tweet,label
0,ज्येष्ठ पत्रकार अनंत दीक्षित यांच्या निधनाचे...,-1
1,सर्वोच्च न्यायालयाचे निर्देश डावलून पुणे पोल...,-1
2,उद्धव ठाकरेंनी भाजपासोबत युती करून शिवसैनिका...,-1
3,आपला समाज खूप मोठा आहे. त्यात अनेक घटक अंतर्...,1
4,बलात्काराचा बदला बलात्काराने घेतला पाहिजे हे म...,-1


In [11]:
# for paraphrasing, create a seperate 'paraphrase' column
df['paraphrased'] = ""

In [12]:
df.head()

Unnamed: 0,tweet,label,paraphrased
0,ज्येष्ठ पत्रकार अनंत दीक्षित यांच्या निधनाचे...,-1,
1,सर्वोच्च न्यायालयाचे निर्देश डावलून पुणे पोल...,-1,
2,उद्धव ठाकरेंनी भाजपासोबत युती करून शिवसैनिका...,-1,
3,आपला समाज खूप मोठा आहे. त्यात अनेक घटक अंतर्...,1,
4,बलात्काराचा बदला बलात्काराने घेतला पाहिजे हे म...,-1,


In [13]:
# data preprocessing:
# removing non marathi text 
import re
def remove_non_marathi_for_tweets(text):
    i= re.sub(r"http\S+", "", text)
    i= re.sub(r"pic\S+", "", i)
    i= re.sub(r"#", "", i)
    i= re.sub(r"lokmat\S+", "", i)
    i= re.sub(r"@\S+", "", i)
    i= re.sub(r"fb\S+", "", i)
    i= re.sub(r"twitter\S+", "", i)
    i= re.sub(r"bit\S+", "", i)    
    # i= re.sub(r"['\"!-./,;\[\]?]", "", i)  # during augmenting, keep the necessary punctuations [.?!'"] in the sentence.  
    i= re.sub(r"[\[\]]", "", i)    
    i= re.sub(r'[a-zA-Z0-9]', "", i)    
    i=re.split(r'[\s\n]+', i.strip())
    return(' '.join(i))

In [14]:
df['tweet'][205]

'  पेट्रोल आणि डिझेलचे दर हे सरकारी नियंत्रणातून काँग्रेसनेच मुक्त केले. 2018 मध्ये जेव्हा अशाच प्रकारची दरवाढ झाली, तेव्हा 4 ऑक्टोबर 2018 रोजी 5 रूपयांचा दिलासा राज्यातील नागरिकांना देण्याचा निर्णय आपल्या सरकारने घेतला होता. pic.twitter.com/B1GuxY3NP7\n'

In [15]:
remove_non_marathi_for_tweets(df['tweet'][205])

'पेट्रोल आणि डिझेलचे दर हे सरकारी नियंत्रणातून काँग्रेसनेच मुक्त केले. मध्ये जेव्हा अशाच प्रकारची दरवाढ झाली, तेव्हा ऑक्टोबर रोजी रूपयांचा दिलासा राज्यातील नागरिकांना देण्याचा निर्णय आपल्या सरकारने घेतला होता.'

In [16]:
for i in range(len(df)):
  df['tweet'][i] = remove_non_marathi_for_tweets(df['tweet'][i])
df = df[df['tweet']!='']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tweet'][i] = remove_non_marathi_for_tweets(df['tweet'][i])


In [17]:
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)
df = df.reset_index(drop=True)

In [18]:
df['tweet'][4]

'बलात्काराचा बदला बलात्काराने घेतला पाहिजे हे म्हणणारे सावरकर आम्हाला मान्य नाहीत. द्वेष आणि तिरस्कार हिंदू धर्मातही अभिप्रेत कधीच नव्हता. आम्ही सावरकरांच्या विचारांचा विरोध करताच राहू त्यात व्यक्तीद्वेष नाही तर वैचारिक विरोध आहे. हा देश गांधींच्या विचारांनीच चालेल!'

In [19]:
import re

# paragraph = "This is a sample paragraph. It has several sentences. Each sentence ends with a period. Except for this one?"

sentences = re.findall(r'[^.?!]+[.?!]', df['tweet'][4])

print(sentences)

['बलात्काराचा बदला बलात्काराने घेतला पाहिजे हे म्हणणारे सावरकर आम्हाला मान्य नाहीत.', ' द्वेष आणि तिरस्कार हिंदू धर्मातही अभिप्रेत कधीच नव्हता.', ' आम्ही सावरकरांच्या विचारांचा विरोध करताच राहू त्यात व्यक्तीद्वेष नाही तर वैचारिक विरोध आहे.', ' हा देश गांधींच्या विचारांनीच चालेल!']


In [20]:
final = ""
for i in sentences:
  inp = tokenizer(i+" </s> <2mr>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids 

  model_output=model.generate(inp, use_cache=True,no_repeat_ngram_size=3,encoder_no_repeat_ngram_size=3, num_beams=4, max_length=40, min_length=1, early_stopping=True, pad_token_id=pad_id, bos_token_id=bos_id, eos_token_id=eos_id, decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc("<2mr>"))

  # Decode to get output strings
  decoded_output=tokenizer.decode(model_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
  final = final+decoded_output+" "
print(final)
final = final.strip()
print(final)

बलात्काराची शिक्षा बलात्काराने घेतली पाहिजे, असे सावरकर म्हणाले. हिंदू धर्मात कधीही द्वेष आणि द्वेष नव्हता. आम्ही सावरकरांचे विचार विरोध करत राहू, त्यात व्यक्तीस्वातंत्र्य नाही, वैचारिक विरोधही आहे. हा देश गांधीजींच्या विचाराने चालला आहे. 
बलात्काराची शिक्षा बलात्काराने घेतली पाहिजे, असे सावरकर म्हणाले. हिंदू धर्मात कधीही द्वेष आणि द्वेष नव्हता. आम्ही सावरकरांचे विचार विरोध करत राहू, त्यात व्यक्तीस्वातंत्र्य नाही, वैचारिक विरोधही आहे. हा देश गांधीजींच्या विचाराने चालला आहे.


In [21]:
print(df['tweet'][4])
print(final)

बलात्काराचा बदला बलात्काराने घेतला पाहिजे हे म्हणणारे सावरकर आम्हाला मान्य नाहीत. द्वेष आणि तिरस्कार हिंदू धर्मातही अभिप्रेत कधीच नव्हता. आम्ही सावरकरांच्या विचारांचा विरोध करताच राहू त्यात व्यक्तीद्वेष नाही तर वैचारिक विरोध आहे. हा देश गांधींच्या विचारांनीच चालेल!
बलात्काराची शिक्षा बलात्काराने घेतली पाहिजे, असे सावरकर म्हणाले. हिंदू धर्मात कधीही द्वेष आणि द्वेष नव्हता. आम्ही सावरकरांचे विचार विरोध करत राहू, त्यात व्यक्तीस्वातंत्र्य नाही, वैचारिक विरोधही आहे. हा देश गांधीजींच्या विचाराने चालला आहे.


In [22]:
from tqdm import tqdm
for i in tqdm(range(len(df))):
  final = ""
  sentences = re.findall(r'[^.?!]+[.?!]', df['tweet'][i])
  for j in sentences:
    inp = tokenizer(j+" </s> <2mr>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids 
    model_output=model.generate(inp, use_cache=True,no_repeat_ngram_size=3,encoder_no_repeat_ngram_size=3, num_beams=4, max_length=40, min_length=1, early_stopping=True, pad_token_id=pad_id, bos_token_id=bos_id, eos_token_id=eos_id, decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc("<2mr>"))
    # Decode to get output strings
    decoded_output=tokenizer.decode(model_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    final = final+decoded_output+" "
  final = final.strip()
  df['paraphrased'][i] = final

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['paraphrased'][i] = final
100%|██████████| 6045/6045 [8:02:55<00:00,  4.79s/it]  


In [23]:
# once dataframe is ready, then create a new dataframe, containing the 'paraphrased' and 'label' columns only:
df_paraphrased = df[['paraphrased', 'label']]
df_paraphrased.head()

Unnamed: 0,paraphrased,label
0,अनंत दीक्षित यांचे निधन झाले आहे. चार दशके त्य...,-1
1,सर्वोच्च न्यायालयाच्या आदेशाचे उल्लंघन केल्याच...,-1
2,,-1
3,आपला समाज फार मोठा असतो. यात अनेक घटकांचा समाव...,1
4,"बलात्काराची शिक्षा बलात्काराने घेतली पाहिजे, अ...",-1


In [24]:
df_paraphrased['paraphrased'][0]

'अनंत दीक्षित यांचे निधन झाले आहे. चार दशके त्यांनी आपल्या लेखनातून पत्रकारितेमध्ये अमूल्य योगदान दिलं आहे. दीक्षित यांच्या मार्गदर्शनात पत्रकारांची एक पिढी झाली. अनंत दीक्षितांना भावपूर्ण आदरांजली! आम्ही त्यांच्या कुटुंबीयांच्या दुःखात सहभागी आहे.'

In [25]:
df['tweet'][0]

'ज्येष्ठ पत्रकार अनंत दीक्षित यांच्या निधनाचे वृत्त दु:खद आहे. चार दशकं त्यांनी आपल्या परखड लेखणीने पत्रकारितेत अमूल्य योगदान दिले. दीक्षित यांच्या मार्गदर्शनाखाली पत्रकारांची पिढी घडली. अनंत दीक्षित यांना भावपूर्ण श्रद्धांजली! आम्ही त्यांच्या परिवाराच्या दु:खात सहभागी आहोत.'

In [26]:
df_paraphrased.rename(columns = {'paraphrased':'tweet'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_paraphrased.rename(columns = {'paraphrased':'tweet'}, inplace = True)


In [27]:
df.drop(['paraphrased'], axis=1, inplace=True)
df.head()

Unnamed: 0,tweet,label
0,ज्येष्ठ पत्रकार अनंत दीक्षित यांच्या निधनाचे व...,-1
1,सर्वोच्च न्यायालयाचे निर्देश डावलून पुणे पोलिस...,-1
2,उद्धव ठाकरेंनी भाजपासोबत युती करून शिवसैनिकांच...,-1
3,आपला समाज खूप मोठा आहे. त्यात अनेक घटक अंतर्भू...,1
4,बलात्काराचा बदला बलात्काराने घेतला पाहिजे हे म...,-1


In [28]:
len(df_paraphrased)

6045

In [29]:
df_paraphrased.drop_duplicates()

Unnamed: 0,tweet,label
0,अनंत दीक्षित यांचे निधन झाले आहे. चार दशके त्य...,-1
1,सर्वोच्च न्यायालयाच्या आदेशाचे उल्लंघन केल्याच...,-1
2,,-1
3,आपला समाज फार मोठा असतो. यात अनेक घटकांचा समाव...,1
4,"बलात्काराची शिक्षा बलात्काराने घेतली पाहिजे, अ...",-1
...,...,...
6040,या पुस्तकाचे प्रकाशन मुंबईतील विधान भवनात होणा...,0
6041,संतश्रेष्ठ तुकाराम महाराज यांच्या पुण्यतिथीनिम...,1
6042,तसेच काजू व्यापाऱ्यांची गेल्या कालावधीतील कर प...,1
6043,कर्जमाफीची योजना लागू होऊन २ वर्षांचा कालावधी ...,-1


In [30]:
df_paraphrased['tweet'][10]

'ज्येष्ठ पत्रकार, सामाजिक कार्यकर्ते दिनू रणदीवे यांची पत्नी सविता यांचे निधन झाले असून त्यांच्या निधनामुळे सामाजिक, सांस्कृतिक चळवळीतील एक आदर्श व्यक्तीमत्व गमावले आहे. त्यांना आदरांजली!'

In [31]:
df['tweet'][10]

'संयुक्त महाराष्ट्राच्या लढाईतील निष्ठावान सैनिक, उपेक्षितांच्या हक्कांसाठी लढणाऱ्या सामाजिक कार्यकर्त्या तसंच ज्येष्ठ पत्रकार दिनु रणदिवे यांच्या पत्नी सविताताई रणदिवे यांच्या निधनानं सामाजिक, पुरोगामी चळवळीतील आदर्श व्यक्तिमत्व पडद्याआड गेलं आहे. त्यांना भावपूर्ण श्रद्धांजली!'

In [32]:
df_final = df.append(df_paraphrased, ignore_index = True)

  df_final = df.append(df_paraphrased, ignore_index = True)


In [33]:
df_final.to_csv('augmented-with-paraphrasing-mahasent.csv', index=False)
# use this dataset directly for training. do not preprocess this at all.

In [34]:
df_final

Unnamed: 0,tweet,label
0,ज्येष्ठ पत्रकार अनंत दीक्षित यांच्या निधनाचे व...,-1
1,सर्वोच्च न्यायालयाचे निर्देश डावलून पुणे पोलिस...,-1
2,उद्धव ठाकरेंनी भाजपासोबत युती करून शिवसैनिकांच...,-1
3,आपला समाज खूप मोठा आहे. त्यात अनेक घटक अंतर्भू...,1
4,बलात्काराचा बदला बलात्काराने घेतला पाहिजे हे म...,-1
...,...,...
12085,या पुस्तकाचे प्रकाशन मुंबईतील विधान भवनात होणा...,0
12086,संतश्रेष्ठ तुकाराम महाराज यांच्या पुण्यतिथीनिम...,1
12087,तसेच काजू व्यापाऱ्यांची गेल्या कालावधीतील कर प...,1
12088,कर्जमाफीची योजना लागू होऊन २ वर्षांचा कालावधी ...,-1


In [35]:
len(df_final)

12090