# Imports

In [None]:
!pip install transformers
!pip install tensorflow
!pip install torch
!pip install tweet-preprocessor
!pip install bs4
!pip install sentencepiece 
!pip install langdetect
!pip install translate-api
!pip install aspect-based-sentiment-analysis

In [None]:
import tensorflow as tf
import pandas as pd
import preprocessor as p
from bs4 import BeautifulSoup
import re
import time
from transformers import DistilBertTokenizer,TFDistilBertForSequenceClassification
from langdetect import detect
import aspect_based_sentiment_analysis as absa
import translators as ts

In [None]:
# Mount Google Drive at /content/drive; every path used below for the dataset,
# model checkpoints and output files lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# Test Dataset Preprocessing

In [None]:
# Start of the preprocessing timer; paired with t1 once translation has finished.
t0 = time.time()

In [None]:
# Load the evaluation spreadsheet; the next cell reads its 'Text_ID' and 'Text' columns.
df=pd.read_excel("/content/drive/MyDrive/H2_B2I_14/EvaluationDatasets/evaluation_data.xlsx")

In [None]:
# Column values as plain lists; both come from the same frame, so they are index-aligned.
# NOTE: `id` shadows the Python builtin of the same name for the rest of the notebook.
id=list(df['Text_ID'])
text=list(df['Text'])

In [None]:
#Basic Preprocessing
def remove_html(word):
  """Return the plain text of `word` after parsing it as HTML with the lxml parser."""
  return BeautifulSoup(word, 'lxml').get_text()

def remove_urls(word):
    """Remove every http(s) URL from `word` and return the remaining text.

    Only the URL itself (the run of non-whitespace characters starting at
    ``http://`` or ``https://``) is removed. The previous pattern matched
    greedily to the end of the line and also consumed the following line
    breaks, so any text written after a link was silently lost.
    """
    return re.sub(r'https?://\S+', '', word)

# Characters (punctuation plus the space) removed from the start of a cleaned tweet.
# Raw string so the backslash before the comma is kept literally without an
# invalid-escape warning; the resulting value is unchanged.
punc = r'''!()-[]{};:'"\, <>./?@#$%^&*_~'''

feature_1=[]
for i,txt in enumerate(text):
  # Non-string cells (e.g. NaN for an empty spreadsheet cell) would crash the
  # cleaners below, so normalise them to text first.
  if not isinstance(txt, str):
    txt='' if pd.isna(txt) else str(txt)

  # Only rows whose id mentions 'tweet' go through the tweet cleaner. The length
  # check is applied to the cleaned tweet itself (it used to be applied to the
  # literal string 'txt', which made it always true).
  # NOTE(review): a tweet that cleans down to exactly one character falls through
  # to the generic branch - presumably the intent of the original check; confirm.
  cleaned=p.clean(txt) if 'tweet' in id[i] else None
  if(cleaned is not None and len(cleaned)!=1):
    m=cleaned[len('QT '):] if cleaned.startswith('QT ') else cleaned
    # Drop all leading punctuation/space, then trim the ends. The old loop removed
    # at most one occurrence per character in a fixed order, and its strip()
    # result was discarded.
    feature_1.append(m.lstrip(punc).strip())
  else:
    temp=remove_urls(remove_html(txt))
    idx=temp.find('\n')
    if(idx==-1):
      # Single-line text: there is no leading line to drop. Slicing with -1 here
      # used to keep only the last character of short texts.
      feature_1.append(temp)
      continue
    subw=temp[:idx]
    # A short first line (under 50 characters or under 7 space-separated tokens)
    # is dropped and only the rest of the text is kept.
    if(len(subw)<50 or len(subw.split(" "))<7):
      feature_1.append(temp[idx:])
    else:
      feature_1.append(temp)

# Collapse line breaks into a single space (instead of nothing) so that the last
# word of one line is not fused with the first word of the next.
feature_2=[re.sub(r'\n+',' ',t).strip() for t in feature_1]


In [None]:
#Translation
# Longest text sent per request. Slicing is a no-op for shorter texts, so every
# request stays strictly below 5000 characters.
# NOTE(review): 5000 is assumed to be the service limit - confirm for the installed version.
MAX_TRANSLATE_CHARS=4999

feature_final=[]

for count,a in enumerate(feature_2):
  a=a[:MAX_TRANSLATE_CHARS]
  if(a!=''):
    try:
      feature_final.append(ts.google(str(a)))
    except Exception as err:
      # One failed request must not abort the whole run and throw away the
      # translations already collected: report it and keep the untranslated text.
      print(f"Translation failed for example {count}: {err!r}; keeping original text")
      feature_final.append(a)
    # Pause between requests to avoid hammering the translation endpoint.
    time.sleep(0.5)
  else:
    # Nothing to translate; keep the slot so indices stay aligned with `id`.
    feature_final.append('')
  print(f"Example number {count}")

In [None]:
# End of the preprocessing timer (t0 was taken before the dataset was loaded).
t1=time.time()
print(f"Preprocessing Time = {t1-t0} seconds")

# Task 1: Sentiment Classification

In [None]:
# Start of the Task 1 timer; taken before the classifier is loaded.
t2=time.time()

In [None]:
# The tokenizer comes from the stock 'distilbert-base-uncased' checkpoint, while the
# sequence-classification weights are loaded from the project's Drive folder.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model= TFDistilBertForSequenceClassification.from_pretrained("/content/drive/MyDrive/H2_B2I_14/DistilBERT")

In [None]:
# Classify each translated text; one integer label is stored per entry, in order.
predicted_labels=[]
for counter,sample in enumerate(feature_final):
  label=0  # used as-is for any entry that is not a plain str
  if(type(sample) is str):
    encoded = tokenizer.encode(sample,
                               truncation=True,
                               padding=True,
                               return_tensors="tf",max_length=512)
    logits = model.predict(encoded)[0]
    probabilities = tf.nn.softmax(logits, axis=1).numpy()[0]
    label = int(tf.argmax(probabilities))
  predicted_labels.append(label)
  print(f"Example {counter}")

In [None]:
# End of the Task 1 timer.
t3=time.time()
print(f"Evaluation Time for Task 1 is {t3-t2} seconds")

In [None]:
# Pair every Text_ID with its predicted flag (zip stops at the shorter of the two lists).
op1=pd.DataFrame(list(zip(id,predicted_labels)),columns=['Text_ID','Mobile_Tech_Flag_Predicted'])

In [None]:
# Write the Task 1 predictions to both output files. Output2.csv is written again
# once the Task 2 columns have been added; Output1.csv is re-read in Task 3.
op1.to_csv("/content/drive/MyDrive/H2_B2I_14/Outputs/Output1.csv",index=False)
op1.to_csv("/content/drive/MyDrive/H2_B2I_14/Outputs/Output2.csv",index=False)

In [None]:
# Persist the translated, preprocessed texts; Task 3 reloads the 'Text' column of
# this file. The source list is `feature_final` (the previously referenced
# `feature_label` is not defined anywhere in the notebook and raised a NameError).
df2=pd.DataFrame({'Text':feature_final})
df2.to_csv("/content/drive/MyDrive/H2_B2I_14/Preprocessed_Text.csv")

# Task 2: Entity Level Sentiment Analysis

In [None]:
# Start of the Task 2 timer.
t4=time.time()

In [None]:
# Lower-case brand names searched for (as whole, space-delimited tokens) in each text.
# 'meizu' and 'razer' replace the earlier misspellings 'miezu' and 'razor', which
# could never match the actual brand names.
mobile_companies = [
    'acer','alcatel','amoi','apple','archos','asus','at&t',
    'benefon','blackberry','blackview','blu','bq',
    'celkon','chea','coolpad',
    'energizer','ericsson','eten',
    'fairphone',
    'gionee','google',
    'honor','hp','htc','huawei',
    'i-mate','i-mobile','icemobile','infinix','innostream','intex',
    'jolla',
    'karbonn','kyocera',
    'lava','leeco','lenovo','lg',
    'maxon','maxwest','meizu','micromax','microsoft','mitac','modu','motorola',
    'neonode','niu','nokia',
    'o2','oneplus','oppo',
    'panasonic',
    'qmobile','qtek',
    'razer','realme',
    'sagem','samsung','sendo','sewon','sharp','sonim','sony','sony-ericsson','spice',
    't-mobile','tcl','tecno','tel.me.','telit','thuraya','toshiba',
    'ulefone',
    'vertu','verykool','vivo','vk mobile','vodafone',
    'wiko','wnd',
    'xcute','xiaomi','xolo',
    'yota','yu',
    'zte',
]

In [None]:
# Load the aspect-based sentiment pipeline; no arguments are passed, so the
# library's defaults are used.
nlp=absa.load()

In [None]:
# Output label for each sentiment id checked by this notebook; any other id is
# reported as 'neutral'.
sentiment_names={2:'positive',1:'negative'}

brand_found=[]
sentiment=[]
for i,x in enumerate(feature_final):
  temp=[]   # brands found in this text
  temp1=[]  # sentiment label for each brand, same order as temp
  # Only texts flagged 1 by Task 1 are searched for brands.
  if(predicted_labels[i]==1):
    # Pad with spaces and detach punctuation so a brand next to '.', ',' etc.
    # still appears as a space-delimited token.
    x=' '+x+' '
    x = re.sub(r'([.,!?()])', r' \1 ', x)
    x = re.sub(r'\s{2,}', ' ', x)
    lowered=x.lower()
    for y in mobile_companies:
      # Locate the whole-word occurrence. Searching for the bare name could land
      # on an earlier substring hit (e.g. 'lg' inside another word) and centre
      # the context window on the wrong place.
      idx=lowered.find(' '+y+' ')
      if(idx==-1):
        continue
      idx+=1  # skip the leading space so idx points at the brand itself
      # Context window of up to 250 characters on each side of the brand, taken
      # from the lower-cased text so the index and the slice refer to the same string.
      x0=lowered[max(idx-250,0):min(idx+250,len(lowered))]
      temp.append(y)
      sent=nlp((x0),aspects=[y])
      code=int(sent.subtasks[y].examples[0].sentiment)
      temp1.append(sentiment_names.get(code,'neutral'))

  brand_found.append(temp)
  sentiment.append(temp1)
  print(i)

In [None]:
# Flatten each per-text list into one comma-separated string of capitalised
# entries; a text with no entries yields an empty string.
brand=[','.join(name.capitalize() for name in names) for names in brand_found]
sent=[','.join(label.capitalize() for label in labels) for labels in sentiment]

In [None]:
# Comma-separated brand names found in each text (empty string when none were found).
op1['Brands_Entity_Identified']=brand

In [None]:
# Comma-separated sentiment labels, in the same order as the brands of each text.
op1['Sentiment_Identified']=sent

In [None]:
# Overwrite Output2.csv with the current contents of op1.
op1.to_csv("/content/drive/MyDrive/H2_B2I_14/Outputs/Output2.csv",index=False)

In [None]:
# End of the Task 2 timer.
t5=time.time()
print(f"Evaluation Time for Task 2 is {t5-t4} seconds")

# Task 3: Headline Generator

In [None]:
!pip install transformers==4.4.2

Please restart the runtime at this point so that the newly installed version of Transformers is loaded.

In [None]:
# `time` is imported again here: the runtime restart requested above clears
# everything imported earlier, and the import cell below does not bring it back.
import time

# Start of the Task 3 timer.
t6=time.time()

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import pandas as pd

In [None]:
# Load the Pegasus model and its tokenizer from the project's Drive folder.
# These assignments reuse the names `model` and `tokenizer` that held the
# DistilBERT classifier and tokenizer in Task 1.
model = PegasusForConditionalGeneration.from_pretrained('/content/drive/MyDrive/H2_B2I_14/Pegasus')
tokenizer = PegasusTokenizer.from_pretrained('/content/drive/MyDrive/H2_B2I_14/Pegasus')

In [None]:
# Reload the Task 1 predictions that were saved to Output1.csv.
op2=pd.read_csv("/content/drive/MyDrive/H2_B2I_14/Outputs/Output1.csv")

In [None]:
# Columns of op2 used below to decide which rows get a headline.
# NOTE: `id` shadows the Python builtin of the same name.
id=op2['Text_ID']
predicted_labels=op2['Mobile_Tech_Flag_Predicted'] 

In [None]:
# Reload the preprocessed texts. Blank cells are parsed by read_csv as NaN
# (a float), so they are turned back into empty strings to keep every entry a str.
text=pd.read_csv("/content/drive/MyDrive/H2_B2I_14/Preprocessed_Text.csv")['Text'].fillna('').tolist()

In [None]:
# Generate a headline for every row whose id mentions 'article' and whose Task 1
# flag is 1; all other rows get an empty string so the list stays aligned with op2.
headlines_gen = []
for c,y in enumerate(text):
  wants_headline=('article' in id[c]) and predicted_labels[c]==1
  # Rows whose text is missing (NaN after the CSV round trip) or blank cannot be
  # summarised; they are skipped instead of being handed to the tokenizer.
  has_text=isinstance(y,str) and y.strip()!=''
  if(wants_headline and has_text):
    temp = tokenizer(y, return_tensors = 'pt',padding=True,truncation=True)
    summ = model.generate(input_ids=temp['input_ids'],
                          attention_mask=temp['attention_mask'],
                          early_stopping=True)
    # Only the first returned sequence is decoded and kept.
    pred = tokenizer.decode(summ[0], skip_special_tokens=True)
    headlines_gen.append(pred)
  else:
    headlines_gen.append("")
  # Progress is reported 1-based, as before.
  print(f"Example {c+1}")

In [None]:
# One entry per row of op2: the generated headline, or an empty string for skipped rows.
op2['Headline_Generated_Eng_Lang']=headlines_gen

In [None]:
# Overwrite Output1.csv with the headline column included.
op2.to_csv("/content/drive/MyDrive/H2_B2I_14/Outputs/Output1.csv", index=False)

In [None]:
# End of the Task 3 timer.
t7=time.time()
print(f"Evaluation Time for Task 3 is {t7-t6}")

In [None]:
# Total = sum of the four stage durations (preprocessing, Task 1, Task 2, Task 3).
# Each duration is computed as end - start before summing, which avoids adding
# large epoch timestamps together. Timers recorded before the runtime restart are
# no longer defined afterwards; instead of failing with a NameError, those stages
# are reported as missing and left out of the total.
stage_marks=[('t0','t1'),('t2','t3'),('t4','t5'),('t6','t7')]
missing_stages=[f"{start}/{end}" for start,end in stage_marks
                if start not in globals() or end not in globals()]
total_eval_time=sum(globals()[end]-globals()[start] for start,end in stage_marks
                    if start in globals() and end in globals())
if missing_stages:
  print(f"Warning: timers {', '.join(missing_stages)} are not defined in this session; "
        "the total below excludes those stages")
print(f"Total Evaluation Time is {total_eval_time}")