<a href="https://colab.research.google.com/github/beatriceyapsm/temporaltest/blob/main/CombiModels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Import Python Libraries and Define Global Variables

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Github/Group-9C-Capstone

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Github/Group-9C-Capstone


In [2]:
#Import Pandas and Numpy Python Libraries
import pandas as pd        #data analysis and manipulation library for Python
import numpy as np         #mathematical operations over arrays

#Global variables
train_data = []            #sentence for training with labels 
train_target = []          #labels for training data
test_data = []             #sentence for testing with labels  
test_target = []           #labels for testing data

In [3]:
#Import files into a dataframe
def import_text(file_txt):
    
    colnames=['ID', 'Text'] 
    
    df = pd.read_csv(file_txt
                       ,skip_blank_lines=True   #input files have emplty lines
                       ,header=None             #no haeders
                       ,sep='\t'                #tab delimited
                       ,engine='python'         #engine
                       ,quotechar='^'           #if there are doublequotes in the text
                       ,comment='Comment:'      #this is for human reader, we don't need
                       ,names=colnames          #preset columns we need
                      )
    print("Number of rows: ", len(df.index))

    return df

In [4]:
#clean ""
def clean_doublequotes(df):
    return df.replace('"', '', regex=True)

#BEATRICE replace all other classifications with "Other"//so we do not lose these samples
def clean_class(df):
    
    return df.replace(regex=['Product-Producer','Entity-Origin','Instrument-Agency','Component-Whole','Content-Container','Entity-Destination','Member-Collection','Message-Topic'], value='Other')

In [5]:
#clean tags e.g. (e2,e1) - to check why?
def clean_tags(df_column,braket_type):
    
    if braket_type == 'angle':
        pattern = r'<.*?>'
    elif braket_type == 'round':
        pattern = r'(\(.*?\))'
    else:
        raise Exception("angle or round")
    
    return df_column.str.replace(pattern, '', regex=True)

In [6]:
#create tuple of keys
keys = ('Other','Cause-Effect')
#keys = ('Other','Cause-Effect','Product-Producer','Entity-Origin','Instrument-Agency','Component-Whole','Content-Container','Entity-Destination','Member-Collection','Message-Topic')
#for i in range(len(keys)): print(i, keys[i])

In [7]:
def prepare_file(file_name,use):
    
    #Import training file into a dataframe
    df = import_text(file_name)

    #clean ""
    df = clean_doublequotes(df)

    #BEATRICE make binary
    df = clean_class(df)

    # combine rows 1&2
    df['Classification'] = df['ID'].shift(-1)

    #remove leftover rows
    df = df[df.Text.notna()]

    #clean tags e.g. <e2> - better option would be to keep them and use for more precise prediction
    df.Text = clean_tags(df.Text,braket_type='angle')

    #clean tags e.g. (e2,e1) - better option would be to keep them and use for more precise prediction
    df.Classification = clean_tags(df.Classification,braket_type='round')

    #Map Keys to numbers
    df['Classification_ID'] = df.Classification.map(lambda x: keys.index(x))

    #to use in thes same file
    if use == 'train':
        global train_data
        train_data = df.Text
        
        global train_target
        train_target = df.Classification_ID
    elif use == 'test':
        global test_data
        test_data = df.Text
        
        global test_target
        test_target = df.Classification_ID

    #check
    print('Output rows:', len(df.index))
  
    return df

In [8]:
def prepare_newfile(file_name,use):
    
    #Import training file into a dataframe
    df = import_text(file_name)

    #clean ""
    df = clean_doublequotes(df)

    #clean tags e.g. <e2> 
    df.Text = clean_tags(df.Text,braket_type='angle')

    #clean tags e.g. (e2,e1)
    df.Classification = clean_tags(df.Classification,braket_type='round')


    #to use in thes same file
    if use == 'train':
        global train_data
        train_data = df.Text
        
        global train_target
        train_target = df.Classification_ID
    elif use == 'test':
        global test_data
        test_data = df.Text

    #check
    print('Output rows:', len(df.index))
  
    return df

In [9]:
#prepare training file
df_train = prepare_file('semeval2010task8_train.txt','train')

df_train.head(5)

Number of rows:  16000
Output rows: 8000


Unnamed: 0,ID,Text,Classification,Classification_ID
0,1,The system as described above has its greatest...,Other,0
2,2,The child was carefully wrapped and bound into...,Other,0
4,3,The author of a keygen uses a disassembler to ...,Other,0
6,4,A misty ridge uprises from the surge.,Other,0
8,5,The student association is the voice of the un...,Other,0


In [10]:
df_test = prepare_file('semeval2010task8_test.txt','test')

df_test.head(5)

Number of rows:  5434
Output rows: 2717


Unnamed: 0,ID,Text,Classification,Classification_ID
0,8001,The most common audits were about waste and re...,Other,0
2,8002,The company fabricates plastic chairs.,Other,0
4,8003,The school master teaches the lesson with a st...,Other,0
6,8004,The suspect dumped the dead body into a local ...,Other,0
8,8005,Avian influenza is an infectious disease of bi...,Cause-Effect,1


In [11]:
#prepare predicting file
df_pred = pd.read_csv('NewsArticlesFile.csv', encoding='cp1252') 

df_pred.head(5)

Unnamed: 0,id,text
0,1,"Prior to Ford, two more American firms- Genera..."
1,2,The U.S. carmaker entered India 25 years ago b...
2,3,Ford is shutting its car factories in India af...
3,4,The government will give about 260 billion rup...
4,5,More electric vehicles in India soon as automa...


In [12]:
global pred_data
pred_data = df_pred.text

###Support Vector Machine Linear Kernel with Gridsearch (2/3-gram)

In [13]:
# use the TF-IDF vectorizer and create a pipeline 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
#Reference: https://gist.github.com/dspp779/5a9597e2d8a2518b80fb0ad191ea8463

In [14]:
#Build a vectorizer / classifier pipeline that filters out tokens that are too rare or too frequent
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [15]:
#Build a grid search to find out whether 2-grams or 3-grams are more useful. 
#Fit the pipeline on the training set using grid search for the parameters
parameters = {
    'vect__ngram_range': [(1, 2), (1, 3)],
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(train_data, train_target)

GridSearchCV(estimator=Pipeline(steps=[('vect', TfidfVectorizer()),
                                       ('clf', LinearSVC())]),
             n_jobs=-1, param_grid={'vect__ngram_range': [(1, 2), (1, 3)]})

In [16]:
#Predict the outcome on the testing set and store it in a variable named y_predicted
y_predicted = grid_search.predict(test_data)

In [17]:
accuracy_for_test_keys = np.mean(y_predicted == test_target)
print("SVM Model Accuracy = {} %".format(accuracy_for_test_keys*100))

SVM Model Accuracy = 96.72432830327567 %


In [21]:
#Predict the outcome on the new set and store it in a variable named p_predicted
p_predicted = grid_search.predict(pred_data)

In [22]:
df_pred['Predicted'] = p_predicted
df_pred.head(10)


Unnamed: 0,id,text,Predicted
0,1,"Prior to Ford, two more American firms- Genera...",0
1,2,The U.S. carmaker entered India 25 years ago b...,0
2,3,Ford is shutting its car factories in India af...,0
3,4,The government will give about 260 billion rup...,0
4,5,More electric vehicles in India soon as automa...,0
5,6,GLASGOW — At least six major automakers — incl...,0
6,7,India’s push to become a hub for semiconductor...,0
7,8,Apple has made building manufacturing faciliti...,0
8,9,The Tata Avinya concept car is unveiled during...,0
9,10,Tata Motors became the country's second larges...,0


In [34]:
#BEATRICE Return Predicted Cause-Effect Statements 
df_ceff=df_pred[(df_pred.Predicted.eq(1))]
df_ceff.head(10)

Unnamed: 0,id,text,Predicted
202,203,Japanese automaker Nissan returned to profitab...,1
589,590,Hong Kong CNN Business —China’s Sichuan provin...,1
711,27,"MILAN, Oct 18 (Reuters) - Maserati on Monday d...",1
732,48,Toyota and other automakers continue to strugg...,1
832,156,"TOKYO, Nov 1 (Reuters) - Japan's automobile sa...",1
871,195,Japan’s three biggest automakers are facing th...,1
873,197,Toyota is suspending more production due to pa...,1
879,203,"TOKYO, June 1 (Reuters) - Renesas Electronics ...",1
927,251,"TOKYO, Sept 21 (Reuters) - Honda Motor Co (726...",1
964,288,Toyota and other automakers continue to strugg...,1


### SPACY

In [23]:
!python3 -m spacy download en_core_web_trf
!pip install spacy-transformers

2022-10-17 03:50:11.408311: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-trf==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.4.0/en_core_web_trf-3.4.0-py3-none-any.whl (460.3 MB)
[K     |████████████████████████████████| 460.3 MB 32 kB/s 
[?25hCollecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.8-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.3 MB/s 
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 9.9 MB/s 
[?25hCollecting transformers<4.22.0,>=3.4.0
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |█

In [24]:
# Load SPACY 
import spacy
#from spacy.lang.en import English
from spacy import displacy
nlp=spacy.load('en_core_web_sm')
import pandas as pd
import numpy as np
import re

In [26]:
#load roberta transformer
trf = spacy.load('en_core_web_trf')

In [35]:
df_temptest=df_ceff.head(10)
df_temptest

Unnamed: 0,id,text,Predicted
202,203,Japanese automaker Nissan returned to profitab...,1
589,590,Hong Kong CNN Business —China’s Sichuan provin...,1
711,27,"MILAN, Oct 18 (Reuters) - Maserati on Monday d...",1
732,48,Toyota and other automakers continue to strugg...,1
832,156,"TOKYO, Nov 1 (Reuters) - Japan's automobile sa...",1
871,195,Japan’s three biggest automakers are facing th...,1
873,197,Toyota is suspending more production due to pa...,1
879,203,"TOKYO, June 1 (Reuters) - Renesas Electronics ...",1
927,251,"TOKYO, Sept 21 (Reuters) - Honda Motor Co (726...",1
964,288,Toyota and other automakers continue to strugg...,1


In [36]:
df_temptest['TRFDates'] = df_temptest['text'].apply(lambda sent: [(ent.text) for ent in trf(sent).ents if ent.label_ == "DATE"])   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [37]:
df_temptest

Unnamed: 0,id,text,Predicted,TRFDates
202,203,Japanese automaker Nissan returned to profitab...,1,"[the last fiscal year, three years, Thursday, ..."
589,590,Hong Kong CNN Business —China’s Sichuan provin...,1,"[six days, six decades, July, Monday, Tuesday,..."
711,27,"MILAN, Oct 18 (Reuters) - Maserati on Monday d...",1,"[Oct 18, Monday, the spring of 2022, November ..."
732,48,Toyota and other automakers continue to strugg...,1,"[annual, May, Wednesday, next month]"
832,156,"TOKYO, Nov 1 (Reuters) - Japan's automobile sa...",1,"[Nov 1, October, a year earlier, the fourth st..."
871,195,Japan’s three biggest automakers are facing th...,1,"[the coming years, 2011]"
873,197,Toyota is suspending more production due to pa...,1,"[June, June, August, fiscal year]"
879,203,"TOKYO, June 1 (Reuters) - Renesas Electronics ...",1,"[June 1, Tuesday, mid-June, the previously est..."
927,251,"TOKYO, Sept 21 (Reuters) - Honda Motor Co (726...",1,"[Sept 21, August, September, early October]"
964,288,Toyota and other automakers continue to strugg...,1,"[annual, May, Wednesday, next month]"


###EVENTS
https://www.qualicen.de/natural-language-processing-timeline-extraction-with-regexes-and-spacy/

In [None]:
pip install daterangeparser

In [None]:
import requests
import IPython
from daterangeparser import parse

In [None]:
#response = requests.get('https://raw.githubusercontent.com/qualicen/timeline/master/history_of_germany.txt')
#text = response.text
#print('Loaded {} lines'.format(text.count('\n')))

In [None]:
doc = nlp(raw_text)
for ent in filter(lambda e: e.label_=='DATE',doc.ents):
  print(ent.text)

In [None]:
#doc = nlp("After that, public records revealed that Musk had informed the Delaware Chancery Court that he would complete a $44 billion acquisition of Twitter in 10/10/2022 October, a deal he had been trying to evade for months.")
#IPython.display.HTML(spacy.displacy.render(doc,style="dep", page=True, options={"compact":True}))

https://downloads.cs.stanford.edu/nlp/software/dependencies_manual.pdf

In [None]:
def dep_subtree(token, dep):
  deps =[child.dep_ for child in token.children]
  child=next(filter(lambda c: c.dep_==dep, token.children), None)
  if child != None:
    return " ".join([c.text for c in child.subtree])
  else:
    return ""

# to remove citations, e.g. "[91]" as this makes problems with spaCy
p = re.compile(r'\[\d+\]')
  

In [None]:
#original code on parsing events
def extract_events_spacy(line):
  line=p.sub('', line)
  events = []
  doc = nlp(line)
  for ent in filter(lambda e: e.label_=='DATE',doc.ents):
    try:
      start,end = parse(ent.text)
    except:
      # could not parse the dates, hence ignore it
      continue
    current = ent.root
    while current.dep_ != "ROOT":
      current = current.head
    desc = " ".join(filter(None,[
                                 dep_subtree(current,"nsubj"),
                                 dep_subtree(current,"nsubjpass"),
                                 dep_subtree(current,"auxpass"),
                                 dep_subtree(current,"amod"),
                                 dep_subtree(current,"det"),
                                 current.text, 
                                 dep_subtree(current,"acl"),
                                 dep_subtree(current,"dobj"),
                                 dep_subtree(current,"attr"),
                                 dep_subtree(current,"advmod")]))
    events = events + [(start,ent.text,desc)]
    print (events)
  return events

In [None]:
def extract_all_events(text, extract_function):
  all_events = []
  processed = 0
  # Process the events
  for processed,line in enumerate(text.splitlines()):
    events = extract_function(line)
    all_events = all_events + events

  print("Extracted {} events.".format(len(all_events)))

  # Print out the events
  for event in all_events:
    print(event)

  devent= pd.DataFrame(all_events)
  return devent

In [None]:
def extract_events_spacytest(line):
  line=p.sub('', line)
  events = []
  doc = nlp(line)
  for ent in filter(lambda e: e.label_=='DATE',doc.ents):

    current = ent.root
    while current.dep_ != "ROOT":
      current = current.head
    desc = " ".join(filter(None,[
                                 dep_subtree(current,"nsubj"),
                                 dep_subtree(current,"csubj"),
                                 dep_subtree(current,"auxpass"),
                                 dep_subtree(current,"pobj"),
                                 current.text,
                                 dep_subtree(current,"prep"),
                                 dep_subtree(current,"dobj"),
                                 dep_subtree(current,"advmod"),
                                 dep_subtree(current,"xcomp"),
                                 dep_subtree(current,"acl"),
                                 dep_subtree(current,"attr")]))
    events = events + [(ent.text,desc)]
  return events

In [None]:
def extract_events_spacytesttrf(line):
  line=p.sub('', line)
  events = []
  doc = trf(line)
  for ent in filter(lambda e: e.label_=='DATE',doc.ents):

    current = ent.root
    while current.dep_ != "ROOT":
      current = current.head
    desc = " ".join(filter(None,[
                                 dep_subtree(current,"nsubj"),
                                 dep_subtree(current,"csubj"),
                                 dep_subtree(current,"auxpass"),
                                 dep_subtree(current,"pobj"),
                                 current.text,
                                 dep_subtree(current,"prep"),
                                 dep_subtree(current,"dobj"),
                                 dep_subtree(current,"advmod"),
                                 dep_subtree(current,"xcomp"),
                                 dep_subtree(current,"acl"),
                                 dep_subtree(current,"attr")]))
    events = events + [(ent.text,desc)]
  return events

In [None]:
text = raw_text

In [None]:
#test for edited subtree
extract_all_events(text,extract_events_spacytest)

In [None]:
#test w roberta transformer
extract_all_events(text,extract_events_spacytesttrf)