In [180]:
#!python -m spacy download en_core_web_lg

In [181]:
#!pip install contractions


### Import Libraries

In [127]:
import spacy
from spacy import displacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from string import punctuation
from spacy.matcher import Matcher 
from spacy.tokens import Span 
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import contractions
import numpy as np


### Define a function to load data from text file

In [183]:
def load_data(file):
    """Read a colon-delimited transcript into a two-column DataFrame.

    The input is treated as having no header row; each line is parsed as
    ``speakerID:utteranceText``.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        DataFrame with the columns ``speakerID`` and ``utteranceText``.
    """
    # NOTE(review): every ':' is treated as a delimiter, so an utterance that
    # itself contains a colon will not fit the two named columns -- confirm
    # the transcripts never contain one.
    column_names = ['speakerID', 'utteranceText']
    transcript = pd.read_csv(file, sep=":", names=column_names, header=None)
    return transcript

### use the load_data function to load the content in the transcript.txt file

In [184]:
# Parse the raw transcript into one row per line (speakerID, utteranceText).
df=load_data("transcript.txt")

### Define a function to expand all shortened words

In [185]:
def expand_contractions(sentence):
    """Return ``sentence`` with contractions expanded word by word.

    The sentence is split on whitespace, each piece is passed through
    ``contractions.fix``, and the pieces are re-joined with single spaces.
    """
    return ' '.join(contractions.fix(word) for word in sentence.split())


### Apply the expand_contractions function to the dataframe to expand all shortened words. For example, "I'm" becomes "I am".

In [10]:
# str() coerces any non-string cell before it is split into words.
expanded_utterances = [
    expand_contractions(str(utterance)) for utterance in df["utteranceText"]
]
df["expanded"] = expanded_utterances

### preview the dataframe after expanding all contractions. 

1.   speakerID stores the original speakerID from the transcript.txt file
2.   utteranceText column stores the text or response of that speaker
3.   expanded column stores the utteranceText after the expand_contractions function has been applied to it






In [11]:
df

Unnamed: 0,speakerID,utteranceText,expanded
0,Speaker1,"Hello, I’d like to introduce myself. My name ...","Hello, I would like to introduce myself. My na..."
1,Speaker2,"Hi, I’m David Smith and I work in book sales ...","Hi, I am David Smith and I work in book sales ..."
2,Speaker3,"Thanks David. So as David said, I’m Sally, Sa...","Thanks David. So as David said, I am Sally, Sa..."
3,Speaker4,"Nice to meet everyone, although I already kno...","Nice to meet everyone, although I already know..."
4,Speaker1,"Thanks everyone, and yes just to bring David ...","Thanks everyone, and yes just to bring David a..."
5,Speaker3,"Ah, that explains why we’re here!","Ah, that explains why we are here!"
6,Speaker2,Yes.,Yes.


### load our spacy model

In [14]:
# Load the en_core_web_lg pipeline once; the cells and helpers below reuse `nlp`.
nlp =spacy.load("en_core_web_lg")

### Use displacy from spacy to preview the entities identified

In [15]:
for utterance in df["expanded"]:
    # Render the entities spaCy finds in each expanded utterance.
    displacy.render(nlp(utterance), style="ent", jupyter=True)





### Define a function to extract the part-of-speech tags identified in a given sentence

In [16]:
def getPos(doc):
    """Return the ``pos_`` tag of every token in ``doc``, in token order."""
    return [token.pos_ for token in doc]

### Define a function to extract the PERSON entities from a text and keep those that pass the part-of-speech check

In [128]:
def generate_entities_relation(txt):
    """Collect the PERSON entities in ``txt`` that pass the part-of-speech check.

    ``txt`` is split into sentences and each sentence is parsed again on its
    own, so the entity offsets handed to ``check_part_of_speach`` are token
    positions within that sentence.

    Args:
        txt: Text of one utterance.

    Returns:
        DataFrame with a ``person`` column (entity text) and a ``check``
        column (the check result, which is truthy for every kept row).
    """
    accepted_names = []
    accepted_flags = []

    for sentence in list(nlp(txt).sents):
        sentence_doc = nlp(str(sentence))
        for entity in sentence_doc.ents:
            if entity.label_ != 'PERSON':
                continue
            tags = getPos(sentence_doc)
            is_speaker = check_part_of_speach(tags, entity.start, entity.end)
            if is_speaker == True:
                accepted_names.append(entity.text)
                accepted_flags.append(is_speaker)

    return pd.DataFrame({"person": accepted_names, "check": accepted_flags})

### Define a function that identifies the speakers in a sentence. The function accepts an entity when any one of three conditions holds.

*   If the identified entity is a person and the three parts of speech immediately before that entity are "PRON, NOUN, AUX" (e.g. "My name is ...")
*   If the identified entity is a person and the two parts of speech immediately before that entity are "PRON, AUX" (e.g. "I am ...")

*   If the identified entity is a person and the four parts of speech immediately after that entity are "PUNCT, PRON, AUX, PRON"



In [129]:
def check_part_of_speach(pos,start,end):
    """Decide whether an entity sits in one of the self-introduction patterns.

    Args:
        pos: Sequence of part-of-speech tags, one per token of the sentence.
        start: Index in ``pos`` of the entity's first token.
        end: Index in ``pos`` of the first token after the entity.

    Returns:
        ``True`` when the entity's first token is tagged ``PROPN`` and one of
        these tag sequences is adjacent to it, otherwise ``False``:

        * ``PRON NOUN AUX`` immediately before the entity
        * ``PRON AUX`` immediately before the entity
        * ``PUNCT PRON AUX PRON`` immediately after the entity
    """
    # Every pattern requires the entity itself to start with a proper noun.
    if not 0 <= start < len(pos) or pos[start] != 'PROPN':
        return False

    # Compare slices instead of single indexed tokens: a slice that does not
    # fit inside the sentence is simply shorter and compares unequal, so a
    # pattern near the sentence edge can neither read a token from the other
    # end (negative index) nor raise IndexError. The explicit ``start >= n``
    # guards keep a negative slice start from counting back from the end.
    if start >= 3 and tuple(pos[start - 3:start]) == ('PRON', 'NOUN', 'AUX'):
        return True
    if start >= 2 and tuple(pos[start - 2:start]) == ('PRON', 'AUX'):
        return True
    return tuple(pos[end:end + 4]) == ('PUNCT', 'PRON', 'AUX', 'PRON')
      



### define a function to extract all speakers and return the results as a dataframe

In [130]:
def extract_speakers(df):
    """Find the accepted speaker names for every utterance in ``df``.

    Args:
        df: DataFrame with a ``speakerID`` column and an ``expanded`` column
            holding the utterance text to analyse.

    Returns:
        DataFrame with the columns ``speakerID``, ``person`` and ``check``:
        one row per entity returned by ``generate_entities_relation``,
        labelled with the ``speakerID`` of the row it came from.
    """
    columns = ['speakerID', 'person', 'check']
    records = []

    for _, row in df.iterrows():
        speakers = generate_entities_relation(row["expanded"])
        for person, check in zip(speakers["person"], speakers["check"]):
            records.append(
                {"speakerID": row["speakerID"], "person": person, "check": check}
            )

    # Build the frame once from the collected records: DataFrame.append was
    # removed in pandas 2.0, and growing a frame inside the loop copies it on
    # every iteration.
    return pd.DataFrame(records, columns=columns)

### use the extract_speakers function to extract all the speakers

In [132]:
# Run speaker detection over every row of the transcript frame.
speakers_df=extract_speakers(df)

### Preview the extracted speakers

In [144]:
speakers_df

Unnamed: 0,speakerID,person,check
0,Speaker1,Alice Roberts,True
1,Speaker2,David Smith,True
2,Speaker3,Sally,True
3,Speaker4,Paul Owens,True


In [133]:
speakers_df.describe()

Unnamed: 0,speakerID,person,check
count,4,4,4
unique,4,4,1
top,Speaker1,Alice Roberts,True
freq,1,1,4


### Define a function to search and return the name of a person based on the speakerID

In [163]:
def search_speaker(df,val):
    """Return the first ``person`` recorded for the speaker ID ``val``.

    Args:
        df: DataFrame with ``speakerID`` and ``person`` columns.
        val: Speaker ID to look up.

    Returns:
        The ``person`` value of the first row whose ``speakerID`` equals ``val``.

    Raises:
        IndexError: If no row in ``df`` has that speaker ID.
    """
    matches = df.loc[df['speakerID'] == val, 'person']
    if matches.empty:
        # Same exception type an empty lookup raised before, but the message
        # now names the ID that had no match instead of a generic
        # out-of-bounds error.
        raise IndexError("no person found for speakerID {!r}".format(val))
    return matches.values[0]

### add a new column "speakerName" to our dataframe to store all speakerNames

In [165]:
# Look up the detected name for the speaker ID of every utterance.
speaker_names = [
    search_speaker(speakers_df, str(speaker_id)) for speaker_id in df["speakerID"]
]
df["speakerName"] = speaker_names

### extract the speakerNames and utteranceText for the dataframe and save it as transcript_df

In [174]:
# Keep only the resolved speaker name and the original (unexpanded) utterance.
transcript_df=df[['speakerName','utteranceText']]

### preview transcript_df

In [175]:
transcript_df

Unnamed: 0,speakerName,utteranceText
0,Alice Roberts,"Hello, I’d like to introduce myself. My name ..."
1,David Smith,"Hi, I’m David Smith and I work in book sales ..."
2,Sally,"Thanks David. So as David said, I’m Sally, Sa..."
3,Paul Owens,"Nice to meet everyone, although I already kno..."
4,Alice Roberts,"Thanks everyone, and yes just to bring David ..."
5,Sally,"Ah, that explains why we’re here!"
6,David Smith,Yes.


### Write transcript_df to the text file "transcript_processed.txt" using pandas.to_csv, setting the separator to ':'

In [186]:
# header=None and index=None suppress the header row and the index column;
# ':' is the same delimiter load_data uses when reading the input transcript.
transcript_df.to_csv(r'transcript_processed.txt', header=None, index=None, sep=':')