In [1]:
import pandas as pd
import os, glob

***
### Define blocks

In [2]:
def readfile(fname):
    """Read a text file and return its lines.

    Parameters
    ----------
    fname : str or path-like
        Path of the file to read. The file is decoded as Latin-1.

    Returns
    -------
    list of str
        The lines exactly as produced by ``readlines`` (each line keeps its
        trailing newline when it has one).
    """
    # The context manager closes the handle even when reading raises.
    with open(fname, encoding='Latin-1') as file_data:
        return file_data.readlines()

In [3]:
def get_header_type(s):
    """Map a participant-section header to the pair of headers used by that style.

    Parameters
    ----------
    s : str
        Candidate header text. It is compared exactly (no trimming, no case
        folding).

    Returns
    -------
    list of str or None
        ``[company_header, other_participants_header]`` for a recognised
        style; ``None`` (after printing a notice) for anything else.
    """
    if s == 'Company Participants':
        return ['Company Participants', 'Conference Call Participants']
    if s == 'Executives':
        return ['Executives', 'Analysts']

    # The previous version wrapped the comparisons in a try/except that named
    # an undefined exception and could never trigger, so unknown headers fell
    # through silently. Report them explicitly; the return value stays None.
    print('Header for list of Participants not recognized')
    return None

In [4]:
def get_participants(df,lst,xtra_headers):
    """Extract speaker names from the participant sections of a transcript.

    Parameters
    ----------
    df : pandas.DataFrame
        Transcript with one line per row in a ``'Text'`` column. Row labels
        are used as consecutive integers, so a default 0..n-1 index is
        assumed.
    lst : sequence of str
        ``lst[0]`` is the header that opens the company section and ``lst[1]``
        the header that opens the section for the other participants.
    xtra_headers : iterable of str
        Additional labels appended to the list of all names.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(csuite_names, all_names)``: the company names, and the company
        names followed by the other names and ``xtra_headers``.

    Raises
    ------
    ValueError
        If one of the two headers, or an ``'Operator'`` row, is missing.
    """
    texts = df['Text']

    def first_row_equal_to(value):
        # Label of the first row whose text equals `value`.
        matches = df.index[(texts == value).values]
        if len(matches) == 0:
            raise ValueError("no row with text %r found in transcript" % (value,))
        return int(matches[0])

    def names_between(start, stop):
        # First two words of every row with a label in [start, stop).
        names = []
        for row in range(start, stop):
            words = texts.loc[row].split()
            if words:  # a whitespace-only row carries no name
                names.append(' '.join(words[:2]))
        return names

    idx_company = first_row_equal_to(lst[0])
    idx_others  = first_row_equal_to(lst[1])
    # 'Operator' appears many times; the first occurrence closes the
    # participant list. Search the frame that was passed in, not a global.
    idx_operator = first_row_equal_to('Operator')

    csuite_names = names_between(idx_company + 1, idx_others)
    other_names  = names_between(idx_others + 1, idx_operator)

    other_names.extend(xtra_headers)
    all_names = csuite_names + other_names

    return csuite_names, all_names

In [5]:
# loop through text dataframe and extract only texts from company representatives:
def get_transcript(df, csuite_names, all_names, max_lines=19):
    """Collect the lines spoken by company representatives.

    For every name in ``csuite_names`` that equals the text of at least two
    rows, the rows following each such occurrence are gathered until a row
    starts with any entry of ``all_names``, the window of ``max_lines`` rows
    is used up, or the transcript ends.

    Parameters
    ----------
    df : pandas.DataFrame
        Transcript with one line per row in a ``'Text'`` column. Row labels
        are used as consecutive integers.
    csuite_names : iterable of str
        Names whose lines are collected.
    all_names : iterable of str
        Prefixes that mark the start of another speaker's turn.
    max_lines : int, optional
        Maximum number of rows inspected after each occurrence of a name.
        The default of 19 matches the previously hard-coded window
        (rows ``idx+1`` through ``idx+19``).

    Returns
    -------
    list of str
        The collected lines, grouped by name in ``csuite_names`` order.
    """
    corporate_text = []
    texts = df['Text']
    tup_all_names = tuple(all_names)

    for name in csuite_names:
        idx_csuite = df.index[(texts == name).values]

        # Kept from the original logic: a name matching fewer than two rows
        # contributes nothing.
        if len(idx_csuite) <= 1:
            continue

        for idx in idx_csuite:
            first = int(idx) + 1
            for i in range(first, first + max_lines):
                # Stop at the end of the transcript instead of raising a
                # KeyError when a turn begins near the last row.
                if i not in texts.index:
                    break
                line = texts.loc[i]
                if line.startswith(tup_all_names):
                    break
                corporate_text.append(line)

    return corporate_text

In [6]:
def create_df(text_list,q_list):
    """Build one long DataFrame from per-document line lists and their quarter labels.

    Parameters
    ----------
    text_list : sequence of sequences of str
        One inner sequence of text lines per document.
    q_list : sequence
        Quarter label for each document, in the same order as ``text_list``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Text'`` and ``'Quarter'`` with one row per text line and a
        fresh 0..n-1 index. Empty input gives an empty frame that still has
        both columns.

    Raises
    ------
    ValueError
        If ``text_list`` and ``q_list`` differ in length.
    """
    # Work on the arguments; the previous body read module-level lists and
    # ignored what the caller passed in.
    if len(text_list) != len(q_list):
        raise ValueError(
            "text_list and q_list must have the same length "
            "(got %d and %d)" % (len(text_list), len(q_list))
        )

    # Gather all rows first and build the frame once, rather than growing it
    # with repeated concatenation.
    rows = [
        (text, quarter)
        for texts, quarter in zip(text_list, q_list)
        for text in texts
    ]
    # An ordered list of column names; a set has no defined order.
    return pd.DataFrame(rows, columns=['Text', 'Quarter'])

***
### Main program
This program prepares the documents necessary for creating a corpus (or corpora).
- It leverages Quarterly Earnings Calls Transcripts that I manually scraped from the SeekingAlpha financial website. I believe that website incorporates subtle deliberate changes to their pages to deter automated web scraping.

In [51]:
ticker  = 'amd'
company = 'AMD'

transcript_df = pd.DataFrame()
transcript_list = []
quarter_list = []

# Labels added to the participant names when detecting the end of a speaker's turn.
other_headers = ['Operator','Question-and-Answer Session']

# glob returns matches in arbitrary order; sort so that re-runs are reproducible.
for file in sorted(glob.glob('data/semiconductor/'+ticker+'/*.txt')):
    # The quarter tag is the last six characters of the file name without its
    # extension (e.g. 'Q12015'). Parse the base name only, so that dots
    # elsewhere in the path cannot interfere.
    quarter_str = os.path.splitext(os.path.basename(file))[0][-6:]

    # Read the scraped file and keep every non-blank line as one row.
    # rstrip('\n') removes only the newline, so a final line without a
    # trailing newline keeps its last character.
    raw_text = readfile(file)
    df1_text = pd.DataFrame([line.rstrip('\n') for line in raw_text if line != '\n'], columns=['Text'])

    # Determine which header convention is used for participants.
    # The only requirement is that the participants header shows up in the
    # 2nd row (which it nearly always does).
    headers = get_header_type(df1_text['Text'][1])
    if headers is None:
        # Unknown layout: skip this file rather than aborting the whole run.
        print('Skipping ' + file + ': participants header not recognized')
        continue

    # Identify all participants on the call:
    company_reps, all_reps = get_participants(df1_text,headers,other_headers)

    # Keep only the text spoken by company representatives. The quarter is
    # recorded alongside it so the two lists always stay aligned.
    transcript_list.append(get_transcript(df1_text, company_reps, all_reps))
    quarter_list.append(quarter_str)

In [52]:
# Stack every call's extracted lines into one frame and tag each row with the company name.
transcript_df = create_df(transcript_list, quarter_list).assign(Company=company)

In [53]:
# Remove the restriction on maximum column width for display in Jupyter notebooks.
# None means "no limit"; the former sentinel -1 is deprecated (pandas >= 1.0).
pd.set_option('display.max_colwidth', None)

transcript_df.head(200)

Unnamed: 0,Text,Quarter,Company
0,"Thank you, and welcome to AMD's first quarter conference call. By now you should have had the opportunity to review a copy of our earnings release and the CFO commentary and accompanying slides. If you have not reviewed these documents, they can be found on AMD's website at ir.amd.com.",Q12015,AMD
1,"Participants on today's conference call are Lisa Su, our President and Chief Executive Officer, and Devinder Kumar, our Senior Vice President and Chief Financial Officer.",Q12015,AMD
2,This is a live call and will be replayed via webcast on amd.com.,Q12015,AMD
3,"I would like to take this opportunity to highlight a few dates for you. AMD will host its Financial Analyst Day on May 6 in New York. Devinder Kumar will present at the Jefferies 2015 Technology Media and Telecom Conference on May 13 in Miami. And our second quarter quiet tired will begin at the close of business on Friday, June 12, 2015.",Q12015,AMD
4,"Before we begin, let me remind everyone that today's discussion contains forward-looking statements based on the environment as we currently see it. Those statements are based on current beliefs, assumptions and expectations, speak only as of the current date and as such involve risks and uncertainties that could cause actual results to differ materially from our current expectations.",Q12015,AMD
5,"As a reminder, beginning in the first quarter 2015, our non-GAAP results exclude the impact of stock-based compensation. Additionally, please note that non-GAAP financial measures referenced during this call are reconciled to their most directly comparable GAAP financial measure in the press release and CFO commentary posted on our website. Please refer to the cautionary statements in today's earnings press release and CFO commentary for more information, and you'll also find detailed discussions about our risk factors in our filings with the SEC, in particular AMD's annual report on Form 10-K for the year ended December 27, 2014.",Q12015,AMD
6,"Now with that, I'd like to hand the call over to Lisa. Lisa?",Q12015,AMD
7,"Thank you, Devinder.",Q12015,AMD
8,"Okay, operator, we'll take the next question please.",Q12015,AMD
9,"Great. Operator, that concludes today's earnings conference call. If you could close the call. And we'd like to thank everybody for participating.",Q12015,AMD


In [54]:
# save version of corpus before preprocessing:
#transcript_df.to_csv(ticker+'_corpus_before_preprocessing.csv')

In [55]:
# Number of rows (text lines) in the combined transcript frame.
transcript_df.shape[0]

730

#### Now we have each company's transcripts over 5 years as an individual corpus.
- Each can be pre-processed to get cleaned-up data, prior to combining them into a larger, more inclusive corpus (or corpora) for topic modeling

***
### Text Cleaning & Preprocessing (over each corpus)

In [56]:
# Text preprocessing steps - remove numbers, capital letters and punctuation (rudimentary way, no context for the language's structure)
##import re
##import string
##
##alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
##punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
##
##df_corpus['Text'] = df_corpus['Text'].map(alphanumeric).map(punc_lower)
##df_corpus.head(20)

In [57]:
# Every extracted transcript line as a plain Python list, in row order.
doc_complete = transcript_df['Text'].tolist()

#### Produce a cleaned up corpus, before converting into a document-term matrix
- Remove stopwords and punctuation, and normalize the data using WordNet lemmatization from NLTK
- Rather than taking a best-guess for stemming (pruning) the data, we're using the Lemmatizer to preserve potential nuances in the structure of the high-impact, high-specificity language that might arise during earnings calls.

#### Approach 1 for lemmatization: No part-of-speech tagging (everything's a noun)

In [59]:
# Approach 1 for lemmatization: No parts-of-speech tagging (everything's a noun)

from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
import string

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    """Lower-case a document, drop stopwords and punctuation, and lemmatize it.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The remaining words, each passed through ``lemma.lemmatize`` with its
        default part of speech, joined by single spaces.
    """
    # Stopwords are matched on whitespace-separated tokens before punctuation
    # is stripped, so a stopword with punctuation attached (e.g. "the,") is
    # not removed at this step.
    stop_free = " ".join(word for word in doc.lower().split() if word not in stop)
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc) for doc in doc_complete]

# Tokenized view of the same cleaned text. It is derived from doc_clean so that
# each document goes through the cleaning pipeline only once.
doc_clean_split = [doc.split() for doc in doc_clean]

In [60]:
# Additional clean-up step that I think can help:
##doc_clean_split = [doc for doc in doc_clean_split if len(doc) > 7]
print(len(doc_clean))

730


#### Approach 2: Lemmatization with part-of-speech tagging

In [None]:
# reference: https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/

import nltk 
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
#nltk.download('averaged_perceptron_tagger')
#nltk.download('punkt')

# Define function to lemmatize each word with its POS tag 
  
# POS_TAGGER_FUNCTION : TYPE 1 
def pos_tagger(nltk_tag):
    """Translate an NLTK POS tag into the corresponding WordNet POS constant.

    The tag's prefix decides the result: 'J' -> ``wordnet.ADJ``,
    'V' -> ``wordnet.VERB``, 'N' -> ``wordnet.NOUN``, 'R' -> ``wordnet.ADV``.
    Any other tag gives ``None``.
    """
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            # Look the constant up only for the matching prefix.
            return getattr(wordnet, attribute)
    return None

In [None]:
lemma = WordNetLemmatizer()

In [None]:
# tokenize the sentence and find the POS tag for each token 
pos_tagged = nltk.pos_tag(nltk.word_tokenize(doc_clean[0]))

In [None]:
print(pos_tagged)

In [None]:
# The raw NLTK tags above are fine-grained and a little confusing.
# Pair every token with the coarse WordNet tag returned by pos_tagger instead.
wordnet_tagged = [(token, pos_tagger(nltk_tag)) for token, nltk_tag in pos_tagged]

In [None]:
print(wordnet_tagged)

In [None]:
# Lemmatize each token with its WordNet tag; tokens without a tag are kept as they are.
lemmatized_sentence = " ".join(
    token if wn_tag is None else lemma.lemmatize(token, wn_tag)
    for token, wn_tag in wordnet_tagged
)

# original text:
print(doc_complete[0],'\n')
# lemmatized text:
print(lemmatized_sentence,'\n')

##### Approach 2 is generally more robust and reasonable than Approach 1. So, we'll only compare Approach 2 with the advanced methods up next.

#### Approach 3: Advanced Lemmatization with Gensim

In [None]:
from gensim.utils import lemmatize

In [None]:
# Decode each lemma returned by gensim and keep only the part before the first '.'.
lemmatized_sentence = []
for raw_lemma in lemmatize(doc_clean[0]):
    decoded = raw_lemma.decode('utf-8')
    lemmatized_sentence.append(decoded.split('.')[0])

print(lemmatized_sentence)
print(len(lemmatized_sentence))

In [None]:
# The part of each gensim lemma before its first '/', next to the Approach 1 tokens.
print(list(map(lambda tagged: tagged.split('/')[0], lemmatized_sentence)),'\n')
approach1_tokens = doc_clean_split[0]
print(approach1_tokens)
print(len(approach1_tokens))

##### Gensim removes the '300' and 'million' words from the final lemmatized and tokenized output
- Otherwise, it does a very good job of retaining the main terms from the input.
- I'm not sure that I like the removal of a potentially meaningful word like 'million' from an earnings call. We can look at spaCy next.

#### Approach 4: Advanced Lemmatization with spaCy

In [17]:
import spacy 
nlp = spacy.load('en_core_web_sm')

In [61]:
def lemmatize_tokenize(lst):
    """Lemmatize and tokenize every text in ``lst`` with the module-level ``nlp`` pipeline.

    Parameters
    ----------
    lst : list of str
        Texts to process.

    Returns
    -------
    list of list of str
        For each input text, the ``lemma_`` of every token, in order.
    """
    return [[token.lemma_ for token in nlp(text)] for text in lst]

In [62]:
token_list = lemmatize_tokenize(doc_clean)

In [63]:
## QC print
#print(len(token_list))
#print(len(quarter_list))

730
20


In [64]:
# WRITE OUT: tokenized corpus for a company's earnings calls
# Re-join each document's lemmas into a single space-separated string.
new_list = [" ".join(tokens) for tokens in token_list]

# Column names are passed as a list: a set has no defined order and is
# rejected as `columns` by recent pandas releases.
out_df = pd.DataFrame(new_list, columns=['Text'])
out_df['Company'] = company

out_df.to_csv(ticker+'_corpus_POSTprocessed.csv')

In [None]:
# For input into wordcloud: a one-element list holding every lemma of every
# document, space-joined, with the '-PRON-' placeholder left out.
lemma_sentence_all_docs = [
    " ".join(
        lemma_text
        for tokens in token_list
        for lemma_text in tokens
        if lemma_text != '-PRON-'
    )
]

##### spaCy preserves the 'million' token in the lemmatized sentence, yay!
- It also preserves the term '1 plus billion' which I think is also contextually relevant.
- We'll go with spaCy for the lemmatized tokenization of the corpora!

***
### Creating Wordclouds

In [None]:
# Import wordcloud library
from wordcloud import WordCloud
from matplotlib import pyplot as plt


# Create a WordCloud object.
# lemma_sentence_all_docs is a list of strings. Join it rather than calling
# str() on the list, which would hand the list's repr (brackets, quotes and
# escape sequences) to the word-cloud tokenizer.
wordcloud_text = " ".join(lemma_sentence_all_docs)
wordcloud = WordCloud(width=800, height=600, background_color="white",
                      max_words=1000, contour_width=3, contour_color='orange').generate(wordcloud_text)

# Draw the word cloud on a black canvas without axes.
plt.figure(figsize=(20,15), facecolor='k')
plt.imshow(wordcloud)
plt.axis("off")

# to show image in notebook:
#plt.tight_layout(pad=0)
#plt.show()

# to plot to file:
plt.savefig(ticker+'_wordcloud.png', facecolor='k', bbox_inches='tight',dpi=600)