<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-Cleaning:--Earning-Call-Transcripts" data-toc-modified-id="Data-Cleaning:--Earning-Call-Transcripts-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data Cleaning:  Earning Call Transcripts</a></span><ul class="toc-item"><li><span><a href="#Retrieve-all-txt-files-from-directories" data-toc-modified-id="Retrieve-all-txt-files-from-directories-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Retrieve all txt files from directories</a></span></li><li><span><a href="#Extract-text-from-each-text-files-by-filename" data-toc-modified-id="Extract-text-from-each-text-files-by-filename-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Extract text from each text files by filename</a></span></li><li><span><a href="#Create-Date-Frame" data-toc-modified-id="Create-Date-Frame-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Create Date Frame</a></span></li><li><span><a href="#Split-transcripts-into-management-discussion-and-Q&amp;A-session" data-toc-modified-id="Split-transcripts-into-management-discussion-and-Q&amp;A-session-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Split transcripts into management discussion and Q&amp;A session</a></span></li><li><span><a href="#Extract-call-release-data,-revenue-status,-and-ESP-status" data-toc-modified-id="Extract-call-release-data,-revenue-status,-and-ESP-status-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Extract call release data, revenue status, and ESP status</a></span></li></ul></li><li><span><a href="#Get-stock-price-and-volume-from-Yahoo-Finance" data-toc-modified-id="Get-stock-price-and-volume-from-Yahoo-Finance-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Get stock price and volume from Yahoo Finance</a></span></li><li><span><a href="#Sentiment-Score-and-Text-Stat-for-MD-and-QA" data-toc-modified-id="Sentiment-Score-and-Text-Stat-for-MD-and-QA-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Sentiment Score and Text Stat for MD 
and QA</a></span><ul class="toc-item"><li><span><a href="#Text-stat-in-transcripts" data-toc-modified-id="Text-stat-in-transcripts-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Text stat in transcripts</a></span></li><li><span><a href="#Text-stat-in-MD-&amp;-QA" data-toc-modified-id="Text-stat-in-MD-&amp;-QA-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Text stat in MD &amp; QA</a></span></li></ul></li></ul></div>

In [1]:
import pandas as pd
import datetime
import pandas_datareader as pdr
from textblob import TextBlob

import re
from textstat.textstat import textstat 

import matplotlib.pyplot as plt
%matplotlib inline

### Data Cleaning:  Earning Call Transcripts

#### Retrieve all txt files from directories

In [2]:
import glob

# Paths of every .txt transcript found directly under ../data.
filenames = list(glob.glob('../data/*.txt'))

# Preview the first five paths.
filenames[:5]

['../data/Ford Motor (F) Q3 2018 Results - Earnings Call Transcript.txt',
 '../data/Ford Motor (F) Mark Fields on Q1 2016 Results - Earnings Call Transcript.txt',
 '../data/Tesla Motors (TSLA) Elon Reeve Musk on Q2 2015 Results - Earnings Call Transcript.txt',
 '../data/Tesla (TSLA) Q3 2017 Results - Earnings Call Transcript.txt',
 '../data/General Motors (GM) Q3 2017 Results - Earnings Call Transcript.txt']

#### Extract text from each text file by filename

In [3]:
def _read_text(path):
    """Return the whole content of the text file at `path` as one string."""
    with open(path, 'r') as handle:
        return handle.read()


# One string per file, in the same order as `filenames`.
transcripts = [_read_text(path) for path in filenames]

print('total files loaded:',len(transcripts))


total files loaded: 61


#### Create Data Frame

In [4]:
# One row per transcript; the raw text is the only column at this stage.
df = pd.DataFrame(transcripts, columns=['transcripts'])

# Drop transcripts that were loaded more than once, then rebuild the index so
# its labels stay 0..n-1. Without the reset, dropped rows leave gaps in the
# labels and any lookup of the form df.transcripts[i] over range(len(df))
# raises KeyError.
df = df.drop_duplicates(subset='transcripts', keep='first').reset_index(drop=True)

# Report the rows kept after de-duplication (not the number of files read).
print('total files loaded:', len(df))

total files loaded: 61


#### Split transcripts into management discussion and Q&A session

In [5]:
# Heading that separates the management discussion from the Q&A part.
QA_MARKER = 'Question-and-Answer Session'

text_MD = []
text_QA = []

# Walk over the values themselves instead of looking rows up by integer
# label, so the loop does not depend on the index being 0..n-1.
for transcript in df['transcripts']:
    # partition() cuts once, at the first occurrence of the marker:
    #   - everything before it is the management discussion,
    #   - everything after it (including any later repeat of the heading)
    #     is the Q&A session,
    #   - a transcript without the heading gets an empty Q&A string instead
    #     of aborting the whole cell with an IndexError.
    management, _, questions = transcript.partition(QA_MARKER)
    text_MD.append(management)
    text_QA.append(questions)

# Plain lists are assigned by position, one entry per row.
df['tx_MD'] = text_MD
df['tx_QA'] = text_QA

#### Extract call release date, revenue status, and EPS status

In [6]:
# Patterns applied to the management-discussion text of each transcript.
# Each pattern is compiled once and searched once per transcript; the fields
# that come from the same line are read from the groups of a single match.
DATE_RE = re.compile(r'\d{2}-\d{2}-\d{2}')
# group 1: whole EPS line, group 2: EPS value, group 3: misses|beats
EPS_RE = re.compile(r'\n(EPS.*?\$([\-\.\d]+) (misses|beats).+?([\-\.\d]+).+?)\n')
# NOTE(review): this takes the first number followed by '%' anywhere in the
# text, and the character class has no '-', so a negative percentage is
# captured without its sign. Confirm this is the intended Y/Y revenue figure.
PCT_RE = re.compile(r'([\.\d]+)(?=%)')
REV_TOTAL_RE = re.compile(r'Revenue of.*?\$([\-\.\d]+)')
# group 1: whole revenue line, group 2: misses|beats
REV_STATUS_RE = re.compile(r'(Revenue of.* (misses|beats) .+)')
# First parenthesised token in the text; used as the company ticker.
COMPANY_RE = re.compile(r'\((.*?)\)')


def _first_match(pattern, text, field):
    """Return the first match of `pattern` in `text`.

    Raises IndexError (the same type a bare ``findall(...)[0]`` raises) when
    nothing matches, with a message naming `field` and showing the start of
    the text so the offending transcript can be found.
    """
    match = pattern.search(text)
    if match is None:
        raise IndexError('no %s found in transcript beginning %r' % (field, text[:60]))
    return match


# The "esp" spelling (for EPS) is kept in variable and column names because
# later cells refer to these columns by name.
esp_date, esp_status_all, esp_status, esp, company = [], [], [], [], []
rev_status_all, rev_status, rev_total, rev_pct = [], [], [], []

for text in df['tx_MD']:
    esp_date.append(_first_match(DATE_RE, text, 'call date').group(0))

    eps_line = _first_match(EPS_RE, text, 'EPS line')
    esp_status_all.append(eps_line.group(1))
    esp_status.append(eps_line.group(3))
    esp.append(eps_line.group(2))

    rev_pct.append(_first_match(PCT_RE, text, 'percentage').group(1))
    rev_total.append(_first_match(REV_TOTAL_RE, text, 'revenue total').group(1))

    revenue_line = _first_match(REV_STATUS_RE, text, 'revenue line')
    rev_status_all.append(revenue_line.group(1))
    rev_status.append(revenue_line.group(2))

    company.append(_first_match(COMPANY_RE, text, 'company ticker').group(1))

# Create columns related to ESP
df['esp_date'] = esp_date
df['esp_date'] = pd.to_datetime(df['esp_date'])

df['esp_status_all'] = esp_status_all
df['esp_status'] = esp_status
df['esp'] = esp
df['company'] = company

# Create columns related to YOY revenues (captured as strings, stored as float32)
df['rev_yoy_growth_percent'] = rev_pct
df['rev_yoy_growth_percent'] = df['rev_yoy_growth_percent'].astype('float32')

df['rev_status'] = rev_status
df['rev_total_b'] = rev_total
df['rev_total_b'] = df['rev_total_b'].astype('float32')

df['rev_status_all'] = rev_status_all

# Most recent call first.
df = df.sort_values(by='esp_date', ascending=False)

### Get stock price and volume from Yahoo Finance 

In [7]:
def get(tickers, startdate, enddate):
    """Download quotes for several tickers and stack them into one frame.

    Parameters
    ----------
    tickers : iterable of str
        Ticker symbols; any iterable is accepted (list, tuple, generator).
    startdate, enddate
        Passed unchanged as ``start`` / ``end`` to ``pdr.get_data_yahoo``.

    Returns
    -------
    pandas.DataFrame
        The per-ticker frames concatenated in the order given, with the
        ticker added as the outer index level; the index levels are named
        ``Ticker`` and ``Date``.
    """
    # Materialise once: the symbols are needed twice (to download and as the
    # concat keys), which a one-shot iterator could not provide.
    tickers = list(tickers)
    frames = [
        pdr.get_data_yahoo(ticker, start=startdate, end=enddate)
        for ticker in tickers
    ]
    return pd.concat(frames, keys=tickers, names=['Ticker', 'Date'])

In [8]:
# Symbols to download and the date window of the quotes.
tickers = ['TSLA','GM','F','FCAU']
start_date = datetime.datetime(2014, 1, 2)
end_date = datetime.datetime(2019, 5, 13)

# Stacked quotes for all symbols, indexed by (Ticker, Date).
df_df = get(tickers, start_date, end_date)

In [9]:
# df_df stacks the tickers one after another under a (Ticker, Date) index.
# Day-over-day changes are therefore computed within each ticker; a plain
# pct_change() over the whole frame would compare the first row of one
# ticker with the last row of the previous one.
df_df['pct_price'] = df_df.groupby(level='Ticker')['Close'].pct_change()
df_df['pct_volume'] = df_df.groupby(level='Ticker')['Volume'].pct_change()

# Open relative to the same day's close.
# NOTE(review): as written this is (Open - Close) / Close, i.e. positive when
# the price FELL during the session. Confirm the sign is intended before
# using it as an "up" target.
df_df['pct_price_same_day'] = (df_df['Open'] - df_df['Close']) / df_df['Close']
#df_df ['weekly_return'] = (df_df ['Open'] - df_df['Close'].shift(-5)) / (df_df['Close'].shift(-5))

In [10]:
# Turn the index levels into ordinary columns.
df_df = df_df.reset_index()

In [11]:
# Quote columns kept for the join with the transcripts.
quote_columns = ['Ticker','Date','Open','Close','Volume','pct_price','pct_price_same_day','pct_volume']
df_df = df_df[quote_columns]

In [12]:
# Attach the quote of the call date to each transcript. A left join keeps
# every transcript; rows without a matching (Date, Ticker) quote get NaN in
# the quote columns. Afterwards keep one row per distinct transcript text.
df = (
    df.merge(
        df_df,
        how='left',
        left_on=['esp_date','company'],
        right_on=['Date','Ticker'],
    )
    .drop_duplicates(subset='transcripts', keep='first')
)

In [13]:
# Binary encoding of the outcome words: a miss is 0, a beat is 1.
outcome_flag = {'misses':0, 'beats':1}
df['esp_target'] = df['esp_status'].map(outcome_flag)
df['rev_target'] = df['rev_status'].map(outcome_flag)

# The EPS value was captured as text; store it as a float.
df['esp'] = df['esp'].astype(float)

In [14]:
# 1 when the change is strictly positive, otherwise 0 (a missing change
# compares as False and therefore also becomes 0).
df['pct_price_target'] = (df['pct_price'] >0).astype(int)
df['pct_price_target_same_day'] = (df['pct_price_same_day'] >0).astype(int)

#df['weekly_return'] = (df['weekly_return'] >0).astype(int)

# Call-over-call change of EPS and of Y/Y revenue growth. The rows are
# ordered newest-first with the companies interleaved, so the change is
# computed per company on a date-ascending view: each call is compared with
# the same company's previous call. The result is aligned back onto df by
# index label.
chronological = df.sort_values('esp_date')
df['volatility_percentage_return_esp'] = (
    chronological.groupby('company')['esp'].pct_change()
)
df['volatility_percentage_rev_yoy_growth_percent'] = (
    chronological.groupby('company')['rev_yoy_growth_percent'].pct_change()
)

# Missing values (e.g. the first call of each company, unmatched quotes)
# become 0.
df = df.fillna(0)

In [15]:
# Identifiers, the three text fields and the target/feature columns that are
# saved to disk in the next cell.
target_columns = [
    'esp_date',
    'company',
    'transcripts',
    'tx_MD',
    'tx_QA',
    'pct_price_target',
    'pct_volume',
    'volatility_percentage_return_esp',
    'volatility_percentage_rev_yoy_growth_percent',
    'pct_price_target_same_day',
    'esp_target',
    'rev_target',
]
df_words_target = df[target_columns]

# Optional CSV export, left disabled:
#df_words_target.to_csv('../data/df_transcripts.csv', index=False)

In [16]:
import pickle

# Write the frame to disk. The with-block closes the file handle (and so
# flushes the data) as soon as the dump finishes, instead of leaving an
# anonymous open handle behind.
with open("../data/df_words_target.pkl", "wb") as pickle_file:
    pickle.dump(df_words_target, pickle_file)

In [17]:
# Number of non-null call dates per company.
df_words_target.groupby('company')['esp_date'].count()

company
F       19
FCAU     6
GM      21
TSLA    15
Name: esp_date, dtype: int64

In [18]:
# Number of rows in the exported frame.
len(df_words_target.index)

61

### Sentiment Score and Text Stat for MD and QA

In [19]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

# polarity_scores() returns the 'neg', 'pos', 'neu' and 'compound' values in
# one dict, so each text is scored once and the four columns are read from
# that single result (instead of scoring the same text four times).
for section in ('tx_MD', 'tx_QA'):
    section_scores = [analyser.polarity_scores(text) for text in df[section]]
    for key in ('neg', 'pos', 'neu', 'compound'):
        # Column names: tx_MD_neg, tx_MD_pos, ..., tx_QA_compound.
        df[section + '_' + key] = [scores[key] for scores in section_scores]

In [20]:
def detect_polarity(text):
    """Return the TextBlob sentiment polarity of `text`."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

def detect_subjectivity(text):
    """Return the TextBlob sentiment subjectivity of `text`."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

In [21]:
# How to read the scores, by example: a polarity of 0.8 means the statement
# is positive; a subjectivity of 0.75 means it is mostly opinion rather than
# factual information.

# Scores of the full transcript.
df['polarity'] = df['transcripts'].map(detect_polarity)
df['subjectivity'] = df['transcripts'].map(detect_subjectivity)

# Scores of the two sections: both polarity columns first, then both
# subjectivity columns.
for suffix, scorer in (('_polarity', detect_polarity),
                       ('_subjectivity', detect_subjectivity)):
    for section in ('tx_MD', 'tx_QA'):
        df[section + suffix] = df[section].map(scorer)

#### Text stat in transcripts

In [22]:
col = 'transcripts'

# (column-name suffix, textstat function) pairs. The suffixes are kept
# exactly as they were, including the ones without a leading underscore,
# so the resulting column names do not change.
textstat_metrics = [
    ('_num_syl', textstat.syllable_count),
    ('_avg_sentence_length', textstat.avg_sentence_length),
    ('textstat.lexicon_count', textstat.lexicon_count),
    ('flesch_reading_ease', textstat.flesch_reading_ease),
    ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
    ('difficult_words', textstat.difficult_words),
    ('linsear_write_formula', textstat.linsear_write_formula),
    ('gunning_fog', textstat.gunning_fog),
    ('automated_readability_index', textstat.automated_readability_index),
    ('coleman_liau_index', textstat.coleman_liau_index),
    ('dale_chall_readability_score', textstat.dale_chall_readability_score),
]

# Every metric is computed exactly once per text. No outer repeat loop is
# needed: repeating only recomputes identical columns.
for suffix, metric in textstat_metrics:
    df[col + suffix] = [metric(text) for text in df[col]]

#### Text stat in MD & QA

In [23]:
# (column-name suffix, textstat function) pairs. The suffixes are kept
# exactly as they were, including the ones without a leading underscore,
# so the resulting column names do not change.
textstat_metrics = [
    ('_num_syl', textstat.syllable_count),
    ('_avg_sentence_length', textstat.avg_sentence_length),
    ('textstat.lexicon_count', textstat.lexicon_count),
    ('flesch_reading_ease', textstat.flesch_reading_ease),
    ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
    ('difficult_words', textstat.difficult_words),
    ('linsear_write_formula', textstat.linsear_write_formula),
    ('gunning_fog', textstat.gunning_fog),
    ('automated_readability_index', textstat.automated_readability_index),
    ('coleman_liau_index', textstat.coleman_liau_index),
    ('dale_chall_readability_score', textstat.dale_chall_readability_score),
]

# Same metrics for the management discussion, then for the Q&A session.
# Every metric is computed exactly once per text; `col` ends as 'tx_QA'.
for col in ('tx_MD', 'tx_QA'):
    for suffix, metric in textstat_metrics:
        df[col + suffix] = [metric(text) for text in df[col]]

In [24]:
# df_final is another name for df (not a copy); it is the frame written to
# disk by the export cell below.
df_final = df

# Show a preview only: the frame holds full transcript texts and many
# feature columns, so displaying all of it bloats the notebook.
df_final.head(10)

Unnamed: 0,transcripts,tx_MD,tx_QA,esp_date,esp_status_all,esp_status,esp,company,rev_yoy_growth_percent,rev_status,...,tx_QA_avg_sentence_length,tx_QAtextstat.lexicon_count,tx_QAflesch_reading_ease,tx_QAflesch_kincaid_grade,tx_QAdifficult_words,tx_QAlinsear_write_formula,tx_QAgunning_fog,tx_QAautomated_readability_index,tx_QAcoleman_liau_index,tx_QAdale_chall_readability_score
0,Fiat Chrysler Automobiles N.V. (FCAU) CEO Mike...,Fiat Chrysler Automobiles N.V. (FCAU) CEO Mike...,"\n\nOperator\n\nThank you, sir. [Operator Inst...",2019-05-03,EPS of $0.40318064732892821145 misses by $-0.08,misses,0.403181,FCAU,16.200001,misses,...,28.5,6529,67.93,10.9,499,5.444444,12.83,14.3,8.59,6.26
1,General Motors Company (GM) CEO Mary Barra on ...,General Motors Company (GM) CEO Mary Barra on ...,\n\nOperator\n\n[Operator Instructions] Your f...,2019-04-30,EPS of $1.41 beats by $0.29,beats,1.410000,GM,3.380000,misses,...,21.7,7177,66.37,9.4,572,6.000000,10.22,11.5,8.94,5.97
2,Ford Motor Company (F) CEO Jim Hackett on Q1 2...,Ford Motor Company (F) CEO Jim Hackett on Q1 2...,\n\nOperator\n\n[Operator Instructions] Your f...,2019-04-25,EPS of $0.44 beats by $0.26,beats,0.440000,F,91.000000,beats,...,25.5,8225,62.51,10.9,653,6.625000,11.78,13.1,8.94,6.15
3,"Tesla, Inc (TSLA) CEO Elon Musk on Q1 2019 Res...","Tesla, Inc (TSLA) CEO Elon Musk on Q1 2019 Res...",\nOperator\nThank you. Our first question come...,2019-04-24,EPS of $-1.7675 misses by $-0.83,misses,-1.767500,TSLA,33.230000,misses,...,17.3,5255,70.84,7.7,476,6.250000,8.71,8.7,8.06,5.92
4,Fiat Chrysler Automobiles NV (FCAU) CEO Mike M...,Fiat Chrysler Automobiles NV (FCAU) CEO Mike M...,\n\nOperator\n\nThank you. [Operator Instructi...,2019-02-07,EPS of $1.1796733212341198 beats by $0.10,beats,1.179673,FCAU,3.060000,beats,...,23.2,5454,64.85,10.0,435,5.111111,10.80,11.7,8.65,6.05
5,General Motors Co. (GM) CEO Mary Barra on Q4 2...,General Motors Co. (GM) CEO Mary Barra on Q4 2...,\n\nOperator\n\n[Operator Instructions] Our fi...,2019-02-06,EPS of $1.43 beats by $0.23,beats,1.430000,GM,11.360000,beats,...,28.1,6989,59.87,11.9,526,11.800000,12.69,14.2,8.71,6.22
6,"Tesla, Inc. (TSLA) CEO Elon Musk on Q4 2018 Re...","Tesla, Inc. (TSLA) CEO Elon Musk on Q4 2018 Re...",\nOperator\nThank you [Operator Instructions] ...,2019-01-30,EPS of $1.93 misses by $-0.09,misses,1.930000,TSLA,119.750000,beats,...,15.9,6423,72.26,7.1,536,7.571429,8.04,8.2,7.94,5.74
7,Ford Motor Company (F) CEO Jim Hackett on Q4 2...,Ford Motor Company (F) CEO Jim Hackett on Q4 2...,\n\nOperator\n\n[Operator Instructions] Your f...,2019-01-23,EPS of $0.3 misses by $-0.00,misses,0.300000,F,0.560000,beats,...,28.5,4877,59.47,12.0,442,12.000000,13.15,14.0,8.42,6.48
8,General Motors (GM) Q3 2018 Results - Earnings...,General Motors (GM) Q3 2018 Results - Earnings...,\n\nOperator\n\nYour first question comes from...,2018-10-31,EPS of $1.87 beats by $0.62,beats,1.870000,GM,17.440001,beats,...,26.1,6383,61.90,11.1,533,6.000000,12.14,14.0,9.52,6.25
9,Fiat Chrysler Automobiles NV (FCAU) Q3 2018 Re...,Fiat Chrysler Automobiles NV (FCAU) Q3 2018 Re...,"\n\nOperator\n\nThank you very much, sir.\n\nJ...",2018-10-30,EPS of $0.97050147492625368732 beats by $0.14,beats,0.970501,FCAU,5.080000,beats,...,23.2,7218,64.85,10.0,531,5.000000,10.74,12.2,9.12,5.95


In [25]:
import pickle

# Write the final frame to disk. The with-block closes the file handle (and
# so flushes the data) as soon as the dump finishes, instead of leaving an
# anonymous open handle behind.
with open("../data/df_final.pkl", "wb") as pickle_file:
    pickle.dump(df_final, pickle_file)