In [339]:
import numpy as np
import pandas as pd
import csv


# data from: https://github.com/Carlossn/Python/tree/master/NLP_Prof_Warning_Prediction/Data/transcripts/trans

# Load the transcripts (one row per transcript) into a DataFrame.
df = pd.read_json('transcripts.json')
df.shape  # bare expression: only useful for interactive inspection

#### Clean Data ####

# The Period column contains malformed entries (see the commented list below).
# Rather than hardcoding a fix for each one, later steps extract what they
# need from the Text column instead.
period_values = df['Period'].unique()
print(period_values)
# errors = ['Citi20','J.P.Mo','CitiIn', '“Behi','Trading']
# df.loc[df.Period.isin(errors)]

## Clean up names, which are irregular
# Every substitution below is a literal (non-regex) substring match.
# regex=False is passed explicitly because the default of Series.str.replace
# differs between pandas versions, and read as a regular expression '.' would
# match every character rather than just a period.
names = df['Name']

# remove periods and commas
for punctuation in (',', '.'):
    names = names.str.replace(punctuation, '', regex=False)

# capitalize each leading letter
names = names.str.title()

# Normalize company suffixes to "Corporation", "Incorporated" and "Co".
# The long forms (and the truncated "Corporatio") are first collapsed to the
# short form and only then expanded, so that "Corporation" is not changed to
# "Corporationoration" (.str.replace matches substrings, not whole words).
# The order of the pairs matters.
suffix_rewrites = [
    ('Corporation', 'Corp'),
    ('Corporatio', 'Corp'),
    ('Corp', 'Corporation'),
    ('Incorporated', 'Inc'),
    ('Inc', 'Incorporated'),
    ('Company', 'Co'),
]
for old, new in suffix_rewrites:
    names = names.str.replace(old, new, regex=False)

# remove leading/trailing whitespace
df['Name'] = names.str.strip()

# fix multiple names for same company (e.g.,'Roper Industries Inc. ' 'Roper Technologies, Inc.) '
# Series.replace (unlike .str.replace) only swaps values that match in full.
df['Name'] = df['Name'].replace({
    'Advanced Drainage Systems Incorporated': 'Advanced Drainage Systems',
    'Clean Diesel Technologies Incorporated (N': 'Clean Diesel Technologies Incorporated',
    'Illumina Incorporated (N': 'Illumina Incorporated',
    'Parker Hannifin Corporation': 'Parker-Hannifin Corporation',
    'Roper Industries Incorporated': 'Roper Technologies Incorporated',
})

#print(df.Name.unique())

# Split text using 'operator' -- only on the first instance. This keeps analyst calls.
# NOTE: the split limit is passed as n=...; from pandas 2.0 every argument of
# Series.str.split after `pat` is keyword-only, so a positional limit raises
# TypeError there.
df['Text'] = df['Text'].astype(str)
df['Clean Text'] = df['Text'].str.split("Operator", n=1).str[1]
# Four rows don't have an operator, and are relatively clean (they're a bit different -- broker calls, conferences, interim results)
# For those rows the split yields no second piece, so fall back to the full text.
df['Clean Text'] = np.where(df['Clean Text'].isnull(), df['Text'], df['Clean Text'])

# Add column that is just executives' opening remarks (NO Q&A):
# everything before the first 'Question-and-Answer Session' marker. The marker
# literal includes the single quotes (presumably because Text is the str() of
# a list of paragraphs -- confirm against the raw data).
df['Remarks'] = df['Clean Text'].str.split("'Question-and-Answer Session'", n=1).str[0]
# check: df[df['Remarks'].isnull()]

# Grab headline from text. Can't split on commas because some companies have commas in name.
# Instead take whatever follows the first exchange tag, then pick the second
# comma-separated field of that remainder.
df['nyse_splits'] = df['Text'].str.split("NYSE", n=1).str[1]
df['nasdaq_splits'] = df['Text'].str.split("NASDAQ", n=1).str[1]
has_nyse = df['nyse_splits'].notnull()
has_nasdaq = df['nasdaq_splits'].notnull()
df['Text2'] = np.where(has_nyse, df['nyse_splits'], df['nasdaq_splits'])
df['Headline'] = df['Text2'].str.split(",", n=2).str[1]

# Exchange label. Rows whose text mentions neither tag are left missing
# instead of being labelled 'NASDAQ' by default. NYSE is assigned last so it
# still wins when both tags appear in the text.
exchange = pd.Series(np.nan, index=df.index, dtype=object)
exchange[has_nasdaq] = 'NASDAQ'
exchange[has_nyse] = 'NYSE'
df['Exchange'] = exchange


# Hmm. A fiscal year can start on July 1, making fiscal quarter in headline not that informative.
def quarter(m):
    """Return the calendar quarter (1-4) for month number ``m``.

    Months 1-3 map to 1, 4-6 to 2, 7-9 to 3 and anything above 9 to 4.
    No range check is done: any value that is not greater than 3 yields 1.
    """
    # (last month of the preceding quarter, quarter), checked from Q4 down.
    for previous_quarter_end, q in ((9, 4), (6, 3), (3, 2)):
        if m > previous_quarter_end:
            return q
    return 1
    
df['Quarter'] = df['Month'].apply(quarter)

# Create indicators for political topics.
# One 0/1 column per keyword, named 'i_<keyword>' with spaces replaced by
# underscores so that multi-word keywords get valid, readable column names
# (e.g. 'tax reform' -> 'i_tax_reform').
globalization = ['inequality', 'globalization', 'trade', 'tariffs', 'dumping']
politics = ['Trump', 'Clinton', 'Obama', 'President', 'election', 'immigration']
healthcare = ['Obamacare', 'ACA', 'health reform', 'healthcare reform', 'medical costs']
taxes = ['tax reform', 'tax cuts']
keywords = [globalization, politics, healthcare, taxes]

# Names of all indicator columns, in creation order (used by the plots below).
indicators = []
for topic_keywords in keywords:
    for keyword in topic_keywords:
        column = 'i_' + keyword.replace(' ', '_')
        # Literal, case-sensitive substring match: 'trade' also matches
        # 'traded', and 'President' does not match 'president'.
        # na=False keeps a missing text from being counted as a hit.
        mentioned = df['Clean Text'].str.contains(keyword, regex=False, na=False)
        df[column] = np.where(mentioned, 1, 0)
        indicators.append(column)

# Flag transcripts dated after 2016-11-08 (election day).
# Note this column is exploratory only: it is removed again with the other
# helper columns at the end of this section.
election_day = '2016-11-08'
is_after_election = df['Date'] > election_day
df['After Election'] = np.where(is_after_election, 1, 0)
# df.loc[df['After Election'] == 1]

# Remove exact duplicate rows
df.drop_duplicates(inplace=True)
print(df.shape) # there are 18 duplicates

# Remove columns that are unhelpful or were only intermediate results
to_drop = [
    'Symbol',
    'Target',
    'Period',
    'nyse_splits',
    'nasdaq_splits',
    'Text2',
    'After Election',
]
df.drop(columns=to_drop, inplace=True)

# Write the list of distinct company names to companies.csv, one per line
unique_names = df['Name'].unique()
companies = np.asarray(unique_names)
np.savetxt("companies.csv", companies, delimiter=",", fmt='%5s')

# Load list of post-elections change in valuations, and add column to df.
# The join is on 'Name' with pandas' default join type.
# NOTE(review): 'Change' is presumably supplied by valuations.csv -- confirm.
valuations = pd.read_csv('valuations.csv')
df = df.merge(valuations, on='Name')

# Create indicator for above median (not avg, thrown off by big upside outlier)
median_change = df['Change'].median()
df['AboveMedian'] = np.where(df['Change'] > median_change, 1, 0)

# Save as csv; embedded quotes are backslash-escaped rather than doubled
df.to_csv('clean_transcripts.csv', doublequote=False, escapechar='\\')


# Question 1
# Frequency of each keyword: the mean of a 0/1 indicator column is the share
# of transcripts in which that keyword appears.
# NOTE(review): plt and sns are not imported in the visible part of this file;
# presumably matplotlib.pyplot and seaborn come from an earlier cell -- confirm.
frequencies = [df[tag].mean() for tag in indicators]

table1 = pd.DataFrame({'Tag': indicators, 'Frequency': frequencies})
# Most frequent first; ties broken alphabetically by tag.
table1.sort_values(['Frequency', 'Tag'], ascending=[False, True], inplace=True)

# Set the style BEFORE creating the figure: axes pick up the style settings
# when they are created, so setting it afterwards leaves this first figure in
# the previous style.
sns.set(style="darkgrid")
f, ax = plt.subplots(figsize=(12, 7))
sns.barplot(y="Tag", x="Frequency", data=table1, ax=ax)
f.savefig("Figure1.png")

# Question 2 and 3
# NOTE(review): plt and sns are not imported in the visible part of this file;
# presumably matplotlib.pyplot and seaborn come from an earlier cell -- confirm.
db_export = pd.read_csv('databricks_export.csv')

# Set the style once, before any figure is created, so both plots use it.
sns.set(style="darkgrid")

# Number of rows per topic.
f, ax = plt.subplots(figsize=(5, 3))
sns.countplot(x="topic", data=db_export, ax=ax)
f.savefig("Figure3.png")

# Number of rows per topic, split by SubSector.
f, ax = plt.subplots(figsize=(12, 7))
sns.countplot(x="topic", hue="SubSector", data=db_export, ax=ax)
f.savefig("Figure4.png")

# Question 4
# Correlation between the valuation change and the topic column.
# NOTE(review): if 'topic' is an arbitrary topic id rather than an ordered
# quantity, this coefficient has no meaningful interpretation -- confirm.
# The result is only displayed when this is the last expression of a notebook cell.
db_export['Change'] = db_export['Change'].astype(float)
db_export['Change'].corr(db_export['topic'])


['Q22016' 'Q32016' 'Q42016' 'Q12017' 'Q32015' 'Q42015' 'Q12016' 'Q12015'
 'Q22015' 'Q32014' 'Q22017' 'CitiIn' '“Behi' 'Citi20' 'J.P.Mo' '2016In'
 'Q22014' '2015In' 'Trading']
(120, 35)
