# Import Libraries

In [1]:
import json
import pickle
import glob
import csv
import pandas as pd
import numpy as np

# Extract Feature Names

In [None]:
# Survey the person dumps: collect the union of top-level JSON keys across
# every file under person/ and pickle that set to "person_key_set".
entities = glob.glob("person/*")

key_set = set()

for entity in entities:
    try:
        # Each file holds a pickled JSON document.
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # dumps that were produced locally.
        with open(entity, "rb") as handle:
            js = json.loads(pickle.load(handle))
    except Exception:
        # Unreadable or malformed record. Keep going, but leave a
        # "not-working" marker in the key set so the failure stays visible.
        # `Exception` (not a bare except) lets Ctrl-C interrupt the scan.
        js = {"not-working": entity}
    key_set.update(js.keys())

with open("person_key_set", "wb") as handle:
    pickle.dump(key_set, handle)

In [None]:
# Survey the company dumps: collect the union of top-level JSON keys across
# every file under company/ and pickle that set to "company_key_set".
entities = glob.glob("company/*")

key_set = set()

for entity in entities:
    try:
        # Each file holds a pickled JSON document.
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # dumps that were produced locally.
        with open(entity, "rb") as handle:
            js = json.loads(pickle.load(handle))
    except Exception:
        # Unreadable or malformed record. Keep going, but leave a
        # "not-working" marker in the key set so the failure stays visible.
        # `Exception` (not a bare except) lets Ctrl-C interrupt the scan.
        js = {"not-working": entity}
    key_set.update(js.keys())

with open("company_key_set", "wb") as handle:
    pickle.dump(key_set, handle)

In [60]:
# Column names for every table derived from the JSON dumps, keyed by table
# name. The order of each list is the column order used when the tab-separated
# rows are written and when the resulting file is read back.
features = {
    'person': [
        'permalink',
        'first_name',
        'last_name',
        'twitter_username',
        'born_year',
        'born_month',
        'born_day',
        'birthplace',
        'degrees',
        'overview',
        'alias_list',
        'affiliation_name',
    ],
    'degrees': [
        'person_permalink',
        'degree_type',
        'institution',
        'subject',
    ],
    'funding_round': [
        'company_permalink',
        'round_code',
        'raised_amount',
        'raised_currency_code',
        'funded_year',
        'funded_month',
        'funded_day',
    ],
    'company': [
        'permalink',
        'name',
        'category_code',
        'description',
        'overview',
        'twitter_username',
        'alias_list',
        'number_of_employees',
        'total_money_raised',
        'founded_year',
        'founded_month',
        'founded_day',
        'deadpooled_year',
        'deadpooled_month',
        'deadpooled_day',
        'ipo',
        'acquisition',
    ],
    'competition': [
        'competitor',
        'permalink',
    ],
    'relationship': [
        'person_permalink',
        'company_permalink',
        'is_past',
        'title',
    ],
    'milestones': [
        'stoneable',
        'description',
        'stoned_year',
        'stoned_month',
        'stoned_day',
        'stoneable_type',
        'stoned_acquirer',
        'stoned_value',
        'stoned_value_type',
    ],
    'acquisition': [
        'company_permalink',
        'acquired_day',
        'acquired_month',
        'acquired_year',
        'acquiring_company',
        'price_amount',
        'price_currency_code',
        'source_description',
        'term_code',
    ],
    'ipo': [
        'company_permalink',
        'valuation_amount',
        'valuation_currency_code',
        'pub_year',
        'pub_month',
        'pub_day',
        'stock_symbol',
    ],
}

# Extract Information From JSON 

In [None]:
# Write one tab-separated row per person record into foo.txt, with the
# columns listed in features['person'].
entities = glob.glob("person/*")

# The encoding is spelled out so the file does not depend on the platform's
# locale; free-text fields may contain non-ASCII characters.
with open('foo.txt', 'w', encoding='utf-8') as file:
    for entity in entities:
        try:
            # NOTE: pickle.load can execute arbitrary code; only point this
            # at dumps that were produced locally.
            with open(entity, "rb") as handle:
                js = json.loads(pickle.load(handle))
        except Exception:
            # Unreadable or malformed record: skip it. `Exception` rather
            # than a bare except, so Ctrl-C still stops the export.
            continue

        fields = []
        for feature in features['person']:
            value = js[feature]
            if not value:
                # Every falsy value (None, '', 0, empty list) is written as
                # the bare word None.
                fields.append("None")
            else:
                # repr() escapes embedded tabs and newlines, which keeps
                # each record on a single line.
                fields.append(repr(str(value)))

        # Every field, including the last, is followed by a tab.
        file.write("".join(field + '\t' for field in fields) + '\n')

In [5]:
# Write one tab-separated row per (person, degree) pair into foo.txt: the
# person's permalink followed by the degree columns from features['degrees'].
entities = glob.glob("person/*")

# The encoding is spelled out so the file does not depend on the platform's
# locale.
with open('foo.txt', 'w', encoding='utf-8') as file:
    for entity in entities:
        try:
            # NOTE: pickle.load can execute arbitrary code; only point this
            # at dumps that were produced locally.
            with open(entity, "rb") as handle:
                js = json.loads(pickle.load(handle))
        except Exception:
            # Unreadable or malformed record: skip it. `Exception` rather
            # than a bare except, so Ctrl-C still stops the export.
            continue

        for degree in js['degrees']:
            # The permalink column comes from the person record itself.
            string = repr(js['permalink']) + '\t'
            try:
                for feature in features['degrees']:
                    if feature == 'person_permalink':
                        # That name labels the column written just above.
                        # NOTE(review): assumes degree entries carry no
                        # 'person_permalink' key of their own; confirm.
                        continue
                    string += repr(degree[feature]) + '\t'
            except (KeyError, TypeError):
                # Show the malformed degree; the fields collected so far are
                # still written as a short row.
                print(degree)

            string += '\n'
            file.write(string)

In [None]:
# Write one tab-separated row per company record into foo.txt, with the
# columns listed in features['company'].
entities = glob.glob("company/*")

# The encoding is spelled out so the file does not depend on the platform's
# locale; free-text fields may contain non-ASCII characters.
with open('foo.txt', 'w', encoding='utf-8') as file:
    for entity in entities:
        try:
            # NOTE: pickle.load can execute arbitrary code; only point this
            # at dumps that were produced locally.
            with open(entity, "rb") as handle:
                js = json.loads(pickle.load(handle))
        except Exception:
            # Unreadable or malformed record: skip it. `Exception` rather
            # than a bare except, so Ctrl-C still stops the export.
            continue

        fields = []
        for feature in features['company']:
            value = js[feature]
            if not value:
                # Every falsy value (None, '', 0, empty list) is written as
                # the bare word None.
                fields.append("None")
            else:
                # repr() escapes embedded tabs and newlines, which keeps
                # each record on a single line.
                fields.append(repr(str(value)))

        # Every field, including the last, is followed by a tab.
        file.write("".join(field + '\t' for field in fields) + '\n')

In [None]:
# Write one tab-separated row per company milestone into foo.txt: the
# permalink of the milestone's 'stoneable' entry followed by the columns
# listed in features['milestones'].
entities = glob.glob("company/*")

# The encoding is spelled out so the file does not depend on the platform's
# locale; milestone descriptions may contain non-ASCII characters.
with open('foo.txt', 'w', encoding='utf-8') as file:
    for entity in entities:
        try:
            # NOTE: pickle.load can execute arbitrary code; only point this
            # at dumps that were produced locally.
            with open(entity, "rb") as handle:
                js = json.loads(pickle.load(handle))
        except Exception:
            # Unreadable or malformed record: skip it. `Exception` rather
            # than a bare except, so Ctrl-C still stops the export.
            continue

        for milestone in js['milestones']:
            cells = [repr(milestone['stoneable']['permalink'])]
            # repr() keeps strings quoted and escapes tabs/newlines, so a
            # milestone always occupies exactly one line.
            cells.extend(repr(milestone[feature])
                         for feature in features['milestones'])
            # Every cell, including the last, is followed by a tab.
            file.write("".join(cell + '\t' for cell in cells) + '\n')

In [None]:
# Write one tab-separated row per funding round into foo.txt: the company's
# permalink followed by the round columns from features['funding_round'].
entities = glob.glob("company/*")

# The encoding is spelled out so the file does not depend on the platform's
# locale.
with open('foo.txt', 'w', encoding='utf-8') as file:
    for entity in entities:
        try:
            # NOTE: pickle.load can execute arbitrary code; only point this
            # at dumps that were produced locally.
            with open(entity, "rb") as handle:
                js = json.loads(pickle.load(handle))
        except Exception:
            # Unreadable or malformed record: skip it. `Exception` rather
            # than a bare except, so Ctrl-C still stops the export.
            continue

        for funding_round in js['funding_rounds']:
            # The permalink column comes from the company record itself.
            string = repr(js['permalink']) + '\t'

            for feature in features['funding_round']:
                if feature == 'company_permalink':
                    # That name labels the column written just above.
                    # NOTE(review): assumes funding rounds carry no
                    # 'company_permalink' key of their own; confirm.
                    continue
                string += repr(funding_round[feature]) + '\t'

            file.write(string + '\n')

In [57]:
# Write one tab-separated row per company that has IPO data into foo.txt:
# the company's permalink followed by the IPO columns from features['ipo'].
entities = glob.glob("company/*")

# The encoding is spelled out so the file does not depend on the platform's
# locale.
with open('foo.txt', 'w', encoding='utf-8') as file:
    for entity in entities:
        try:
            # NOTE: pickle.load can execute arbitrary code; only point this
            # at dumps that were produced locally.
            with open(entity, "rb") as handle:
                js = json.loads(pickle.load(handle))
        except Exception:
            # Unreadable or malformed record: skip it. `Exception` rather
            # than a bare except, so Ctrl-C still stops the export.
            continue

        ipo_info = js['ipo']
        if not ipo_info:
            # Company has no IPO data; nothing to write.
            continue

        # The permalink column comes from the company record itself.
        string = repr(js['permalink']) + '\t'
        for feature in features['ipo']:
            if feature == 'company_permalink':
                # That name labels the column written just above. Skipping it
                # yields len(features['ipo']) data columns plus the empty one
                # produced by the trailing tab.
                # NOTE(review): assumes the ipo object carries no
                # 'company_permalink' key of its own; confirm.
                continue
            string += repr(ipo_info[feature]) + '\t'

        file.write(string + '\n')

In [None]:
# Survey the 'competitor' objects nested in each company's 'competitions'
# list and collect their distinct keys, in first-seen order.
entities = glob.glob("company/*")

# A dict is used as an ordered set: inserting an existing key keeps its
# original position.
seen_keys = {}
for entity in entities:
    try:
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # dumps that were produced locally.
        with open(entity, "rb") as handle:
            js = json.loads(pickle.load(handle))
    except Exception:
        # Unreadable or malformed record: skip it. `Exception` rather than
        # a bare except, so Ctrl-C still stops the scan.
        continue

    for competition in js['competitions']:
        competitor = competition['competitor']
        seen_keys.update(dict.fromkeys(competitor.keys()))

key_set = list(seen_keys)

# Bare expression so the notebook displays the result.
key_set

In [63]:
# Load the tab-separated IPO rows from foo.txt, label the columns with
# features['ipo'], and pickle the raw frame to "ipo_df".
ipo = pd.read_csv('foo.txt', delimiter='\t', header=None)

# The file has no header row, so pandas numbers the columns 0..n-1; map each
# position to its feature name.
rename_col = dict(enumerate(features['ipo']))
ipo = ipo.rename(columns=rename_col)

# Drop the first column left without a name (index 7 for the seven names in
# features['ipo']).
# NOTE(review): presumably the empty column produced by the tab that ends
# every row; confirm against the cell that writes foo.txt.
ipo = ipo.drop(len(features['ipo']), axis=1)
ipo_df = ipo

with open("ipo_df", "wb") as handle:
    pickle.dump(ipo_df, handle)

In [65]:
# Strip the single quotes around text values in ipo_df and turn the literal
# word 'None' into a real missing value.
for col in ipo_df.columns:
    # The .str accessor raises on non-string columns, so columns that were
    # parsed as numbers (or booleans) are left as they are.
    if pd.api.types.is_numeric_dtype(ipo_df[col]):
        continue
    ipo_df[col] = ipo_df[col].str.strip("'")

# np.nan is the supported spelling; the np.NaN alias is gone in NumPy 2.0.
ipo_df = ipo_df.replace('None', np.nan)

In [67]:
# Persist the cleaned IPO frame; the context manager flushes and closes the
# file even if pickling fails.
with open("ipo_clean_df", "wb") as handle:
    pickle.dump(ipo_df, handle)

In [None]:
def _strip_quotes(frame):
    """Strip surrounding single quotes from every text column of *frame*, in place.

    Columns with a numeric (or boolean) dtype are skipped, because the
    ``.str`` accessor raises on non-string data.
    """
    for col in frame.columns:
        if pd.api.types.is_numeric_dtype(frame[col]):
            continue
        frame[col] = frame[col].str.strip("'")


# Relationships exist in two frames (named for the person side and the
# company side); clean both, stack them, and keep one row per
# (person, company) pair.
_strip_quotes(relationship_person_df)
_strip_quotes(relationship_company_df)
relationship_df = pd.concat([relationship_person_df, relationship_company_df])
# drop_duplicates returns a new frame, so the result has to be assigned back
# for the de-duplication to take effect.
relationship_df = relationship_df.drop_duplicates(
    subset=['person_permalink', 'company_permalink'])

# np.nan is the supported spelling; the np.NaN alias is gone in NumPy 2.0.
_strip_quotes(person_df)
person_df = person_df.replace('None', np.nan)

_strip_quotes(degrees_df)
degrees_df = degrees_df.replace('None', np.nan)

# Persist the cleaned frames; each file is closed as soon as it is written.
for target, frame in (('relationship_clean_df', relationship_df),
                      ('degrees_clean_df', degrees_df),
                      ('person_clean_df', person_df)):
    with open(target, 'wb') as handle:
        pickle.dump(frame, handle)