In [1]:
import pandas as pd 
import numpy as np
from sklearn import preprocessing
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Location of the CSV exports read by this notebook. File names are appended
# with plain string concatenation, so the trailing slash is required.
data_path = "../cb_db_csv/"
# NOTE(review): the identifier is misspelled ("relatioinship"); it is kept as-is
# because the loading cell refers to it by this exact name.
relatioinship_file_name = "cb_relationships.csv"
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
object_file_name = "cb_objects.csv"



In [2]:
def normalize(x):
    """Min-max scale a pandas Series.

    Parameters
    ----------
    x : pandas.Series
        Values to rescale; they are read through ``x.values``.

    Returns
    -------
    numpy.ndarray
        The scaled values as a single-column 2-D array of shape ``(len(x), 1)``,
        produced by a freshly fitted ``MinMaxScaler`` with default settings.
    """
    # The scaler expects 2-D input, so present the series as one feature column.
    column = x.values.reshape(-1, 1)
    return preprocessing.MinMaxScaler().fit_transform(column)

In [3]:
# Load the relationship table, keeping only the three columns the rest of the
# notebook uses. They are selected by name instead of by position (previously
# positions 2, 3 and 8) so a change in the CSV layout fails loudly instead of
# silently picking up different columns.
relationship_columns = ["person_object_id", "relationship_object_id", "title"]
df_relationship = pd.read_csv(
    data_path + relatioinship_file_name,
    encoding="ISO-8859-1",
    low_memory=False,
    usecols=relationship_columns,  # skip parsing the columns we never read
)
# usecols ignores the order of the names it is given, so fix the column order
# explicitly. The .copy() makes this an independent frame: later cells add
# columns to it, which would otherwise risk a SettingWithCopyWarning.
df_relationship = df_relationship[relationship_columns].copy()
df_relationship

Unnamed: 0,person_object_id,relationship_object_id,title
0,p:2,c:1,Co-Founder/CEO/Board of Directors
1,p:3,c:1,VP Marketing
2,p:4,c:3,Evangelist
3,p:5,c:3,Senior Director Strategic Alliances
4,p:7,c:4,Chief Executive Officer
5,p:8,c:4,Senior Software Engineer
6,p:9,c:4,Systems Engineering Manager
7,p:10,c:5,"Founder and CEO, Board Of Directors"
8,p:11,c:5,Co-Founder
9,p:12,c:5,"Chief Revenue Officer, VP of Operations"


In [4]:
# Per relationship_object_id, count the non-null entries of every other column.
# as_index=False keeps the grouping key as an ordinary column so the result can
# be merged on it later.
df_ppl = (
    df_relationship
    .groupby(by=["relationship_object_id"], as_index=False)
    .count()
)

df_ppl

Unnamed: 0,relationship_object_id,person_object_id,title
0,c:1,17,17
1,c:10,6,6
2,c:100,12,12
3,c:1000,1,1
4,c:10002,2,2
5,c:100042,1,1
6,c:10005,2,2
7,c:100062,2,2
8,c:10009,2,2
9,c:1001,11,7


In [5]:
# Score every title with VADER and attach the four polarity components
# (neg / compound / neu / pos) to df_relationship as new columns.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

score_keys = ["neg", "compound", "neu", "pos"]

# polarity_scores() returns all four components in one dict, so the analyzer is
# run once per title rather than once per component. Titles are passed through
# str() exactly as before, so missing values are scored as their string form.
title_scores = pd.DataFrame(
    [sid.polarity_scores(str(title)) for title in df_relationship["title"]],
    index=df_relationship.index,  # keep row alignment with the source frame
    columns=score_keys,           # also guarantees the columns exist for an empty frame
)

# Stand-alone series kept under their original names for any later cell that uses them.
neg, compound, neu, pos = (title_scores[key] for key in score_keys)

# assign() builds a new frame instead of writing columns into the existing one,
# which may be a slice of the raw table; re-running the cell simply overwrites
# the four columns.
df_relationship = df_relationship.assign(neg=neg, compound=compound, neu=neu, pos=pos)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Abby/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
# Sum the four sentiment components per relationship_object_id.
# The score columns are selected explicitly: a bare .sum() over the whole frame
# depends on pandas silently discarding the text columns, which pandas 2.0 no
# longer does (numeric_only now defaults to False). Naming the columns keeps the
# result at exactly [relationship_object_id, neg, compound, neu, pos] on every
# pandas version.
score_columns = ["neg", "compound", "neu", "pos"]
df_semantic = (
    df_relationship
    .groupby(["relationship_object_id"], as_index=False)[score_columns]
    .sum()
)

df_semantic

Unnamed: 0,relationship_object_id,neg,compound,neu,pos
0,c:1,0.000,0.0000,17.000,0.000
1,c:10,0.000,0.0000,6.000,0.000
2,c:100,0.000,0.0000,12.000,0.000
3,c:1000,0.000,0.0000,1.000,0.000
4,c:10002,0.000,0.4404,1.256,0.744
5,c:100042,0.000,0.0000,1.000,0.000
6,c:10005,0.000,0.0000,2.000,0.000
7,c:100062,0.000,0.0000,2.000,0.000
8,c:10009,0.000,0.0000,2.000,0.000
9,c:1001,0.000,0.0000,11.000,0.000


In [9]:
# Combine the per-company sentiment sums with the per-company head count.
# Columns are picked by name rather than by position, so the result no longer
# depends on how many columns df_semantic / df_ppl happen to carry (for example
# when the counting cell is re-run after the sentiment columns were added).
score_columns = ["neg", "compound", "neu", "pos"]
employee_counts = (
    df_ppl[["relationship_object_id", "person_object_id"]]
    .rename(columns={"person_object_id": "employees"})
)
df = pd.merge(
    df_semantic[["relationship_object_id"] + score_columns],
    employee_counts,
    how="inner",
    on="relationship_object_id",
)
# normalize() returns a single-column 2-D array; flatten it so the assignment
# is an unambiguous 1-D column write.
df["employees"] = np.ravel(normalize(df["employees"]))
df



Unnamed: 0,relationship_object_id,neg,compound,neu,pos,employees
0,c:1,0.000,0.0000,17.000,0.000,0.013594
1,c:10,0.000,0.0000,6.000,0.000,0.004248
2,c:100,0.000,0.0000,12.000,0.000,0.009346
3,c:1000,0.000,0.0000,1.000,0.000,0.000000
4,c:10002,0.000,0.4404,1.256,0.744,0.000850
5,c:100042,0.000,0.0000,1.000,0.000,0.000000
6,c:10005,0.000,0.0000,2.000,0.000,0.000850
7,c:100062,0.000,0.0000,2.000,0.000,0.000850
8,c:10009,0.000,0.0000,2.000,0.000,0.000850
9,c:1001,0.000,0.0000,11.000,0.000,0.008496


In [10]:
# NOTE(review): unpickling can execute arbitrary code - only load this file if
# it comes from a trusted source.
df_unique_funding = pd.read_pickle('./merge_pkl/funding.pkl')

# Left-join the features onto the first column of the funding frame so every
# funding row is kept, then drop the duplicate key column and replace the NaNs
# of unmatched rows with 0.
df = (
    pd.merge(
        df_unique_funding.iloc[:, [0]],
        df,
        how="left",
        left_on="funding_object_id",
        right_on="relationship_object_id",
    )
    .drop(columns=["relationship_object_id"])
    .fillna(0)
)

df

Unnamed: 0,funding_object_id,neg,compound,neu,pos,employees
0,c:144040,0.0,0.0000,1.000,0.000,0.000000
1,c:18038,0.0,0.0000,11.000,0.000,0.008496
2,c:161452,0.0,0.0000,3.000,0.000,0.001699
3,c:161453,0.0,0.0000,2.000,0.000,0.000850
4,c:161480,0.0,0.0000,4.000,0.000,0.002549
5,c:264245,0.0,0.0000,1.000,0.000,0.000000
6,c:18202,0.0,0.0000,1.000,0.000,0.000000
7,c:51578,0.0,0.0000,3.000,0.000,0.001699
8,c:161699,0.0,0.0000,1.000,0.000,0.000000
9,c:161944,0.0,0.5719,4.448,0.552,0.003398


In [20]:
# Feature matrix: every column after the first one (the id column in the frame
# displayed above), as a plain NumPy array.
X_static = df.iloc[:, 1:].to_numpy()
# Disabled tiling step, kept for reference:
#X_static = np.tile(X_static, (1,226,1)).reshape(-1,226,5)

In [21]:
# Persist the feature matrix to disk. np.save appends ".npy" when the file name
# lacks that extension, so this writes "X_static.npy" in the working directory.
np.save("X_static", X_static)

In [22]:
# Read the saved array back from "X_static.npy" in the working directory.
X = np.load("X_static.npy")

In [23]:
# Show the dimensions of the reloaded array.
X.shape

(16778, 5)