In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob

In [2]:
# !pip install scikit-learn

In [3]:
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from scipy import stats

In [4]:
# df_all = pd.read_csv(glob.glob('data/clean/ALL_*.csv')[0],
#                      low_memory=False)

In [5]:
# Load the cleaned RIA extract.
# glob returns matches in arbitrary order, so sort them to make the chosen file
# deterministic when several match, and fail with a clear message when none do
# (instead of an opaque IndexError).
ria_csv_paths = sorted(glob.glob('data/clean/RIA_*.csv'))
if not ria_csv_paths:
    raise FileNotFoundError("No file matching 'data/clean/RIA_*.csv' was found")
df_ria = pd.read_csv(ria_csv_paths[0], low_memory=False)

In [6]:
# df_ria.head()

#### Check: have all ERA firms been correctly removed?

In [7]:
# Distinct firm-type labels present in the loaded frame.
df_ria.loc[:, 'Rgstn.@FirmType'].unique()

array([nan, 'Registered'], dtype=object)

In [8]:
# True if any row has a truthy value in the 'ERA' column.
df_ria['ERA'].any()

False

In [9]:
# Drop every column whose name contains the substring 'ERA'.
# Note: this rebinds df_ria, so cells that read the 'ERA' column cannot be
# re-run after this one without reloading the data.
era_column_mask = df_ria.columns.str.contains('ERA')
df_ria = df_ria.loc[:, ~era_column_mask]

#### TODO: decide whether to exclude SUSPENDED firms

In [10]:
# Distinct state-registration status values present in the frame.
df_ria.loc[:, 'StateRgstn.Rgltrs.Rgltr.@St'].unique()

array(['APPROVED', nan, 'TERMREQUEST', 'CONDREST', 'LIMITED', 'SUSPENDED'],
      dtype=object)

# Process the DataFrame into something usable by scikit-learn
#### structured numpy array
#### categorical vars -> dummy /one-hot encoded vars
#### ['Y', 'N'], ['True','False'], etc. -> [1,0] integer
#### percentage-band answers (e.g. "26-50 percent") -> continuous value in [0, 1]
#### Remove Near-Zero Variance variables

In [11]:
# Categorical columns that are expanded into one-hot (dummy) columns.
# The order is kept as authored because it determines the order of the
# generated dummy columns.
to_dummy_var_cols = [
    'registration',
    # location / regulator codes
    'MainAddr.@State',
    'FormInfo.Part1A.Item3C.@StateCD',
    'StateRgstn.Rgltrs.Rgltr.@Cd',
    'StateRgstn.Rgltrs.Rgltr.@St',
    # organisation form and registration status
    'FormInfo.Part1A.Item3A.@OrgFormNm',
    'Rgstn.@FirmType',
    'Rgstn.@St',
    'FormInfo.Part1A.Item3C.@CntryNm',
    # banded answers
    'FormInfo.Part1B.ItemH.@Q1B2HScrtsNvsmt',
    'FormInfo.Part1B.ItemH.@Q1B2HNScrtsNvsmt',
    'FormInfo.Part1A.Item5H.@Q5H',
]

In [12]:
# One-hot encode the listed categorical columns; all other columns pass through.
df_ria_with_dum = pd.get_dummies(data=df_ria, columns=to_dummy_var_cols)

In [13]:
# Columns dropped before modelling. Judging by their names these are identifiers,
# names, addresses, dates and raw URL fields.
# NOTE: 'NoticeFiled.States' was disabled in this list by the original author
# (reason not recorded), so that column is NOT dropped.
zero_information_cols = [
    # identifiers
    'Info.@SECNb',
    'Info.@FirmCrdNb',
    'Info.@SECRgnCD',
    # notice filings / state registration containers
    'NoticeFiled.States.@St',
    'NoticeFiled.States.@RgltrCd',
    'StateRgstn.Rgltrs',
    'StateRgstn.Rgltrs.Rgltr',
    'FormInfo.Part1A.Item1.@Q1P',  # legal entity identifier
    'Rgstn.@Dt',
    # firm names
    'Info.@BusNm',
    'Info.@LegalNm',
    'Info.@UmbrRgstn',
    # main address
    'MainAddr.@Strt1',
    'MainAddr.@Strt2',
    'MainAddr.@City',
    'MainAddr.@Cntry',
    'MainAddr.@PostlCd',
    'MainAddr.@PhNb',
    'MainAddr.@FaxNb',
    # mailing address
    'MailingAddr.@Strt1',
    'MailingAddr.@Strt2',
    'MailingAddr.@City',
    'MailingAddr.@State',
    'MailingAddr.@Cntry',
    'MailingAddr.@PostlCd',
    # dates and form version
    'StateRgstn.Rgltrs.Rgltr.@Dt',
    'Filing.@Dt',
    'Filing.@FormVrsn',
    # raw web-address fields and assorted form items
    'FormInfo.Part1A.Item1.WebAddrs',
    'FormInfo.Part1A.Item1.WebAddrs.WebAddr',
    'FormInfo.Part1A.Item9E.@Q9E',
    'FormInfo.Part1A.Item3B.@Q3B',
    'FormInfo.Part1B.Item2.@Q1B2B1',
    'FormInfo.Part1B.ItemG.@Q1B2G2',
    # raw URL columns
    'urls',
    'url_domains',
    'url_schemes',
    'url_social_media',
    'url_aggregator',
    'url_firm_specific',
]

In [14]:
# Columns filtered out of the cleaned frame later in the notebook.
# The last two names differ only in letter case; both spellings occur in the data.
problem_columns = [
    'FormInfo.Part1A.Item5C.@Q5C1',
    'FormInfo.Part1B.ItemK.@Q1B2K1',
    'NoticeFiled.States.@Dt',
    'FormInfo.Part1A.Item1.@Q1ODESC',
    'FormInfo.Part1A.Item1.@Q1ODesc',
]

In [15]:
# Free-text "other" columns: any non-null entry is later recoded to 'Y'.
recode_any_to_Y = [
    'FormInfo.Part1A.Item5G.@Q5G12Oth',
    'FormInfo.Part1A.Item6A.@Q6A14Oth',
    'FormInfo.Part1A.Item5D.@Q5D1MOth',
    'FormInfo.Part1A.Item3A.@OrgFormOthNm',
    'FormInfo.Part1A.Item5E.@Q5E7Oth',
    'FormInfo.Part1A.Item5D.@Q5D2MOth',
    'FormInfo.Part1B.ItemI.@Q1B2I2AiiOthTx',
    'FormInfo.Part1A.Item5D.@Q5DN3Oth',
]

In [16]:
# List every column name of the dummy-encoded frame in sorted order (long output).
sorted(df_ria_with_dum.columns)

['Filing.@Dt',
 'Filing.@FormVrsn',
 'FormInfo.Part1A.Item1.@Q1F5',
 'FormInfo.Part1A.Item1.@Q1I',
 'FormInfo.Part1A.Item1.@Q1M',
 'FormInfo.Part1A.Item1.@Q1N',
 'FormInfo.Part1A.Item1.@Q1O',
 'FormInfo.Part1A.Item1.@Q1ODESC',
 'FormInfo.Part1A.Item1.@Q1ODesc',
 'FormInfo.Part1A.Item1.@Q1P',
 'FormInfo.Part1A.Item1.WebAddrs',
 'FormInfo.Part1A.Item1.WebAddrs.WebAddr',
 'FormInfo.Part1A.Item10A.@Q10A',
 'FormInfo.Part1A.Item11.@Q11',
 'FormInfo.Part1A.Item11A.@Q11A1',
 'FormInfo.Part1A.Item11A.@Q11A2',
 'FormInfo.Part1A.Item11B.@Q11B1',
 'FormInfo.Part1A.Item11B.@Q11B2',
 'FormInfo.Part1A.Item11C.@Q11C1',
 'FormInfo.Part1A.Item11C.@Q11C2',
 'FormInfo.Part1A.Item11C.@Q11C3',
 'FormInfo.Part1A.Item11C.@Q11C4',
 'FormInfo.Part1A.Item11C.@Q11C5',
 'FormInfo.Part1A.Item11D.@Q11D1',
 'FormInfo.Part1A.Item11D.@Q11D2',
 'FormInfo.Part1A.Item11D.@Q11D3',
 'FormInfo.Part1A.Item11D.@Q11D4',
 'FormInfo.Part1A.Item11D.@Q11D5',
 'FormInfo.Part1A.Item11E.@Q11E1',
 'FormInfo.Part1A.Item11E.@Q11E2',
 'F

In [17]:
# Remove the columns listed in zero_information_cols; raises KeyError if any is absent.
df_ria_with_dum_zi = df_ria_with_dum.drop(zero_information_cols, axis='columns')

In [18]:
# df_ria_with_dum_zi[df_ria_with_dum_zi.columns.to_list()[310:320]]
# df_ria_with_dum_zi.columns.to_list()

In [19]:
# Show which column names still contain 'url' after the drop.
url_name_mask = df_ria_with_dum_zi.columns.str.contains('url')
df_ria_with_dum_zi.columns[url_name_mask]

Index(['url_has_linkedin', 'url_has_social_media', 'url_has_aggregator',
       'url_has_firm_specific', 'url_social_media_count'],
      dtype='object')

In [20]:
# Preview the free-text columns before they are recoded.
df_ria_with_dum_zi.loc[:, recode_any_to_Y]

Unnamed: 0,FormInfo.Part1A.Item5G.@Q5G12Oth,FormInfo.Part1A.Item6A.@Q6A14Oth,FormInfo.Part1A.Item5D.@Q5D1MOth,FormInfo.Part1A.Item3A.@OrgFormOthNm,FormInfo.Part1A.Item5E.@Q5E7Oth,FormInfo.Part1A.Item5D.@Q5D2MOth,FormInfo.Part1B.ItemI.@Q1B2I2AiiOthTx,FormInfo.Part1A.Item5D.@Q5DN3Oth
0,,,,,,,,
1,,,,,,,,
2,,,,,TIMING SERVICES PROVIDED BY OTHERS,,,
3,CONSULTING SERVICES FOR PROFESSIONAL INVESTMEN...,,,,,,FUND ADMINISTRATOR,
4,,,,,,,,
...,...,...,...,...,...,...,...,...
32525,,,,,,,,
32526,,,,,,,,
32527,,,,,,,,
32528,,,,,,,,


In [21]:
# Collapse each free-text column to a flag: 'Y' where an answer was given,
# NaN where it was missing.
# Uses np.nan (the np.NaN alias was removed in NumPy 2.0) and a vectorised
# mapping instead of a per-element lambda. The cell stays idempotent: re-running
# it on already-recoded columns yields the same values.
for col_name in recode_any_to_Y:
    has_answer = df_ria_with_dum_zi[col_name].notna()
    df_ria_with_dum_zi[col_name] = has_answer.map({True: 'Y', False: np.nan})

In [22]:
# Recode Y/N flags and banded text answers to numbers in a single replace call.
# Keys are the text values looked for anywhere in the frame; values are their
# numeric replacements.
answer_recode_map = {
    'Y': 1,
    'N': 0,
    '0 percent': 0.0,
    '11-25 percent': 0.18,
    '26-50 percent': 0.38,
    '51-75 percent': 0.63,
    '76-99 percent': 0.875,
    '100 percent': 1.0,
    'Up to 10 percent': 0.1,
    'Up to 25 percent': 0.25,
    'Up to 50 percent': 0.5,
    'Up to 75 percent': 0.75,
    'More than 75 percent': 1.0,
    'Fewer than 5 clients': 0.5,
}
df_ria_with_dum_zi_yn = df_ria_with_dum_zi.replace(
    tuple(answer_recode_map.keys()),
    tuple(answer_recode_map.values()),
)

In [23]:
# Keep every column except those named in problem_columns
# (names that are not present are simply ignored).
kept_column_names = [
    name
    for name in df_ria_with_dum_zi_yn.columns.to_list()
    if name not in problem_columns
]
df_ria_clean = df_ria_with_dum_zi_yn.loc[:, kept_column_names]

In [25]:
# Treat every remaining missing value as 0.
df_ria_clean = df_ria_clean.fillna(value=0)

In [26]:
# Candidate regressors: all columns whose name matches neither 'url' nor '@Q1I'
# (the pattern is a regular-expression alternation).
excluded_name_mask = df_ria_clean.columns.str.contains('url|@Q1I')
regressor_cols = df_ria_clean.columns[~excluded_name_mask]

In [27]:
# Columns holding exactly two distinct values are treated as boolean.
# NOTE(review): "two distinct values" does not guarantee the values are 0/1.
distinct_counts_for_bool = df_ria_clean.apply(lambda column: len(np.unique(column)))
bool_cols = df_ria_clean.columns[distinct_counts_for_bool == 2]

In [28]:
# Display the names of the columns that hold exactly two distinct values.
bool_cols

Index(['FormInfo.Part1A.Item1.@Q1I', 'FormInfo.Part1A.Item1.@Q1M',
       'FormInfo.Part1A.Item1.@Q1N', 'FormInfo.Part1A.Item1.@Q1O',
       'FormInfo.Part1A.Item5E.@Q5E1', 'FormInfo.Part1A.Item5E.@Q5E2',
       'FormInfo.Part1A.Item5E.@Q5E3', 'FormInfo.Part1A.Item5E.@Q5E4',
       'FormInfo.Part1A.Item5E.@Q5E5', 'FormInfo.Part1A.Item5E.@Q5E6',
       ...
       'FormInfo.Part1B.ItemH.@Q1B2HNScrtsNvsmt_Over $5,000,000',
       'FormInfo.Part1B.ItemH.@Q1B2HNScrtsNvsmt_Under $100,000',
       'FormInfo.Part1A.Item5H.@Q5H_0', 'FormInfo.Part1A.Item5H.@Q5H_1-10',
       'FormInfo.Part1A.Item5H.@Q5H_101-250',
       'FormInfo.Part1A.Item5H.@Q5H_11-25',
       'FormInfo.Part1A.Item5H.@Q5H_251-500',
       'FormInfo.Part1A.Item5H.@Q5H_26-50',
       'FormInfo.Part1A.Item5H.@Q5H_51-100',
       'FormInfo.Part1A.Item5H.@Q5H_More than 500'],
      dtype='object', length=459)

In [29]:
# Collect, in regressor order, the names of the two-valued regressors and their
# positions within regressor_cols (used later to slice the NumPy matrix).
# The Item1 @Q1I column is excluded explicitly.
_bool_hits = [
    (position, name)
    for position, name in enumerate(regressor_cols)
    if name in bool_cols and name != 'FormInfo.Part1A.Item1.@Q1I'
]
bool_regressor_cols_i = [position for position, _ in _bool_hits]
bool_regressor_cols = pd.Index([name for _, name in _bool_hits])

In [30]:
# Columns with more than two distinct values are treated as continuous.
distinct_counts_for_cont = df_ria_clean.apply(lambda column: len(np.unique(column)))
cont_cols = df_ria_clean.columns[distinct_counts_for_cont > 2]

In [31]:
# Collect, in regressor order, the names of the multi-valued regressors and their
# positions within regressor_cols (used later to slice the NumPy matrix).
# The Item1 @Q1I column is excluded explicitly.
_cont_hits = [
    (position, name)
    for position, name in enumerate(regressor_cols)
    if name in cont_cols and name != 'FormInfo.Part1A.Item1.@Q1I'
]
cont_regressor_cols_i = [position for position, _ in _cont_hits]
cont_regressor_cols = pd.Index([name for _, name in _cont_hits])

In [32]:
# Display the names of the regressors that have more than two distinct values.
cont_regressor_cols

Index(['FormInfo.Part1A.Item1.@Q1F5', 'FormInfo.Part1A.Item5A.@TtlEmp',
       'FormInfo.Part1A.Item5B.@Q5B1', 'FormInfo.Part1A.Item5B.@Q5B2',
       'FormInfo.Part1A.Item5B.@Q5B3', 'FormInfo.Part1A.Item5B.@Q5B4',
       'FormInfo.Part1A.Item5B.@Q5B5', 'FormInfo.Part1A.Item5B.@Q5B6',
       'FormInfo.Part1A.Item5C.@Q5C2', 'FormInfo.Part1A.Item5D.@Q5DA1',
       'FormInfo.Part1A.Item5D.@Q5DA3', 'FormInfo.Part1A.Item5D.@Q5DB1',
       'FormInfo.Part1A.Item5D.@Q5DB3', 'FormInfo.Part1A.Item5D.@Q5DC1',
       'FormInfo.Part1A.Item5D.@Q5DC3', 'FormInfo.Part1A.Item5D.@Q5DD1',
       'FormInfo.Part1A.Item5D.@Q5DD3', 'FormInfo.Part1A.Item5D.@Q5DE1',
       'FormInfo.Part1A.Item5D.@Q5DE3', 'FormInfo.Part1A.Item5D.@Q5DF1',
       'FormInfo.Part1A.Item5D.@Q5DF3', 'FormInfo.Part1A.Item5D.@Q5DG1',
       'FormInfo.Part1A.Item5D.@Q5DG3', 'FormInfo.Part1A.Item5D.@Q5DH1',
       'FormInfo.Part1A.Item5D.@Q5DH3', 'FormInfo.Part1A.Item5D.@Q5DI1',
       'FormInfo.Part1A.Item5D.@Q5DI3', 'FormInfo.Part1A.It

### Convert to numpy

In [33]:
# Regressor matrix as a NumPy array; column order follows regressor_cols.
X_ria = df_ria_clean[regressor_cols].to_numpy()

In [34]:
# Response vector: the Item1 @Q1I column of the cleaned frame.
# NOTE(review): the variable name (and the original trailing note) mention
# 'url_has_firm_specific', but the column actually read is @Q1I — confirm intent.
Y_url_firm_specific = df_ria_clean.loc[:, 'FormInfo.Part1A.Item1.@Q1I'].to_numpy()

In [35]:
# Preview the two-valued regressor columns of the matrix.
X_ria[:,bool_regressor_cols_i]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [36]:
# Preview the multi-valued regressor columns of the matrix.
X_ria[:,cont_regressor_cols_i]

array([[0.0000000e+00, 1.0000000e+00, 1.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [1.0000000e+00, 1.0000000e+00, 1.0000000e+00, ..., 0.0000000e+00,
        3.6000000e+06, 1.0000000e+00],
       [4.0000000e+00, 2.6000000e+01, 2.4000000e+01, ..., 0.0000000e+00,
        7.6000000e+07, 2.6000000e+01],
       ...,
       [0.0000000e+00, 3.0000000e+00, 2.0000000e+00, ..., 0.0000000e+00,
        6.9133149e+07, 3.0000000e+00],
       [0.0000000e+00, 2.0000000e+00, 1.0000000e+00, ..., 0.0000000e+00,
        8.7305898e+07, 2.0000000e+00],
       [0.0000000e+00, 3.0000000e+00, 1.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 3.0000000e+00]])

### Remove NZV regressors

#### booleans

In [37]:
# Drop two-valued regressors whose variance is at or below 0.95 * (1 - 0.95),
# i.e. the variance of a 0/1 variable that takes one value 95% of the time.
# NOTE(review): this reading assumes the two values are 0 and 1 — confirm.
bool_variance_floor = (0.95 * (1 - 0.95))
sel_not_nzv = VarianceThreshold(threshold=bool_variance_floor)
X_ria_bool_nzv = sel_not_nzv.fit_transform(X_ria[:, bool_regressor_cols_i])

In [38]:
# Names of the two-valued regressors that survived the variance filter.
bool_regressor_cols_not_nzv = sel_not_nzv.get_feature_names_out(
    input_features=bool_regressor_cols
)

In [39]:
# Shape (rows, columns) of the two-valued block after the variance filter.
X_ria_bool_nzv.shape

(32530, 86)

In [40]:
# Shape of the full, unfiltered regressor matrix, for comparison.
X_ria.shape

(32530, 536)

#### continuous

In [45]:
# For the multi-valued regressors only constant (zero-variance) columns are removed.
sel_not_zv_continuous = VarianceThreshold(threshold=0)
X_ria_cont_block = X_ria[:, cont_regressor_cols_i]
X_ria_cont_nzv = sel_not_zv_continuous.fit_transform(X_ria_cont_block)

In [46]:
# Shape (rows, columns) of the multi-valued block after the zero-variance filter.
X_ria_cont_nzv.shape

(32530, 76)

In [47]:
# Shape of the full, unfiltered regressor matrix, for comparison.
X_ria.shape

(32530, 536)

In [48]:
# Names of the multi-valued regressors that survived the zero-variance filter.
cont_regressor_cols_not_nzv = sel_not_zv_continuous.get_feature_names_out(
    input_features=cont_regressor_cols
)

### Regression

In [49]:
# Standardise each multi-valued column to zero mean and unit (population) standard deviation.
X_ria_cont_nzv_normed = stats.zscore(X_ria_cont_nzv, axis=0, ddof=0)

In [50]:
# Final design matrix: filtered two-valued columns first, then z-scored columns.
X_ria_reg = np.hstack((X_ria_bool_nzv, X_ria_cont_nzv_normed))

In [51]:
# Column names in the same order as the columns of X_ria_reg.
all_regressor_cols_not_nzv = np.concatenate(
    (bool_regressor_cols_not_nzv, cont_regressor_cols_not_nzv),
    axis=0,
)

In [52]:
# Persist the design matrix and the response vector to the working directory.
for npy_path, array_to_save in (
    ('X_ria_reg.npy', X_ria_reg),
    ('Y_url_firm_specific.npy', Y_url_firm_specific),
):
    np.save(npy_path, array_to_save)

In [53]:
# Short aliases used by the model-fitting cells.
X, Y = X_ria_reg, Y_url_firm_specific

In [54]:
# Fit an elastic-net-penalised logistic regression by stochastic gradient descent.
# No scaling pipeline is used here: X was built upstream from the variance-filtered
# two-valued columns plus the z-scored multi-valued columns.
# random_state pins the shuffling of the training data so the fitted coefficients
# are reproducible after a kernel restart (the estimator is otherwise unseeded).
clf_sgd = SGDClassifier(loss='log_loss',
                        penalty='elasticnet',
                        max_iter=100000,
                        tol=1e-3,
                        random_state=0)
clf_sgd.fit(X, Y)

In [55]:
# Show the class of the fitted estimator object.
type(clf_sgd)

sklearn.linear_model._stochastic_gradient.SGDClassifier

In [56]:
# Wrap the fitted SGD coefficient array in a DataFrame.
df_sgd = pd.DataFrame(data=clf_sgd.coef_)

In [57]:
# Label each coefficient with the name of its regressor.
df_sgd.columns = pd.Index(all_regressor_cols_not_nzv)

In [58]:
# The 20 regressors with the largest (most positive) SGD coefficients.
df_sgd.T.sort_values(by=0, ascending=False).iloc[:20]

Unnamed: 0,0
FormInfo.Part1A.Item5D.@Q5D2D,4.106307
FormInfo.Part1A.Item5D.@Q5DJ1,1.845063
FormInfo.Part1A.Item5D.@Q5DB1,1.568358
FormInfo.Part1A.Item5D.@Q5D1M,1.230528
FormInfo.Part1A.Item5A.@TtlEmp,1.111969
TotalEmp,1.088217
FormInfo.Part1A.Item5G.@Q5G8,1.082485
FormInfo.Part1A.Item1.@Q1F5,0.941206
FormInfo.Part1A.Item5D.@Q5DL3,0.856066
FormInfo.Part1A.Item5L.@Q5L1A,0.832551


In [59]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the same X and Y for comparison with the SGD fit.
lr_settings = {'solver': 'newton-cholesky', 'max_iter': 1000000}
clf_lr = LogisticRegression(**lr_settings)
clf_lr.fit(X, Y)

In [60]:
# Wrap the fitted logistic-regression coefficient array in a DataFrame.
df_lr = pd.DataFrame(data=clf_lr.coef_)

In [61]:
# Rebuild the ordered list of surviving regressor names (two-valued first, then
# multi-valued), matching the column order of the design matrix.
all_regressor_cols_not_nzv = np.concatenate(
    (bool_regressor_cols_not_nzv, cont_regressor_cols_not_nzv),
    axis=0,
)

In [62]:
# Label each coefficient with the name of its regressor.
df_lr.columns = pd.Index(all_regressor_cols_not_nzv)

In [63]:
# The 20 regressors with the largest (most positive) logistic-regression coefficients.
df_lr.T.sort_values(by=0, ascending=False).iloc[:20]

Unnamed: 0,0
FormInfo.Part1A.Item5A.@TtlEmp,1.653097
TotalEmp,1.584094
FormInfo.Part1A.Item1.@Q1F5,1.362268
FormInfo.Part1A.Item5D.@Q5DB1,1.274733
FormInfo.Part1A.Item5G.@Q5G8,1.074159
FormInfo.Part1A.Item5L.@Q5L1A,0.836692
Rgstn.@St_APPROVED,0.781516
FormInfo.Part1A.Item5D.@Q5DH1,0.749838
FormInfo.Part1A.Item5D.@Q5DF1,0.744644
FormInfo.Part1A.Item3A.@OrgFormNm_Limited Liability Company,0.696615


# Try the clustering example from the scikit-learn docs
### https://scikit-learn.org/stable/modules/clustering.html

In [172]:
import time
import warnings

import numpy as np
import matplotlib.pyplot as plt

from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice

In [173]:
# Seed NumPy's global random number generator so later draws from it are repeatable.
np.random.seed(0)

In [174]:
# Shape of the design matrix with the response appended as one extra column.
np.hstack([X_ria_reg, Y_url_firm_specific.reshape(-1, 1)]).shape

(32530, 163)

In [175]:
# Matrix for clustering: every regressor column plus the response as the last column.
response_column = Y_url_firm_specific.reshape(-1, 1)
dataset = np.hstack([X_ria_reg, response_column])

In [176]:
# One (data, parameters) pair describing the RIA matrix.
# NOTE: this rebinds the name `datasets`, hiding the sklearn `datasets` module
# imported earlier in the notebook.
# NOTE(review): the parameter dict is presumably merged over default_base, as in
# the scikit-learn clustering example — its use is not visible in this section.
ria_cluster_params = dict(
    eps=0.18,
    n_neighbors=2,
    min_samples=7,
    xi=0.01,
    min_cluster_size=0.2,
)
datasets = [(dataset, ria_cluster_params)]

In [177]:
# Baseline clustering parameters, keyed by parameter name.
default_base = dict(
    quantile=0.3,
    eps=0.3,
    damping=0.9,
    preference=-200,
    n_neighbors=3,
    n_clusters=3,
    min_samples=7,
    xi=0.05,
    min_cluster_size=0.1,
)