In [1]:
import gensim
import pandas as pd

In [2]:
df = pd.read_csv('data/word2vec_train.csv')

In [3]:
df.head()

Unnamed: 0,pid,practice_area,expertise,language,location
0,0,Immigration,"border policy, visa, permanent residency, immi...",Chinese,Dallas
1,1,Corporate,"M&A, Mergers, employment, contract, due dilige...",Spanish,Los Angeles
2,2,Employment,"Contracts, termination, payroll, NDA",Chinese,Washington
3,3,Immigration,"border, work visa, citizenship",Italian,Seattle
4,4,Family Law,"Divorce, child custody, alimony",English,New York


In [4]:
#analysis

print('Unique Languages: ',df.language.unique())
print('Unique Locations: ',df.location.unique())

print('current datatype for Expertise column: ', type(df.expertise[0]))
print('\n')
print(df.info())

Unique Languages:  ['Chinese' 'Spanish' 'Italian' 'English' 'French' 'Japanese' 'Korean'
 'Arabic' 'Hindi']
Unique Locations:  ['Dallas' 'Los Angeles' 'Washington' 'Seattle' 'New York' 'Miami'
 'Chicago' 'San Francisco']
current datatype for Expertise column:  <class 'str'>


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   pid            100 non-null    int64 
 1   practice_area  100 non-null    object
 2   expertise      100 non-null    object
 3   language       100 non-null    object
 4   location       100 non-null    object
dtypes: int64(1), object(4)
memory usage: 4.0+ KB
None


In [5]:
#clean up Expertise column: make lower case -> connect multi-word keywords with '_' -> conver text to list of keywords

def cleanup(text):
    
    return text.lower().replace(' ','_').split(',_')



In [6]:
#test
x = 'Border Policy, visa, permanent residency, immigration'
print(cleanup(x))

['border_policy', 'visa', 'permanent_residency', 'immigration']


In [7]:
#apply to df
df.expertise = df.expertise.apply(cleanup)

In [8]:
df.expertise

0     [border_policy, visa, permanent_residency, imm...
1     [m&a, mergers, employment, contract, due_dilig...
2                [contracts, termination, payroll, nda]
3                      [border, work_visa, citizenship]
4                     [divorce, child_custody, alimony]
                            ...                        
95    [real_estate_transactions, property_developmen...
96    [criminal_investigations, criminal_defense, cr...
97    [family_court_proceedings, child_custody_dispu...
98    [healthcare_regulations, medical_ethics, healt...
99    [tax_disputes, tax_planning, international_tax...
Name: expertise, Length: 100, dtype: object

In [11]:
#not needed
#gensim.utils.simple_preprocess('border policy, visa, permanent residency, immigration')

['border', 'policy', 'visa', 'permanent', 'residency', 'immigration']

In [9]:
model = gensim.models.Word2Vec(
    window = 3,
    min_count = 1,
    workers = 4
)

In [10]:
model.build_vocab(df.expertise, progress_per=1)

In [11]:
model.epochs

5

In [12]:
model.corpus_count

100

In [13]:
model.train(df.expertise, total_examples=model.corpus_count, epochs=model.epochs)

(1058, 1545)

In [14]:
model.wv.most_similar('immigration')

[('m&a', 0.26826635003089905),
 ('severance_agreements', 0.25243422389030457),
 ('real_estate_financing', 0.24010947346687317),
 ('real_estate_sales', 0.19859245419502258),
 ('immigration_waivers', 0.1934790313243866),
 ('visa_applications', 0.18896745145320892),
 ('mergers', 0.18883106112480164),
 ('labor_union_negotiations', 0.18606069684028625),
 ('embezzlement', 0.18393173813819885),
 ('ip_management', 0.17269083857536316)]

In [143]:
model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x168d8a3f850>

In [144]:
word_vectors = model.wv
vocabulary_size = len(word_vectors.key_to_index)
print("Vocabulary size:", vocabulary_size)

Vocabulary size: 216


In [145]:
word_vectors.key_to_index

{'criminal_appeals': 0,
 'corporate_governance': 1,
 'healthcare_compliance': 2,
 'tax_planning': 3,
 'ip_enforcement': 4,
 'criminal_defense': 5,
 'medical_ethics': 6,
 'workplace_discrimination': 7,
 'corporate_investigations': 8,
 'criminal_sentencing': 9,
 'ip_strategy': 10,
 'ip_management': 11,
 'refugee_law': 12,
 'asylum_petitions': 13,
 'immigration_appeals': 14,
 'corporate_litigation': 15,
 'child_custody_disputes': 16,
 'real_estate_transactions': 17,
 'adoption_proceedings': 18,
 'labor_negotiations': 19,
 'real_estate_disputes': 20,
 'property_development': 21,
 'corporate_compliance': 22,
 'guardianship': 23,
 'employment_contracts': 24,
 'international_taxation': 25,
 'compliance': 26,
 'medical_malpractice': 27,
 'tax_evasion': 28,
 'corporate_finance': 29,
 'family_mediation': 30,
 'child_protection': 31,
 'real_estate_arbitration': 32,
 'tax_litigation': 33,
 'deportation_defense': 34,
 'healthcare_regulation': 35,
 'healthcare_licensing': 36,
 'ip_portfolio_manageme

### Attemp 2


In [15]:
#prepend practice area before expertise

def combine(prac,exper):
     
    out=[]
    for i in exper:
        out.append(prac+'_'+i)
        
    return out


#build train data
td = []

for index, row in df.iterrows():
    #print(row['practice_area'], row['expertise'])
    td.append(combine(row['practice_area'], row['expertise']))

In [24]:
print('TEST: ',combine(df.practice_area[0],df.expertise[0]))

TEST:  ['Immigration_border_policy', 'Immigration_visa', 'Immigration_permanent_residency', 'Immigration_immigration']


In [26]:
td[0]

['Immigration_border_policy',
 'Immigration_visa',
 'Immigration_permanent_residency',
 'Immigration_immigration']

In [27]:
model2 = gensim.models.Word2Vec(
    window = 3,
    min_count = 1,
    workers = 4
)

model2.build_vocab(td, progress_per=1)

In [28]:
model2.epochs

5

In [29]:
model2.corpus_count

100

In [30]:
model2.train(td, total_examples=model.corpus_count, epochs=model.epochs)

(1059, 1545)

In [31]:
model2.wv.most_similar('immigration')

KeyError: "Key 'immigration' not present in vocabulary"

In [32]:
word_vectors2 = model2.wv
vocabulary_size = len(word_vectors2.key_to_index)
print("Vocabulary size:", vocabulary_size)

word_vectors2.key_to_index

Vocabulary size: 217


{'Criminal Law_criminal_appeals': 0,
 'Corporate_corporate_governance': 1,
 'Healthcare_healthcare_compliance': 2,
 'Tax Law_tax_planning': 3,
 'Intellectual Property_ip_enforcement': 4,
 'Criminal Law_criminal_defense': 5,
 'Employment_workplace_discrimination': 6,
 'Healthcare_medical_ethics': 7,
 'Family Law_adoption_proceedings': 8,
 'Intellectual Property_ip_strategy': 9,
 'Intellectual Property_ip_management': 10,
 'Immigration_asylum_petitions': 11,
 'Immigration_refugee_law': 12,
 'Corporate_corporate_litigation': 13,
 'Criminal Law_criminal_sentencing': 14,
 'Immigration_immigration_appeals': 15,
 'Employment_labor_negotiations': 16,
 'Real Estate_real_estate_transactions': 17,
 'Real Estate_real_estate_disputes': 18,
 'Real Estate_property_development': 19,
 'Corporate_corporate_investigations': 20,
 'Family Law_child_custody_disputes': 21,
 'Family Law_family_mediation': 22,
 'Employment_employment_contracts': 23,
 'Tax Law_tax_evasion': 24,
 'Family Law_guardianship': 25,
 

In [36]:
print(model2.wv.most_similar('Immigration_border_policy'))
print('NOT EVEN CLOSE!')

[('Criminal Law_juvenile_offenses', 0.26937511563301086), ('Corporate_corporate_finance', 0.22572247684001923), ('Healthcare_healthcare_privacy_laws', 0.21571440994739532), ('Criminal Law_theft', 0.1921129673719406), ('Immigration_refugee_law', 0.18741276860237122), ('Criminal Law_white-collar_crime', 0.18393783271312714), ('Criminal Law_capital_offenses', 0.17898088693618774), ('Real Estate_construction', 0.17114433646202087), ('Corporate_private_equity', 0.17092764377593994), ('Family Law_child_custody_disputes', 0.16574051976203918)]
NOT EVEN CLOSE!
