## POINTWISE MUTUAL INFORMATION
The idea of PMI is that we want to quantify the likelihood of co-occurrence of two words, taking into account the fact that it might be caused by the frequency of the single words. Hence, the algorithm computes the (log) probability of co-occurrence scaled by the product of the single probability of occurrence.
Now, knowing that, when ‘a’ and ‘b’ are independent, their joint probability is equal to the product of their marginal probabilities, when the ratio equals 1 (hence the log equals 0), it means that the two words together don’t form a unique concept: they co-occur by chance.
On the other hand, if either one of the words (or even both of them) has a low probability of occurrence if singularly considered, but its joint probability together with the other word is high, it means that the two are likely to express a unique concept.

In [1]:
import spacy
import string
import re
import pandas as pd
import numpy as np
import seaborn as sns
import random
import pickle
from unidecode import unidecode
import nltk
nltk.download('wordnet')
nltk.download('words')
nltk.download('punkt')

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import skipgrams
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords 
from spacy.lang.en import English

from tqdm import tqdm
from gensim import corpora
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.models import CoherenceModel
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures,TrigramCollocationFinder, TrigramAssocMeasures

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## PMI - DRUG

In [71]:
#Reload save pkl file
with open('drug.pkl', 'rb') as f:
    drug = pickle.load(f)

In [72]:
def get_pmi_value(word1, word2, matrix):
    return matrix.loc[matrix.index == word1, [word2]]

def pmi(df, positive=True):
    cols = df.sum(axis=0)
    total = cols.sum()
    rows = df.sum(axis=1)
    expected = np.outer(rows, cols) / total
    df_pmi = df / expected
    with np.errstate(divide='ignore'):
        df_pmi = np.log(df_pmi)
    df_pmi[np.array(np.isinf(df_pmi))] = 0.0 
    if positive:
        df_pmi[df_pmi < 0] = 0.0
    return df_pmi

def max_cooccurrences(df_column):
    return df_column.idxmax() , df_column.max()

In [73]:
vectorizer = CountVectorizer(min_df=0.01)
X = vectorizer.fit_transform(drug.Text)

In [74]:
pmi_drug = pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [75]:
asint = pmi_drug.astype(int)
drug_pmi = asint.T.dot(asint)

In [76]:
drug_pmi.values[[np.arange(drug_pmi.shape[0])]*2] = 0

  drug_pmi.values[[np.arange(drug_pmi.shape[0])]*2] = 0


In [77]:
drug_pmi

Unnamed: 0,undercover,cocaine,facing,away,according,remove,purse,phone,later,apparently,...,averment,disconnected,dying,doubtless,map,demurrer,repugnant,stole,rescind,assented
undercover,0,424,3,10,4,13,164,159,36,15,...,35,73,9,90,20,100,2,88,57,34
cocaine,424,0,41,44,16,11,848,440,142,73,...,92,231,29,166,68,344,16,360,63,89
facing,3,41,0,2,9,106,240,114,64,12,...,30,124,5,111,7,69,38,209,12,0
away,10,44,2,0,12,8,153,54,29,11,...,15,44,3,23,13,58,0,42,2,0
according,4,16,9,12,0,5,85,36,15,1,...,10,29,7,9,12,37,13,57,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
demurrer,100,344,69,58,37,81,852,594,311,39,...,699,558,95,305,173,0,29,879,105,21
repugnant,2,16,38,0,13,5,88,79,46,4,...,26,62,1,60,16,29,0,152,2,1
stole,88,360,209,42,57,317,1620,1252,781,38,...,366,1261,116,350,195,879,152,0,92,7
rescind,57,63,12,2,1,8,283,161,54,19,...,23,113,39,69,48,105,2,92,0,4


In [78]:
drug_occ = pmi(drug_pmi, positive=True)
drug_occ

Unnamed: 0,undercover,cocaine,facing,away,according,remove,purse,phone,later,apparently,...,averment,disconnected,dying,doubtless,map,demurrer,repugnant,stole,rescind,assented
undercover,0.000000,2.189862,0.000000,0.240559,0.000000,0.000000,0.000000,0.253358,0.000000,0.042002,...,0.184582,0.000000,0.000000,0.322359,0.000000,0.337529,0.000000,0.000000,1.086969,0.872124
cocaine,2.189862,0.000000,0.000000,0.478409,0.000000,0.000000,0.222529,0.027474,0.046289,0.380657,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.329246,0.000000,0.000000,0.000000,0.590646
facing,0.000000,0.000000,0.000000,0.000000,0.076030,1.469845,0.000000,0.000000,0.155157,0.000000,...,0.000000,0.059655,0.000000,0.194137,0.000000,0.000000,0.902901,0.000000,0.000000,0.000000
away,0.240559,0.478409,0.000000,0.000000,1.255746,0.000000,0.307931,0.000000,0.255603,0.285938,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.346893,0.000000,0.000000,0.000000,0.000000
according,0.000000,0.000000,0.076030,1.255746,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.340941,0.000000,0.131364,0.176142,1.001072,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
demurrer,0.337529,0.329246,0.000000,0.346893,0.176142,0.000000,0.000000,0.000000,0.422485,0.000000,...,1.527361,0.250151,0.464518,0.000000,0.315359,0.000000,0.000000,0.089779,0.046354,0.000000
repugnant,0.000000,0.000000,0.902901,0.000000,1.001072,0.000000,0.000000,0.000000,0.382232,0.000000,...,0.106705,0.000000,0.000000,0.136268,0.000000,0.000000,0.000000,0.205773,0.000000,0.000000
stole,0.000000,0.000000,0.000000,0.000000,0.000000,0.481886,0.000000,0.000000,0.573427,0.000000,...,0.110503,0.295611,0.000000,0.000000,0.000000,0.089779,0.205773,0.000000,0.000000,0.000000
rescind,1.086969,0.000000,0.000000,0.000000,0.000000,0.000000,0.028884,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.885762,0.000000,0.344828,0.046354,0.000000,0.000000,0.000000,0.000000


In [79]:
get_pmi_value('cocaine', 'undercover', drug_occ)

Unnamed: 0,undercover
cocaine,2.189862


In [80]:
max_cooccurrences(drug_occ)

(undercover    essentially
 cocaine        undercover
 facing           emphasis
 away             strategy
 according        stopping
                  ...     
 demurrer            tenth
 repugnant         despite
 stole              gather
 rescind           install
 assented           formal
 Length: 2340, dtype: object,
 undercover    3.120438
 cocaine       2.189862
 facing        3.609566
 away          2.743107
 according     3.034203
                 ...   
 demurrer      2.274755
 repugnant     2.920316
 stole         2.160492
 rescind       3.406246
 assented      3.872346
 Length: 2340, dtype: float64)

###  Bigrams/Trigrams - Drug

In [4]:
list = drug['Text'].tolist()

In [5]:
tokenized_sentences = []
for line in list:
    token = line.split()
    tokenized_sentences.append(token)

In [6]:
#bigrams (sequences of 2 words) and collapsed them into a unique term with the underscore symbol

finder = BigramCollocationFinder.from_documents(tokenized_sentences)
bgm = BigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}

In [7]:
collocations

{'reasonable_doubt': 1851.883656678027,
 'emphasis_added': 918.8051289713386,
 'armed_robbery': 420.1849018146785,
 'sexual_assault': 324.7248065449666,
 'closely_balanced': 183.8526161121839,
 'myocardial_infarction': 169.43279589415837,
 'willful_wanton': 167.44970765195927,
 'ingress_egress': 156.57454108766512,
 'brutal_heinous': 152.8204278331407,
 'contributory_negligence': 141.97316294037515,
 'manifest_evidence': 120.77897799284435,
 'emotional_distress': 118.40846333807846,
 'exceptionally_brutal': 87.90234069779292,
 'reckless_homicide': 87.73295186572341,
 'aggravation_mitigation': 85.99694507209225,
 'punitive_damages': 85.93391436266583,
 'anhydrous_ammonia': 84.4486019736842,
 'degree_murder': 52.83814687935476,
 'plea_guilty': 51.470552247362754,
 'improbable_unsatisfactory': 48.096717739306484,
 'uninsured_motorist': 45.451533307384345,
 'chloral_hydrate': 42.0,
 'unconstitutionally_vague': 39.81348167539267,
 'directed_verdict': 39.379996988927886,
 'reversed_reversed'

In [6]:
#trigrams (sequences of 3 words) and collapsed them into a unique term with the underscore symbol

finder = TrigramCollocationFinder.from_documents(tokenized_sentences)
tgm = TrigramAssocMeasures()
score = tgm.mi_like
collocations = {'_'.join(trigram): pmi for trigram, pmi in finder.score_ngrams(score)}

In [7]:
collocations

{'adenine_thymine_cytosine': 1.0,
 'cytosine_guanine_helical': 1.0,
 'dullness_egophony_bleating': 1.0,
 'egomaniac_megalomaniac_hypocrite': 1.0,
 'peripapillary_papilla_intraretinal': 1.0,
 'shyster_egomaniac_megalomaniac': 1.0,
 'thymine_cytosine_guanine': 1.0,
 'vegetarian_applesauce_garlic': 1.0,
 'webbing_alacritous_reoperate': 1.0,
 'barbas_sheard_verdigris': 0.5,
 'gummy_potassa_tannin': 0.5,
 'insecticide_fungicide_rodenticide': 0.5,
 'mucilaginous_gummy_potassa': 0.5,
 'perchlorethylene_nonflammable_tetrachloride': 0.5,
 'tetrachloride_perchlorethylene_nonflammable': 0.5,
 'varioloid_puerperal_membranous': 0.5,
 'alchemy_devilish_flirting': 0.3333333333333333,
 'hypnotism_mesmerism_legerdemain': 0.3333333333333333,
 'endorser_certifier_osmose': 0.25,
 'florid_hyperplasia_adenosis': 0.25,
 'guaiacol_glycerol_theobromine': 0.25,
 'hyaline_cytoplasm_cirrhotic': 0.25,
 'itchy_flattop_enchanting': 0.25,
 'maxillary_parotid_scrofulous': 0.25,
 'namesake_hammerhead_shark': 0.25,
 'qu

## PMI - WEAPONS

In [81]:
#Reload save pkl file
with open('weapons.pkl', 'rb') as f:
    weapons = pickle.load(f)

In [14]:
vectorizer = CountVectorizer(min_df=0.01)
X = vectorizer.fit_transform(weapons.Text)

In [15]:
pmi_weapons = pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [16]:
asint = pmi_weapons.astype(int)
weapons_pmi = asint.T.dot(asint)

In [17]:
weapons_pmi.values[[np.arange(weapons_pmi.shape[0])]*2] = 0

  weapons_pmi.values[[np.arange(weapons_pmi.shape[0])]*2] = 0


In [18]:
get_pmi_value('pistol', 'murder', weapons_pmi)

Unnamed: 0,murder
pistol,34


In [19]:
weapons_occ = pmi(weapons_pmi, positive=True)
weapons_occ

Unnamed: 0,guilty,degree,murder,burglary,proven,reasonable,doubt,agree,conviction,homicide,...,utterly,occupancy,practiced,distant,replication,assumpsit,averment,indispensable,eighty,compensate
guilty,0.000000,1.395181,0.000000,1.098079,0.000000,0.000000,0.206415,0.076675,0.000000,0.182222,...,1.105455,1.061337,0.240021,0.000000,0.000000,0.000000,0.220249,0.000000,0.000000,1.067001
degree,1.395181,0.000000,0.000000,0.596283,0.000000,0.000000,0.400480,0.023703,0.000000,0.066061,...,0.319316,0.547350,0.000000,0.001150,0.000000,0.024353,0.000000,0.000000,0.000000,0.000000
murder,0.000000,0.000000,0.000000,0.000000,0.776977,0.794285,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.305060,0.745701,0.000000,0.308948,0.000000
burglary,1.098079,0.596283,0.000000,0.000000,0.657697,0.069935,0.000000,0.000000,0.000000,0.491707,...,0.492217,0.511613,0.000000,0.194569,0.000000,0.000000,0.000000,0.000000,0.000000,0.821488
proven,0.000000,0.000000,0.776977,0.657697,0.000000,0.353437,2.660074,0.000000,0.000000,0.028573,...,0.000000,0.000000,0.000000,0.000000,0.059120,0.583793,0.019347,0.246217,0.076855,0.070673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
assumpsit,0.000000,0.024353,0.305060,0.000000,0.583793,0.000000,0.247727,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.862967,0.016954,0.000000,0.000000
averment,0.220249,0.000000,0.745701,0.000000,0.019347,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.063500,0.543365,1.862967,0.000000,0.110610,0.019833,0.000000
indispensable,0.000000,0.000000,0.000000,0.000000,0.246217,0.000000,0.000000,0.017280,0.000000,0.142839,...,0.000000,0.000000,0.121092,0.084239,0.977589,0.016954,0.110610,0.000000,0.970580,0.000000
eighty,0.000000,0.000000,0.308948,0.000000,0.076855,0.000000,0.000000,0.000000,0.078487,0.109639,...,0.000000,0.000000,0.000000,0.323823,1.408859,0.000000,0.019833,0.970580,0.000000,0.000000


In [20]:
get_pmi_value('pistol', 'murder', weapons_occ)

Unnamed: 0,murder
pistol,0.0


In [21]:
max_cooccurrences(weapons_occ)

(guilty           discretion
 degree           immaterial
 murder            reasoning
 burglary              speak
 proven              falling
                     ...    
 assumpsit          averment
 averment              recur
 indispensable    injunctive
 eighty                 lose
 compensate         evidence
 Length: 2419, dtype: object,
 guilty           2.966372
 degree           2.004413
 murder           2.102548
 burglary         3.023862
 proven           2.880192
                    ...   
 assumpsit        1.862967
 averment         2.173839
 indispensable    1.341409
 eighty           1.822589
 compensate       3.269967
 Length: 2419, dtype: float64)

### Bigrams/Trigrams - Weapons

In [82]:
list = weapons['Text'].tolist()

In [83]:
tokenized_sentences = []
for line in list:
    token = line.split()
    tokenized_sentences.append(token)

In [84]:
#bigrams (sequences of 2 words) and collapsed them into a unique term with the underscore symbol

finder = BigramCollocationFinder.from_documents(tokenized_sentences)
bgm = BigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}

In [85]:
collocations

{'reasonable_doubt': 9248.295380297299,
 'armed_robbery': 4119.140889208582,
 'degree_murder': 1666.435892896166,
 'armed_violence': 1641.3474286888038,
 'sexual_assault': 1498.1901499448254,
 'emphasis_added': 1150.5285295569174,
 'brutal_heinous': 998.8657227633362,
 'closely_balanced': 857.9126564999983,
 'exceptionally_brutal': 516.8650997613365,
 'wanton_cruelty': 326.3752555911446,
 'aggravation_mitigation': 298.84242579693506,
 'improbable_unsatisfactory': 229.06934088788435,
 'sudden_intense': 201.04404303746264,
 'willful_wanton': 196.03028864479896,
 'indicative_wanton': 195.17979386986065,
 'promote_facilitate': 186.96780210060808,
 'heinous_indicative': 167.02935943360757,
 'newly_discovered': 151.5542829267132,
 'deadly_weapon': 138.6699273552881,
 'frivolous_patently': 134.33311690819835,
 'appreciate_criminality': 125.42480705581606,
 'manifestly_erroneous': 124.03465843439777,
 'gunshot_wound': 111.40877301633786,
 'useful_citizenship': 91.08134127265797,
 'unlawful_res

In [86]:
#trigrams (sequences of 3 words) and collapsed them into a unique term with the underscore symbol

finder = TrigramCollocationFinder.from_documents(tokenized_sentences)
bgm = TrigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(trigram): pmi for trigram, pmi in finder.score_ngrams(score)}

In [87]:
collocations

{'aquamarine_topaz_birthstone': 1.0,
 'chondromalacia_bunion_hammertoe': 1.0,
 'confectioner_taffy_candied': 1.0,
 'manipulator_intimidator_assaulter': 1.0,
 'measles_mumps_rubella': 1.0,
 'polycyclic_aromatic_pathogenic': 1.0,
 'ammoniacal_clinker_naphthalene': 0.5,
 'benzene_polycyclic_aromatic': 0.5,
 'chromate_arsenate_creosote': 0.5,
 'dicalcium_dicalcic_tricalcium': 0.5,
 'beet_carbonation_sulphitation': 0.3333333333333333,
 'ferrous_calypso_cabinetry': 0.25,
 'hypocoristic_suffix_suffix': 0.25,
 'importune_entreat_implore': 0.25,
 'papal_massacre_dominus': 0.25,
 'sphenoid_cortex_cerebrum': 0.25,
 'wale_strake_abaft': 0.25,
 'exceptionally_brutal_heinous': 0.2253556659284583,
 'metastatic_parotid_carcinoma': 0.2222222222222222,
 'anchorage_outcoming_ingoing': 0.2,
 'bronchospasm_endometritis_diaphoresis': 0.2,
 'douche_sleazy_nightie': 0.2,
 'adenocarcinoma_estrogen_progesterone': 0.16666666666666666,
 'airedale_terrier_schnauzer': 0.16666666666666666,
 'candied_shelled_marzipan

## PMI - ACCIDENT

In [15]:
#Reload save pkl file
with open('accident.pkl', 'rb') as f:
    accident = pickle.load(f)

In [23]:
vectorizer = CountVectorizer(min_df=0.01)
X = vectorizer.fit_transform(accident.Text)

In [24]:
pmi_accident = pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [25]:
asint = pmi_accident.astype(int)
accident_pmi = asint.T.dot(asint)

In [26]:
accident_pmi.values[[np.arange(accident_pmi.shape[0])]*2] = 0

  accident_pmi.values[[np.arange(accident_pmi.shape[0])]*2] = 0


In [27]:
get_pmi_value('motorman','jurisprudence', accident_pmi)

Unnamed: 0,jurisprudence
motorman,2188


In [28]:
accident_occ = pmi(accident_pmi, positive=True)
accident_occ

Unnamed: 0,arose,declaratory,sought,extent,coverage,provided,occurrence,limitation,affirm,covered,...,pursuance,eighty,alongside,expenditure,motorman,jurisprudence,solicitor,tenth,contemporaneous,daylight
arose,0.000000,2.868897,0.557986,0.000000,0.043556,0.000000,1.190169,0.000000,0.168542,0.782283,...,0.401715,0.315792,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.433465
declaratory,2.868897,0.000000,0.497670,0.000000,0.100977,0.008697,0.449493,0.000000,0.000000,0.057403,...,0.225035,0.000000,0.000000,0.000000,0.290899,0.249676,0.074085,0.000000,0.000000,0.000000
sought,0.557986,0.497670,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.446430,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.656428,0.000000,0.000000,0.000000,0.000000
extent,0.000000,0.000000,0.000000,0.000000,0.331779,0.000000,0.191583,0.000000,0.000000,0.000000,...,0.000000,0.438061,0.646588,0.000000,0.055151,0.000000,0.000000,0.000000,0.000000,0.739656
coverage,0.043556,0.100977,0.000000,0.331779,0.000000,0.417106,0.000000,0.000000,0.000000,0.007982,...,0.000000,0.050057,0.190295,0.566370,0.000000,0.000000,0.067980,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
jurisprudence,0.000000,0.249676,0.656428,0.000000,0.000000,0.000000,0.469968,0.000000,0.000000,0.000000,...,0.054251,0.694573,1.231997,1.147415,1.710730,0.000000,0.167795,0.000000,0.000000,0.000000
solicitor,0.000000,0.074085,0.000000,0.000000,0.067980,0.000000,0.135654,0.000000,0.474374,0.437042,...,1.484110,0.121551,0.000000,0.000000,0.000000,0.167795,0.000000,0.157396,0.000000,0.000000
tenth,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.259667,0.000000,0.000000,0.000000,...,0.637159,0.008849,0.000000,0.000000,0.000000,0.000000,0.157396,0.000000,0.067054,0.000000
contemporaneous,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.018231,0.000000,0.000000,...,0.000000,0.000000,0.415831,0.000000,0.323456,0.000000,0.000000,0.067054,0.000000,0.768607


In [29]:
get_pmi_value('motorman','jurisprudence', accident_occ)

Unnamed: 0,jurisprudence
motorman,1.71073


In [30]:
max_cooccurrences(accident_occ)

(arose              declaratory
 declaratory              arose
 sought             termination
 extent                   shock
 coverage            eventually
                       ...     
 jurisprudence         opposite
 solicitor              consent
 tenth                    weigh
 contemporaneous         repeat
 daylight            eyewitness
 Length: 2179, dtype: object,
 arose              2.868897
 declaratory        2.868897
 sought             2.283645
 extent             2.269953
 coverage           0.955734
                      ...   
 jurisprudence      2.416352
 solicitor          1.796480
 tenth              1.761466
 contemporaneous    3.013553
 daylight           3.140476
 Length: 2179, dtype: float64)

### Bigrams/Trigrams - Accident

In [56]:
list = accident['Text'].tolist()

In [57]:
tokenized_sentences = []
for line in list:
    token = line.split()
    tokenized_sentences.append(token)

In [58]:
#bigrams (sequences of 2 words) and collapsed them into a unique term with the underscore symbol

finder = BigramCollocationFinder.from_documents(tokenized_sentences)
bgm = BigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}

In [59]:
collocations

{'emphasis_added': 1332.5375234956866,
 'uninsured_motorist': 996.5753167367814,
 'contributory_negligence': 486.38534411055343,
 'reasonable_doubt': 431.999543948073,
 'manifest_evidence': 352.0640968345486,
 'armed_robbery': 289.67621571580963,
 'willful_wanton': 283.2622807220624,
 'wanton_misconduct': 281.543357727747,
 'emotional_distress': 236.97508032229547,
 'preliminary_injunction': 219.25739801437018,
 'sexual_assault': 212.43687627309188,
 'motorist_coverage': 203.28706958012117,
 'directed_verdict': 167.45591187229783,
 'punitive_damages': 104.86330869621995,
 'ingress_egress': 99.83471351546578,
 'tort_immunity': 93.34403202470122,
 'notwithstanding_verdict': 66.71265308178846,
 'undue_influence': 66.0789027466239,
 'excited_utterance': 64.33340274343423,
 'infliction_emotional': 54.10813116992229,
 'reversed_reversed': 53.45318532863916,
 'contrary_manifest': 52.3565341054207,
 'opponent_overwhelmingly': 51.69085545722714,
 'aspect_favorable': 50.58538611563848,
 'exercis

In [20]:
#trigrams (sequences of 3 words) and collapsed them into a unique term with the underscore symbol

finder = TrigramCollocationFinder.from_documents(tokenized_sentences)
bgm = TrigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(trigram): pmi for trigram, pmi in finder.score_ngrams(score)}

In [21]:
collocations

{'aurum_argentum_cuprum': 1.0,
 'oxidase_carbohydrate_fermentation': 1.0,
 'pudding_myrrh_frankincense': 1.0,
 'cheetah_margay_lynx': 0.5,
 'coater_bodymaker_slitter': 0.5,
 'dissembler_hypocrite_pretender': 0.5,
 'effluence_putrescible_intercepter': 0.5,
 'ocelot_cheetah_margay': 0.5,
 'ped_episcopalian_churchgoer': 0.5,
 'raving_madman_monomaniac': 0.5,
 'thoracotomy_decortication_apical': 0.5,
 'toughen_psyche_pamper': 0.4444444444444444,
 'godson_namesake_momo': 0.3333333333333333,
 'hak_kung_sari': 0.3333333333333333,
 'lamination_stratification_mica': 0.3333333333333333,
 'namesake_momo_nonfiction': 0.3333333333333333,
 'ancon_eu_guildhall': 0.25,
 'creamy_blink_wrinkle': 0.25,
 'ductile_ferrous_silicate': 0.25,
 'eu_guildhall_hansa': 0.25,
 'follicular_adenoma_lobule': 0.25,
 'stupidly_obstinately_recreant': 0.25,
 'dah_dah_dah': 0.216,
 'fourfold_dharma_artha': 0.2,
 'lynx_bobcat_hyena': 0.2,
 'margay_lynx_bobcat': 0.2,
 'nonreligion_paty_tripartite': 0.2,
 'seismograph_geophys

## PMI - FINANCE

In [22]:
#Reload save pkl file
with open('finance.pkl', 'rb') as f:
    finance = pickle.load(f)

In [32]:
vectorizer = CountVectorizer(min_df=0.01)
X = vectorizer.fit_transform(finance.Text)

In [33]:
pmi_finance= pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [34]:
asint = pmi_finance.astype(int)
finance_pmi = asint.T.dot(asint)

In [35]:
finance_pmi.values[[np.arange(finance_pmi.shape[0])]*2] = 0

  finance_pmi.values[[np.arange(finance_pmi.shape[0])]*2] = 0


In [37]:
finance_pmi

Unnamed: 0,following,determined,bid,damages,pressure,begun,raise,provided,intended,lend,...,irregularity,distinctly,scarcely,effectual,prop,defence,evade,whilst,embrace,chitty
following,0,665,29,252,134,123,19,11,537,28,...,33,57,171,22,168,97,269,7,76,37
determined,665,0,114,1067,389,365,80,55,1740,100,...,101,141,617,47,634,358,754,34,343,100
bid,29,114,0,396,175,145,21,13,540,42,...,31,69,163,15,169,150,181,32,124,21
damages,252,1067,396,0,3246,1456,265,164,3229,431,...,293,548,1708,584,2415,1264,2242,242,1494,300
pressure,134,389,175,3246,0,483,180,58,1546,317,...,158,305,731,375,1532,611,1026,150,949,156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
defence,97,358,150,1264,611,339,66,43,1145,144,...,144,140,1216,424,4126,0,658,70,384,127
evade,269,754,181,2242,1026,1067,88,145,3894,489,...,233,1852,1165,298,839,658,0,145,1607,147
whilst,7,34,32,242,150,128,28,11,234,17,...,63,32,120,11,90,70,145,0,98,41
embrace,76,343,124,1494,949,763,80,93,1267,261,...,51,625,843,181,538,384,1607,98,0,72


In [39]:
get_pmi_value('evade', 'debt', finance_pmi)

Unnamed: 0,debt
evade,4315


In [None]:
finance_occ = pmi(finance_pmi, positive=True)
finance_occ

In [40]:
max_cooccurrences(finance_pmi)

(following     knowledge
 determined    knowledge
 bid           knowledge
 damages       knowledge
 pressure      knowledge
                 ...    
 defence       knowledge
 evade         knowledge
 whilst        knowledge
 embrace       knowledge
 chitty        knowledge
 Length: 1805, dtype: object,
 following      4355
 determined    19212
 bid            5231
 damages       78292
 pressure      38575
               ...  
 defence       17946
 evade         52001
 whilst         4239
 embrace       43270
 chitty         5817
 Length: 1805, dtype: int32)

### Bigrams/Trigrams - Finance

In [23]:
list = finance['Text'].tolist()

In [24]:
#bigrams (sequences of 2 words) and collapsed them into a unique term with the underscore symbol

finder = BigramCollocationFinder.from_documents(tokenized_sentences)
bgm = BigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}

In [25]:
collocations

{'emphasis_added': 1332.5375234956866,
 'uninsured_motorist': 996.5753167367814,
 'contributory_negligence': 486.38534411055343,
 'reasonable_doubt': 431.999543948073,
 'manifest_evidence': 352.0640968345486,
 'armed_robbery': 289.67621571580963,
 'willful_wanton': 283.2622807220624,
 'wanton_misconduct': 281.543357727747,
 'emotional_distress': 236.97508032229547,
 'preliminary_injunction': 219.25739801437018,
 'sexual_assault': 212.43687627309188,
 'motorist_coverage': 203.28706958012117,
 'directed_verdict': 167.45591187229783,
 'punitive_damages': 104.86330869621995,
 'ingress_egress': 99.83471351546578,
 'tort_immunity': 93.34403202470122,
 'notwithstanding_verdict': 66.71265308178846,
 'undue_influence': 66.0789027466239,
 'excited_utterance': 64.33340274343423,
 'infliction_emotional': 54.10813116992229,
 'reversed_reversed': 53.45318532863916,
 'contrary_manifest': 52.3565341054207,
 'opponent_overwhelmingly': 51.69085545722714,
 'aspect_favorable': 50.58538611563848,
 'exercis

In [26]:
#trigrams (sequences of 3 words) and collapsed them into a unique term with the underscore symbol

finder = TrigramCollocationFinder.from_documents(tokenized_sentences)
bgm = TrigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(trigram): pmi for trigram, pmi in finder.score_ngrams(score)}

In [27]:
collocations

{'aurum_argentum_cuprum': 1.0,
 'oxidase_carbohydrate_fermentation': 1.0,
 'pudding_myrrh_frankincense': 1.0,
 'cheetah_margay_lynx': 0.5,
 'coater_bodymaker_slitter': 0.5,
 'dissembler_hypocrite_pretender': 0.5,
 'effluence_putrescible_intercepter': 0.5,
 'ocelot_cheetah_margay': 0.5,
 'ped_episcopalian_churchgoer': 0.5,
 'raving_madman_monomaniac': 0.5,
 'thoracotomy_decortication_apical': 0.5,
 'toughen_psyche_pamper': 0.4444444444444444,
 'godson_namesake_momo': 0.3333333333333333,
 'hak_kung_sari': 0.3333333333333333,
 'lamination_stratification_mica': 0.3333333333333333,
 'namesake_momo_nonfiction': 0.3333333333333333,
 'ancon_eu_guildhall': 0.25,
 'creamy_blink_wrinkle': 0.25,
 'ductile_ferrous_silicate': 0.25,
 'eu_guildhall_hansa': 0.25,
 'follicular_adenoma_lobule': 0.25,
 'stupidly_obstinately_recreant': 0.25,
 'dah_dah_dah': 0.216,
 'fourfold_dharma_artha': 0.2,
 'lynx_bobcat_hyena': 0.2,
 'margay_lynx_bobcat': 0.2,
 'nonreligion_paty_tripartite': 0.2,
 'seismograph_geophys

## PMI - HOSPITAL

In [28]:
#Reload save pkl file
with open('hospital.pkl', 'rb') as f:
    hospital = pickle.load(f)

In [42]:
vectorizer = CountVectorizer(min_df=0.01)
X = vectorizer.fit_transform(hospital.Text)

In [43]:
pmi= pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [44]:
asint = pmi.astype(int)
_pmi = asint.T.dot(asint)

In [45]:
_pmi.values[np.array([np.arange(_pmi.shape[0])]*2)] = 0

In [47]:
_pmi

Unnamed: 0,decision,correctly,assessed,usually,raised,remove,inch,bases,extend,privilege,...,badly,forcible,detainer,mainly,partitioned,upwards,inception,throw,endeavor,revert
decision,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
correctly,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
assessed,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
usually,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
raised,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
upwards,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
inception,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
throw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
endeavor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
hospital_occ = pmi(_pmi, positive=True)
hospital_occ

In [None]:
get_pmi_value('hospital', '', _pmi)

In [46]:
max_cooccurrences(_pmi)

(decision     decision
 correctly    decision
 assessed     decision
 usually      decision
 raised       decision
                ...   
 upwards      decision
 inception    decision
 throw        decision
 endeavor     decision
 revert       decision
 Length: 1503, dtype: object,
 decision     0
 correctly    0
 assessed     0
 usually      0
 raised       0
             ..
 upwards      0
 inception    0
 throw        0
 endeavor     0
 revert       0
 Length: 1503, dtype: int32)

### Bigrams/Trigrams - Hospital

In [29]:
list = hospital['Text'].tolist()

In [30]:
tokenized_sentences = []
for line in list:
    token = line.split()
    tokenized_sentences.append(token)

In [31]:
#bigrams (sequences of 2 words) and collapsed them into a unique term with the underscore symbol

finder = BigramCollocationFinder.from_documents(tokenized_sentences)
bgm = BigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}

In [32]:
collocations

{'undue_influence': 467.9304371002132,
 'extreme_repeated': 154.4571736517017,
 'repeated_cruelty': 75.49634273772205,
 'emphasis_added': 53.04050980882169,
 'habitual_drunkenness': 32.814413655848135,
 'manifest_evidence': 28.597849287671078,
 'methylene_chloride': 26.0,
 'wanton_misconduct': 22.08731732393332,
 'willful_wanton': 19.963953594592127,
 'newly_discovered': 19.145604674796747,
 'reversed_reversed': 18.625734539139895,
 'devise_bequeath': 17.821201192997762,
 'uninsured_motorist': 16.93121693121693,
 'contributory_negligence': 13.932836283405226,
 'emotional_distress': 12.799265284702178,
 'forcible_detainer': 11.71875,
 'unsound_mind': 10.91891516477411,
 'mensa_thoro': 10.0,
 'donative_intent': 8.896326091978265,
 'apart_fault': 8.670891782305388,
 'confidence_reposed': 8.626254804107154,
 'punitive_damages': 8.181206045189287,
 'torture_discommode': 8.1,
 'contrary_manifest': 7.78397366592329,
 'divided_equally': 6.661387328210637,
 'habitual_drunkard': 6.40204752275025

In [33]:
#trigrams (sequences of 3 words) and collapsed them into a unique term with the underscore symbol

finder = TrigramCollocationFinder.from_documents(tokenized_sentences)
bgm = TrigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(trigram): pmi for trigram, pmi in finder.score_ngrams(score)}

In [34]:
collocations

{'blasphemy_debauchery_atheistical': 1.0,
 'debauchery_atheistical_irreligious': 1.0,
 'drowsy_diffident_uncaring': 1.0,
 'endocarditis_aortic_mitral': 1.0,
 'insecticide_fungicide_rodenticide': 1.0,
 'lipoid_adrenal_cortex': 1.0,
 'scattering_melt_scarify': 1.0,
 'soaking_potassium_permanganate': 1.0,
 'verrucous_endocarditis_aortic': 1.0,
 'tubercle_bacillus_organism': 0.6666666666666666,
 'agitate_inhale_fume': 0.5,
 'childishness_orientation_untidiness': 0.5,
 'heresy_dogma_sect': 0.5,
 'hyperemia_subconjunctival_ecchymosis': 0.5,
 'orientation_untidiness_incoherence': 0.5,
 'prolapse_hyperemia_subconjunctival': 0.5,
 'rais_mong_fierce': 0.5,
 'splenitis_fatty_infiltration': 0.5,
 'subsidize_zest_cumulation': 0.5,
 'taciturn_tinge_misanthrope': 0.5,
 'aortic_mitral_mucous': 0.3333333333333333,
 'cystotomy_perivesical_retroperitoneal': 0.3333333333333333,
 'demean_belittle_tattle': 0.3333333333333333,
 'embolus_positioned_statistically': 0.3333333333333333,
 'fetid_purulent_ascendin

## PMI - SEXUAL

In [35]:
#Reload save pkl file
with open('sexual.pkl', 'rb') as f:
    sexual = pickle.load(f)

In [78]:
vectorizer = CountVectorizer(min_df=0.01)
X = vectorizer.fit_transform(sexual.Text)

In [79]:
pmi= pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [80]:
asint = pmi.astype(int)
_pmi = asint.T.dot(asint)

In [81]:
_pmi.values[[np.arange(_pmi.shape[0])]*2] = 0

  _pmi.values[[np.arange(_pmi.shape[0])]*2] = 0


In [82]:
_pmi

Unnamed: 0,brought,stated,following,deceptive,imposition,unenforceable,penalty,breach,duty,dealing,...,northerly,exhaustive,forbidden,transact,divest,destination,rarely,overturn,latitude,universally
brought,0,356,7,108,50,25,2,31,113,3,...,9,8,75,4,133,51,59,17,46,25
stated,356,0,8,323,196,212,19,64,311,39,...,9,41,150,14,441,98,140,76,37,45
following,7,8,0,68,30,32,9,2,137,23,...,0,6,36,8,37,24,25,7,25,22
deceptive,108,323,68,0,1140,313,78,136,753,166,...,182,103,515,109,799,221,395,306,169,197
imposition,50,196,30,1140,0,124,62,23,398,149,...,113,98,337,121,618,113,167,142,71,117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
destination,51,98,24,221,113,40,18,25,125,27,...,8,12,233,35,703,0,127,30,23,0
rarely,59,140,25,395,167,85,23,18,373,133,...,115,227,174,17,235,127,0,196,47,22
overturn,17,76,7,306,142,65,12,20,195,47,...,174,121,154,1,88,30,196,0,27,23
latitude,46,37,25,169,71,44,27,6,85,27,...,3,92,53,6,44,23,47,27,0,41


In [None]:
sexual_occ = pmi(_pmi, positive=True)
sexual_occ

In [88]:
get_pmi_value('anal', 'felony', _pmi)

In [None]:
max_cooccurrences(_pmi)

### Bigrams/Trigrams - Sexual

In [36]:
list =  hospital['Text'].tolist()

In [37]:
tokenized_sentences = []
for line in list:
    token = line.split()
    tokenized_sentences.append(token)

In [38]:
#bigrams (sequences of 2 words) and collapsed them into a unique term with the underscore symbol

finder = BigramCollocationFinder.from_documents(tokenized_sentences)
bgm = BigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}

In [39]:
collocations

{'undue_influence': 467.9304371002132,
 'extreme_repeated': 154.4571736517017,
 'repeated_cruelty': 75.49634273772205,
 'emphasis_added': 53.04050980882169,
 'habitual_drunkenness': 32.814413655848135,
 'manifest_evidence': 28.597849287671078,
 'methylene_chloride': 26.0,
 'wanton_misconduct': 22.08731732393332,
 'willful_wanton': 19.963953594592127,
 'newly_discovered': 19.145604674796747,
 'reversed_reversed': 18.625734539139895,
 'devise_bequeath': 17.821201192997762,
 'uninsured_motorist': 16.93121693121693,
 'contributory_negligence': 13.932836283405226,
 'emotional_distress': 12.799265284702178,
 'forcible_detainer': 11.71875,
 'unsound_mind': 10.91891516477411,
 'mensa_thoro': 10.0,
 'donative_intent': 8.896326091978265,
 'apart_fault': 8.670891782305388,
 'confidence_reposed': 8.626254804107154,
 'punitive_damages': 8.181206045189287,
 'torture_discommode': 8.1,
 'contrary_manifest': 7.78397366592329,
 'divided_equally': 6.661387328210637,
 'habitual_drunkard': 6.40204752275025

In [40]:
#trigrams (sequences of 3 words) and collapsed them into a unique term with the underscore symbol

finder = TrigramCollocationFinder.from_documents(tokenized_sentences)
bgm = TrigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(trigram): pmi for trigram, pmi in finder.score_ngrams(score)}

In [41]:
collocations

{'blasphemy_debauchery_atheistical': 1.0,
 'debauchery_atheistical_irreligious': 1.0,
 'drowsy_diffident_uncaring': 1.0,
 'endocarditis_aortic_mitral': 1.0,
 'insecticide_fungicide_rodenticide': 1.0,
 'lipoid_adrenal_cortex': 1.0,
 'scattering_melt_scarify': 1.0,
 'soaking_potassium_permanganate': 1.0,
 'verrucous_endocarditis_aortic': 1.0,
 'tubercle_bacillus_organism': 0.6666666666666666,
 'agitate_inhale_fume': 0.5,
 'childishness_orientation_untidiness': 0.5,
 'heresy_dogma_sect': 0.5,
 'hyperemia_subconjunctival_ecchymosis': 0.5,
 'orientation_untidiness_incoherence': 0.5,
 'prolapse_hyperemia_subconjunctival': 0.5,
 'rais_mong_fierce': 0.5,
 'splenitis_fatty_infiltration': 0.5,
 'subsidize_zest_cumulation': 0.5,
 'taciturn_tinge_misanthrope': 0.5,
 'aortic_mitral_mucous': 0.3333333333333333,
 'cystotomy_perivesical_retroperitoneal': 0.3333333333333333,
 'demean_belittle_tattle': 0.3333333333333333,
 'embolus_positioned_statistically': 0.3333333333333333,
 'fetid_purulent_ascendin

## PMI - DIVORCE

In [42]:
#Reload save pkl file
with open('divorce.pkl', 'rb') as f:
    divorce = pickle.load(f)

In [60]:
vectorizer = CountVectorizer(min_df=0.01)
X = vectorizer.fit_transform(divorce.Text)

In [61]:
pmi= pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [62]:
asint = pmi.astype(int)
_pmi = asint.T.dot(asint)

In [63]:
_pmi.values[[np.arange(_pmi.shape[0])]*2] = 0

  _pmi.values[[np.arange(_pmi.shape[0])]*2] = 0


In [64]:
_pmi

Unnamed: 0,decision,correctly,assessed,usually,raised,remove,inch,bases,extend,privilege,...,badly,forcible,detainer,mainly,partitioned,upwards,inception,throw,endeavor,revert
decision,0,133,12,42,43,11,21,6,3,4,...,9,1,9,11,9,6,32,2,10,6
correctly,133,0,5,111,152,91,45,13,3,39,...,72,2,10,24,168,70,60,12,36,16
assessed,12,5,0,9,16,7,12,1,0,7,...,5,2,0,7,3,4,12,0,2,1
usually,42,111,9,0,149,62,7,3,1,3,...,13,1,8,3,3,4,29,2,17,2
raised,43,152,16,149,0,216,155,21,29,125,...,252,37,92,113,148,44,214,9,86,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
upwards,6,70,4,4,44,12,13,0,4,10,...,23,1,8,13,77,0,25,5,12,0
inception,32,60,12,29,214,61,72,6,15,132,...,106,44,221,65,73,25,0,8,112,4
throw,2,12,0,2,9,7,20,1,2,7,...,33,1,0,9,2,5,8,0,7,5
endeavor,10,36,2,17,86,36,55,0,2,17,...,167,13,54,99,69,12,112,7,0,16


In [None]:
divorce_occ = pmi(_pmi, positive=True)
divorce_occ

In [66]:
get_pmi_value('annulment', 'married', _pmi)

Unnamed: 0,married
annulment,9


In [67]:
max_cooccurrences(_pmi)

(decision     discover
 correctly    discover
 assessed     disagree
 usually      discover
 raised       discover
                ...   
 upwards      discover
 inception    discover
 throw          formal
 endeavor     discover
 revert       discover
 Length: 1503, dtype: object,
 decision      579
 correctly    2820
 assessed      473
 usually       823
 raised       6805
              ... 
 upwards       524
 inception    3841
 throw         753
 endeavor     2053
 revert        396
 Length: 1503, dtype: int32)

### Bigrams/Trigrams - Divorce

In [43]:
list =  divorce['Text'].tolist()

In [44]:
tokenized_sentences = []
for line in list:
    token = line.split()
    tokenized_sentences.append(token)

In [45]:
#bigrams (sequences of 2 words) and collapsed them into a unique term with the underscore symbol

finder = BigramCollocationFinder.from_documents(tokenized_sentences)
bgm = BigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}

In [46]:
collocations

{'undue_influence': 467.9304371002132,
 'extreme_repeated': 154.4571736517017,
 'repeated_cruelty': 75.49634273772205,
 'emphasis_added': 53.04050980882169,
 'habitual_drunkenness': 32.814413655848135,
 'manifest_evidence': 28.597849287671078,
 'methylene_chloride': 26.0,
 'wanton_misconduct': 22.08731732393332,
 'willful_wanton': 19.963953594592127,
 'newly_discovered': 19.145604674796747,
 'reversed_reversed': 18.625734539139895,
 'devise_bequeath': 17.821201192997762,
 'uninsured_motorist': 16.93121693121693,
 'contributory_negligence': 13.932836283405226,
 'emotional_distress': 12.799265284702178,
 'forcible_detainer': 11.71875,
 'unsound_mind': 10.91891516477411,
 'mensa_thoro': 10.0,
 'donative_intent': 8.896326091978265,
 'apart_fault': 8.670891782305388,
 'confidence_reposed': 8.626254804107154,
 'punitive_damages': 8.181206045189287,
 'torture_discommode': 8.1,
 'contrary_manifest': 7.78397366592329,
 'divided_equally': 6.661387328210637,
 'habitual_drunkard': 6.40204752275025

In [47]:
#trigrams (sequences of 3 words) and collapsed them into a unique term with the underscore symbol

finder = TrigramCollocationFinder.from_documents(tokenized_sentences)
bgm = TrigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(trigram): pmi for trigram, pmi in finder.score_ngrams(score)}

In [48]:
collocations

{'blasphemy_debauchery_atheistical': 1.0,
 'debauchery_atheistical_irreligious': 1.0,
 'drowsy_diffident_uncaring': 1.0,
 'endocarditis_aortic_mitral': 1.0,
 'insecticide_fungicide_rodenticide': 1.0,
 'lipoid_adrenal_cortex': 1.0,
 'scattering_melt_scarify': 1.0,
 'soaking_potassium_permanganate': 1.0,
 'verrucous_endocarditis_aortic': 1.0,
 'tubercle_bacillus_organism': 0.6666666666666666,
 'agitate_inhale_fume': 0.5,
 'childishness_orientation_untidiness': 0.5,
 'heresy_dogma_sect': 0.5,
 'hyperemia_subconjunctival_ecchymosis': 0.5,
 'orientation_untidiness_incoherence': 0.5,
 'prolapse_hyperemia_subconjunctival': 0.5,
 'rais_mong_fierce': 0.5,
 'splenitis_fatty_infiltration': 0.5,
 'subsidize_zest_cumulation': 0.5,
 'taciturn_tinge_misanthrope': 0.5,
 'aortic_mitral_mucous': 0.3333333333333333,
 'cystotomy_perivesical_retroperitoneal': 0.3333333333333333,
 'demean_belittle_tattle': 0.3333333333333333,
 'embolus_positioned_statistically': 0.3333333333333333,
 'fetid_purulent_ascendin

### PMI - BURGLARY

In [49]:
#Reload save pkl file
with open('burglary.pkl', 'rb') as f:
    burglary = pickle.load(f)

In [69]:
vectorizer = CountVectorizer(min_df=0.01)
X = vectorizer.fit_transform(burglary.Text)

In [70]:
pmi= pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [71]:
asint = pmi.astype(int)
_pmi = asint.T.dot(asint)

In [72]:
_pmi.values[[np.arange(_pmi.shape[0])]*2] = 0

  _pmi.values[[np.arange(_pmi.shape[0])]*2] = 0


In [73]:
_pmi

Unnamed: 0,terminate,parental,following,evidentiary,consideration,excuse,failure,comply,evidence,convincing,...,lapse,libel,ne,pose,technician,sending,forthwith,restraining,pursuance,swear
terminate,0,20,8,439,261,86,42,39,238,23,...,38,28,230,20,191,46,126,20,274,20
parental,20,0,12,122,77,42,14,4,138,20,...,3,21,115,23,74,15,47,9,113,14
following,8,12,0,95,44,39,4,17,41,9,...,12,13,64,4,37,8,32,8,102,3
evidentiary,439,122,95,0,3631,548,173,162,1383,291,...,148,325,967,396,998,325,992,105,1683,140
consideration,261,77,44,3631,0,345,156,92,967,168,...,75,204,549,238,389,147,530,125,1029,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sending,46,15,8,325,147,74,39,7,220,10,...,3,37,224,102,771,0,107,2,191,8
forthwith,126,47,32,992,530,304,70,64,550,78,...,41,713,415,155,239,107,0,74,1017,20
restraining,20,9,8,105,125,37,4,44,147,7,...,7,32,86,59,12,2,74,0,232,11
pursuance,274,113,102,1683,1029,443,56,62,1917,105,...,100,601,964,99,133,191,1017,232,0,52


In [74]:
get_pmi_value('armed', 'burglary', _pmi)

Unnamed: 0,burglary
armed,66


In [None]:
burglary_occ = pmi(_pmi, positive=True)
burglary_occ

In [None]:
get_pmi_value('', '', burglary_occ)

In [75]:
max_cooccurrences(_pmi)

(terminate        remains
 parental         remains
 following        remains
 evidentiary      remains
 consideration    remains
                   ...   
 sending          remains
 forthwith        remains
 restraining      remains
 pursuance        remains
 swear            remains
 Length: 1839, dtype: object,
 terminate         9552
 parental          1978
 following         3146
 evidentiary      63470
 consideration    37035
                  ...  
 sending           4354
 forthwith        20938
 restraining       4616
 pursuance        40180
 swear             2313
 Length: 1839, dtype: int32)

### Bigrams/Trigrams - Burglary

In [50]:
list = burglary['Text'].tolist()

In [51]:
tokenized_sentences = []
for line in list:
    token = line.split()
    tokenized_sentences.append(token)

In [52]:
#bigrams (sequences of 2 words) and collapsed them into a unique term with the underscore symbol

finder = BigramCollocationFinder.from_documents(tokenized_sentences)
bgm = BigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}

In [53]:
collocations

{'reasonable_doubt': 5633.149181509741,
 'armed_robbery': 1619.9937694219973,
 'emphasis_added': 1458.898255982386,
 'sexual_assault': 1387.4897468439176,
 'plea_guilty': 499.381128897696,
 'aggravation_mitigation': 408.87067543882057,
 'guilty_plea': 397.06519836031293,
 'brutal_heinous': 367.1484593837535,
 'degree_murder': 344.7916834523891,
 'frivolous_patently': 325.3184629601771,
 'closely_balanced': 303.72935818377584,
 'reckless_homicide': 248.75848200287163,
 'emotional_distress': 192.41633575255187,
 'exceptionally_brutal': 177.81475748194015,
 'insulting_provoking': 131.05269319629926,
 'reversed_reversed': 108.81207016793034,
 'improbable_unsatisfactory': 94.51974517476701,
 'anhydrous_ammonia': 92.070796460177,
 'unconstitutionally_vague': 91.62313709902361,
 'manifest_evidence': 86.5942971467671,
 'fatally_defective': 83.20013060159722,
 'punitive_damages': 81.0856602690704,
 'adjudication_wardship': 72.84170869725673,
 'heinous_indicative': 67.19051254089422,
 'newly_dis

In [54]:
#trigrams (sequences of 3 words) and collapsed them into a unique term with the underscore symbol

finder = TrigramCollocationFinder.from_documents(tokenized_sentences)
bgm = TrigramAssocMeasures()
score = bgm.mi_like
collocations = {'_'.join(trigram): pmi for trigram, pmi in finder.score_ngrams(score)}

In [55]:
collocations

{'fabled_torrid_clime': 1.0,
 'handily_masticate_damnable': 1.0,
 'quadruple_quintuple_sextuple': 0.8,
 'caricature_finicking_hairsplitter': 0.5,
 'intercorporate_unscramble_omelet': 0.5,
 'intrapelvic_extraperitoneal_obturator': 0.5,
 'oceanography_astronomy_limnology': 0.5,
 'stile_ponto_plump': 0.5,
 'coeval_aula_emanation': 0.3333333333333333,
 'insecticide_fungicide_rodenticide': 0.3333333333333333,
 'reformer_prohibitionist_apostle': 0.3333333333333333,
 'satiate_avaricious_craving': 0.3333333333333333,
 'stellate_ganglion_blockade': 0.3333333333333333,
 'tantalum_molybdenum_rubidium': 0.3333333333333333,
 'tungsten_tantalum_molybdenum': 0.3333333333333333,
 'concave_concha_antihelix': 0.25,
 'fetid_putrescent_sweepings': 0.25,
 'leopard_ocelot_cheetah': 0.25,
 'ocelot_cheetah_margay': 0.25,
 'percale_shirting_shirting': 0.25,
 'peritoneum_pericardium_pleura': 0.25,
 'scintillating_prosaically_renown': 0.25,
 'sensitively_coldness_ingratitude': 0.25,
 'sinner_handily_masticate': 