## POINTWISE MUTUAL INFORMATION - Bigrams

In [1]:
import spacy
import string
import re
import pandas as pd
import numpy as np
import seaborn as sns
import random
import pickle
from unidecode import unidecode
import nltk
nltk.download('wordnet')
nltk.download('words')
nltk.download('punkt')

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import skipgrams
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords 
from spacy.lang.en import English

from tqdm import tqdm
from gensim import corpora
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.models import CoherenceModel
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures,TrigramCollocationFinder, TrigramAssocMeasures

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## PMI - DRUG

In [3]:
def get_pmi_value(word1, word2, matrix):
    return matrix.loc[matrix.index == word1, [word2]]

def pmi(df, positive=True):
    cols = df.sum(axis=0)
    total = cols.sum()
    rows = df.sum(axis=1)
    expected = np.outer(rows, cols) / total
    df_pmi = df / expected
    with np.errstate(divide='ignore', invalid='ignore'):
        df_pmi = np.log(df_pmi)
    df_pmi[np.array(np.isinf(df_pmi))] = 0.0 
    if positive:
        df_pmi[df_pmi < 0] = 0.0
    return df_pmi

def max_cooccurrences(df_column):
    return df_column.idxmax() , df_column.max()

In [4]:
#Reload save pkl file
with open('drug.pkl', 'rb') as f:
    drug = pickle.load(f)

In [5]:
vectorizer = CountVectorizer(min_df=0.01,  ngram_range=(2,2))
X = vectorizer.fit_transform(drug.Text)

In [6]:
pmi_drug = pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [7]:
asint = pmi_drug.astype(int)
_pmi = asint.T.dot(asint)
_pmi.values[tuple([np.arange(_pmi.shape[0])]*2)] = 0

In [8]:
drug_occ = pmi(_pmi, positive=True)
drug_occ

Unnamed: 0,cocaine undercover,conviction felony,minimum sentence,received sentence,included offense,vacate sentence,guilty reasonable,reasonable doubt,evidence sufficient,sufficient sustain,...,unlawful heroin,evidence unable,presumption innocence,tried guilty,statute require,statute necessary,giving notice,demurrer sustained,guilty armed,different evidence
cocaine undercover,0.000000,0.586224,0.000000,0.301831,0.377331,0.632103,0.000000,0.000000,0.857328,0.114456,...,0.555792,0.102785,0.492235,0.000000,0.473293,0.101957,0.124287,0.000000,0.000000,0.000000
conviction felony,0.586224,0.000000,0.000000,0.000000,2.357859,0.415406,0.000000,0.366408,0.000000,0.000000,...,0.076731,0.000000,0.052395,0.715644,0.000000,1.117404,0.000000,0.000000,0.373166,0.000000
minimum sentence,0.000000,0.000000,0.000000,0.669874,0.000000,0.000000,0.175196,0.000000,0.000000,0.780653,...,0.895086,0.000000,0.000000,0.301856,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
received sentence,0.301831,0.000000,0.669874,0.000000,0.841322,0.000000,0.000000,0.073646,0.000000,0.103989,...,0.000000,0.161311,0.327618,0.000000,0.000000,0.000000,0.182813,0.000000,0.000000,0.000000
included offense,0.377331,2.357859,0.000000,0.841322,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.082660,1.319408,1.066367,0.000000,0.081832,0.000000,0.000000,0.000000,0.202424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
statute necessary,0.101957,1.117404,0.000000,0.000000,0.081832,0.218821,0.000000,0.352144,0.000000,0.000000,...,0.467933,0.000000,0.484419,0.000000,1.116776,0.000000,0.234253,0.000000,0.000000,0.348263
giving notice,0.124287,0.000000,0.000000,0.182813,0.000000,0.513085,0.265450,0.000000,0.644784,0.000000,...,0.705038,0.522763,0.903845,0.631002,0.477507,0.234253,0.000000,0.000000,0.000000,0.000000
demurrer sustained,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.425435,0.000000
guilty armed,0.000000,0.373166,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.425435,0.000000,0.000000


In [9]:
max_cooccurrences(drug_occ)

(cocaine undercover         manifestly erroneous
 conviction felony              included offense
 minimum sentence      improbable unsatisfactory
 received sentence                sought damages
 included offense            discretion evidence
                                 ...            
 statute necessary              offense evidence
 giving notice               seriousness offense
 demurrer sustained                 guilty armed
 guilty armed                 demurrer sustained
 different evidence        testimony prosecution
 Length: 961, dtype: object,
 cocaine undercover    1.991926
 conviction felony     2.357859
 minimum sentence      1.993971
 received sentence     2.114093
 included offense      4.506532
                         ...   
 statute necessary     2.163177
 giving notice         1.727877
 demurrer sustained    3.425435
 guilty armed          3.425435
 different evidence    3.054178
 Length: 961, dtype: float64)

## PMI - WEAPONS

In [10]:
#Reload save pkl file
with open('weapons.pkl', 'rb') as f:
    weapons = pickle.load(f)

In [11]:
vectorizer = CountVectorizer(min_df=0.01, ngram_range=(2,2))
X = vectorizer.fit_transform(weapons.Text)

In [12]:
pmi_weapons = pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [13]:
asint = pmi_weapons.astype(int)
_pmi = asint.T.dot(asint)
_pmi.values[tuple([np.arange(_pmi.shape[0])]*2)] = 0

In [14]:
weapons_occ = pmi(_pmi, positive=True)
weapons_occ

Unnamed: 0,guilty degree,degree murder,proven guilty,guilty reasonable,reasonable doubt,pled guilty,knew knew,determine testimony,testimony evidence,evidence actually,...,later outside,closeness evidence,ineffective objective,simply stated,pocket knife,liable damages,evidence far,intent felony,begun recover,brought evidence
guilty degree,0.000000,0.000000,0.478187,1.020261,0.000000,1.344625,0.047875,0.031695,0.000000,0.417662,...,0.407047,0.346544,0.415702,0.063929,0.350928,0.265894,0.000000,0.000000,0.413592,0.000000
degree murder,0.000000,0.000000,0.000000,0.000000,0.751672,0.616070,0.237692,0.000000,0.000000,0.322642,...,0.516821,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
proven guilty,0.478187,0.000000,0.000000,1.197307,0.000000,0.107978,0.091389,0.000000,0.907721,0.343394,...,0.000000,0.000000,0.133216,0.135615,0.000000,0.337580,0.000000,0.000000,0.148805,0.149933
guilty reasonable,1.020261,0.000000,1.197307,0.000000,0.000000,0.984254,0.361530,0.527672,0.341613,0.000000,...,0.278869,0.000000,0.962972,0.000290,0.000000,0.000000,0.000000,0.000000,0.000000,0.738527
reasonable doubt,0.000000,0.751672,0.000000,0.000000,0.000000,0.043904,0.294015,0.000000,0.000000,0.101639,...,0.012097,0.000000,0.310720,0.000000,0.052605,0.000000,0.211119,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
liable damages,0.265894,0.000000,0.337580,0.000000,0.000000,0.000000,0.000000,0.178770,1.022331,0.734636,...,0.671905,0.000000,0.950543,0.798791,0.623166,0.000000,0.000000,0.000000,0.694198,0.200004
evidence far,0.000000,0.000000,0.000000,0.000000,0.211119,0.000000,0.655084,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.812543,0.000000,0.647235
intent felony,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.018548,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.812543,0.000000,0.000000,0.425398
begun recover,0.413592,0.000000,0.148805,0.000000,0.000000,0.000000,0.000000,0.000000,0.710954,0.000000,...,0.000000,0.000000,0.000000,0.022229,0.166127,0.694198,0.000000,0.000000,0.000000,0.624334


In [15]:
max_cooccurrences(weapons_occ)

(guilty degree                 decision admit
 degree murder         inconsistent statement
 proven guilty                    stated gave
 guilty reasonable             decision admit
 reasonable doubt             noted testimony
                               ...           
 liable damages       credibility credibility
 evidence far                   intent felony
 intent felony              evidence reversal
 begun recover          favorable prosecution
 brought evidence         irrelevant evidence
 Length: 1800, dtype: object,
 guilty degree        2.336831
 degree murder        2.856531
 proven guilty        3.177001
 guilty reasonable    1.999898
 reasonable doubt     1.629223
                        ...   
 liable damages       2.742677
 evidence far         3.812543
 intent felony        3.982414
 begun recover        2.697281
 brought evidence     1.592183
 Length: 1800, dtype: float64)

## PMI - FINANCE

In [16]:
#Reload save pkl file
with open('finance.pkl', 'rb') as f:
    finance = pickle.load(f)

In [17]:
vectorizer = CountVectorizer(min_df=0.01,  ngram_range=(2,2))
X = vectorizer.fit_transform(finance.Text)

In [18]:
pmi_finance= pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [19]:
asint = pmi_finance.astype(int)
_pmi = asint.T.dot(asint)
_pmi.values[tuple([np.arange(_pmi.shape[0])]*2)] = 0

In [20]:
finance_occ = pmi(_pmi, positive=True)
finance_occ

Unnamed: 0,necessary necessary,month month,think sufficient,verdict damages,damages damages,verdict evidence,notwithstanding verdict,contrary manifest,manifest evidence,evidence decision,...,consideration testimony,plea demurrer,demurrer demurrer,absolutely void,giving refusing,prove evidence,brought assumpsit,semi annually,shall lawful,shall satisfaction
necessary necessary,0.000000,0.631943,0.0,0.918322,0.116587,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.078712,0.000000,0.000000,0.316176,0.000000,0.000000,0.000000,0.000000,0.171062
month month,0.631943,0.000000,0.0,0.000000,0.192090,0.314743,0.000000,0.000000,0.000000,0.030840,...,0.000000,0.047070,0.000000,0.300283,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
think sufficient,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.301718,...,0.449459,0.258251,0.446003,0.000000,0.125689,0.000000,0.000000,0.000000,0.000000,0.000000
verdict damages,0.918322,0.000000,0.0,0.000000,0.473934,1.411395,0.613206,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,2.085444,2.856822,2.865653,0.318222,0.104595
damages damages,0.116587,0.192090,0.0,0.473934,0.000000,1.328782,0.584660,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.742577,1.257678,0.757665,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
prove evidence,0.000000,0.000000,0.0,2.085444,0.742577,0.808072,0.389373,0.489442,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.702640,0.000000,0.000000,1.313705,2.540723,0.020281,0.000000
brought assumpsit,0.000000,0.000000,0.0,2.856822,1.257678,1.494445,0.239498,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.313705,0.000000,2.401430,0.860805,0.000000
semi annually,0.000000,0.000000,0.0,2.865653,0.757665,1.474005,1.066356,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,2.540723,2.401430,0.000000,0.000000,0.038951
shall lawful,0.000000,0.000000,0.0,0.318222,0.000000,0.000000,0.370108,0.000000,0.394619,0.000000,...,0.000000,0.000000,0.000000,0.021481,0.000000,0.020281,0.860805,0.000000,0.000000,0.285311


In [21]:
max_cooccurrences(finance_occ)

(necessary necessary    immediately following
 month month                 absence evidence
 think sufficient           evidence contrary
 verdict damages                semi annually
 damages damages               evidence shown
                                ...          
 prove evidence            sustained evidence
 brought assumpsit        knowledge knowledge
 semi annually            knowledge knowledge
 shall lawful                     breach duty
 shall satisfaction           effect evidence
 Length: 490, dtype: object,
 necessary necessary    1.362278
 month month            1.427158
 think sufficient       2.093649
 verdict damages        2.865653
 damages damages        3.204323
                          ...   
 prove evidence         2.571709
 brought assumpsit      3.100245
 semi annually          2.954117
 shall lawful           2.999751
 shall satisfaction     1.509681
 Length: 490, dtype: float64)

## PMI - SEXUAL

In [22]:
#Reload save pkl file
with open('sexual.pkl', 'rb') as f:
    sexual = pickle.load(f)

In [23]:
sexual

Unnamed: 0,Text,crimes
11,overdraft brought stated following deceptive i...,sexual
21,guilty sexual contending erroneous hearsay evi...,sexual
25,brought lawsuit pointed misnomer procedure aff...,sexual
32,following ballot ballot nomination circulator ...,sexual
59,decision discrimination articulated nondiscrim...,sexual
...,...,...
186277,negligence stemming slip fall genuine preclude...,sexual
186288,verdict apportion fault verdict liable verdict...,sexual
186325,slander punitive damages privilege barred puni...,sexual
186359,introduction accordance procedure follow affir...,sexual


In [24]:
vectorizer = CountVectorizer(min_df=0.01 ,ngram_range=(2,2))
X = vectorizer.fit_transform(sexual.Text)

In [25]:
pmi= pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [26]:
asint = pmi.astype(int)
df_pmi = asint.T.dot(asint)

In [27]:
df_pmi.values[tuple([np.arange(sexual_pmi.shape[0])]*2)] = 0

Unnamed: 0,stated following,breach duty,notice stated,stated affirm,expressly stated,address argument,reject argument,insufficient establish,necessary establish,exercise discretion,...,statute unconstitutional,sufficient sustain,sustaining demurrer,testimony tending,duty ordinary,prove evidence,statute consideration,dismissal prejudice,clearly shown,effect effect
stated following,0,10,0,1,3,15,8,3,5,9,...,8,3,25,2,1,8,6,3,6,4
breach duty,10,0,0,0,7,6,10,12,7,12,...,9,3,41,0,7,32,6,1,13,54
notice stated,0,0,0,2,2,3,2,0,0,4,...,1,1,10,0,0,6,0,1,1,0
stated affirm,1,0,2,0,2,1,0,0,0,2,...,4,2,4,3,1,3,8,0,0,0
expressly stated,3,7,2,2,0,12,1,4,4,0,...,9,3,17,2,6,1,2,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
prove evidence,8,32,6,3,1,3,79,27,2,7,...,16,0,42,0,7,0,445,7,1,6882
statute consideration,6,6,0,8,2,19,32,25,2,0,...,23,0,22,0,0,445,0,3,0,1273
dismissal prejudice,3,1,1,0,3,6,0,1,0,0,...,4,3,16,0,1,7,3,0,3,7
clearly shown,6,13,1,0,2,12,8,3,1,1,...,9,2,26,0,0,1,0,3,0,1


In [37]:
sexual_occ = pmi(df_pmi)
sexual_occ

Unnamed: 0,stated following,breach duty,notice stated,stated affirm,expressly stated,address argument,reject argument,insufficient establish,necessary establish,exercise discretion,...,statute unconstitutional,sufficient sustain,sustaining demurrer,testimony tending,duty ordinary,prove evidence,statute consideration,dismissal prejudice,clearly shown,effect effect
stated following,0.0,0.000000,0.000000,0.000000,0.000000,0.548325,0.270604,0.000000,0.410730,0.273584,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
breach duty,0.0,0.000000,0.000000,0.000000,0.174690,0.000000,0.277671,0.698394,0.531126,0.345189,...,0.000000,0.000000,0.184148,0.000000,0.307791,0.000000,0.000000,0.000000,0.411651,0.000000
notice stated,0.0,0.000000,0.000000,0.943521,0.166919,0.000000,0.000000,0.000000,0.000000,0.491569,...,0.000000,0.000000,0.018153,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
stated affirm,0.0,0.000000,0.943521,0.000000,0.452596,0.000000,0.000000,0.000000,0.000000,0.084099,...,0.606970,0.582640,0.000000,1.549535,0.000000,0.000000,0.544645,0.000000,0.000000,0.000000
expressly stated,0.0,0.174690,0.166919,0.452596,0.000000,0.863173,0.000000,0.353849,0.725578,0.000000,...,0.641299,0.211504,0.057856,0.367468,0.907708,0.000000,0.000000,0.447994,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
prove evidence,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,1.026304,0.191094,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.714379,0.000000,0.000000,3.389304
statute consideration,0.0,0.000000,0.000000,0.544645,0.000000,0.028460,0.900644,0.892185,0.000000,0.000000,...,0.285323,0.000000,0.000000,0.000000,0.000000,1.714379,0.000000,0.000000,0.000000,2.479823
dismissal prejudice,0.0,0.000000,0.000000,0.000000,0.447994,0.536560,0.000000,0.000000,0.000000,0.000000,...,0.196903,0.578038,0.363766,0.000000,0.000000,0.000000,0.000000,0.000000,0.065916,0.000000
clearly shown,0.0,0.411651,0.000000,0.000000,0.000000,0.481095,0.426517,0.000000,0.000000,0.000000,...,0.259221,0.000000,0.100661,0.000000,0.000000,0.000000,0.000000,0.065916,0.000000,0.000000


In [None]:
max_cooccurrences(sexual_occ)

## PMI - DIVORCE

In [38]:
#Reload save pkl file
with open('divorce.pkl', 'rb') as f:
    divorce = pickle.load(f)

In [39]:
vectorizer = CountVectorizer(min_df=0.01, ngram_range=(2,2))
X = vectorizer.fit_transform(divorce.Text)

In [40]:
pmi= pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [41]:
asint = pmi.astype(int)
_pmi = asint.T.dot(asint)
_pmi.values[tuple([np.arange(_pmi.shape[0])]*2)] = 0

In [49]:
divorce_occ = pmi(_pmi, positive=True)
divorce_occ

Unnamed: 0,according evidence,decision decision,decision evidence,evidence insufficient,evidence contrary,contrary manifest,manifest evidence,shall shall,ordinary meaning,affidavit affidavit,...,sustaining demurrer,clearly expressed,competent testimony,shall testator,entirely different,testator gave,hear determine,leaving leaving,making appropriation,appropriation provided
according evidence,0.000000,0.472590,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.419634,...,0.000000,0.000000,0.000000,0.000000,0.975600,0.000000,0.558421,0.000000,1.036476,0.000000
decision decision,0.472590,0.000000,0.000000,0.843213,0.153241,0.000000,0.518954,0.637081,0.863228,0.000000,...,0.000000,0.117624,0.364999,0.000000,1.187643,0.316659,0.000000,0.000000,0.000000,0.000000
decision evidence,0.000000,0.000000,0.000000,0.048650,0.000000,0.000000,0.823003,1.634277,0.000000,0.000000,...,0.000000,2.031111,2.614959,3.731623,2.051308,0.000000,0.669048,1.194658,0.000000,0.000000
evidence insufficient,0.000000,0.843213,0.048650,0.000000,0.822646,0.759182,0.000000,0.795660,0.000000,1.831711,...,0.000000,0.276203,0.000000,0.000000,0.247610,0.000000,1.216726,0.000000,0.000000,0.000000
evidence contrary,0.000000,0.153241,0.000000,0.822646,0.000000,0.000000,0.680709,1.204301,1.024983,0.000000,...,0.000000,1.195670,0.000000,0.000000,0.656251,0.000000,0.000000,1.745511,0.023980,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
testator gave,0.000000,0.316659,0.000000,0.000000,0.000000,0.000000,0.000000,0.269106,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.039174,1.735959,0.000000,0.000000,0.000000,0.000000,0.000000
hear determine,0.558421,0.000000,0.669048,1.216726,0.000000,0.974116,0.892467,0.000000,1.103210,0.000000,...,1.018587,0.000000,0.000000,0.000000,0.868009,0.000000,0.000000,0.858657,0.000000,0.000000
leaving leaving,0.000000,0.000000,1.194658,0.000000,1.745511,0.000000,0.000000,0.437591,0.886882,0.837653,...,0.000000,0.611281,0.000000,0.900807,0.988154,0.000000,0.858657,0.000000,0.355883,0.009676
making appropriation,1.036476,0.000000,0.000000,0.000000,0.023980,0.471341,0.000000,0.000000,0.000000,0.000000,...,1.208960,0.000000,0.000000,0.000000,0.365235,0.000000,0.000000,0.355883,0.000000,0.000000


In [50]:
max_cooccurrences(divorce_occ)

(according evidence        sustained demurrer
 decision decision           affidavit stated
 decision evidence             shall testator
 evidence insufficient           void statute
 evidence contrary            statute require
                                  ...        
 testator gave               affidavit stated
 hear determine            sustained demurrer
 leaving leaving           testator remainder
 making appropriation        affidavit stated
 appropriation provided       shown testimony
 Length: 441, dtype: object,
 according evidence        65
 decision decision         17
 decision evidence         41
 evidence insufficient     72
 evidence contrary         26
                           ..
 testator gave             39
 hear determine            69
 leaving leaving           27
 making appropriation      22
 appropriation provided    80
 Length: 441, dtype: int32)

### PMI - BURGLARY

In [51]:
#Reload save pkl file
with open('burglary.pkl', 'rb') as f:
    burglary = pickle.load(f)

In [52]:
vectorizer = CountVectorizer(min_df=0.01,  ngram_range=(2,2))
X = vectorizer.fit_transform(burglary.Text)

In [53]:
pmi= pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [54]:
asint = pmi.astype(int)
_pmi = asint.T.dot(asint)
_pmi.values[tuple([np.arange(_pmi.shape[0])]*2)] = 0

In [57]:
burglary_occ = pmi(_pmi, positive=True)
burglary_occ

Unnamed: 0,failure comply,convincing evidence,sexual conduct,reasonable basis,contrary manifest,manifest evidence,evidence believe,accordingly conclude,conduct offense,evidence reasonable,...,evidence failure,accused evidence,carefully considered,intent intent,penalty provided,sentence murder,giving refusing,tending prove,burglary larceny,guilty larceny
failure comply,0.000000,1.005310,0.000000,0.405868,0.000000,0.000000,0.360203,0.000000,0.000000,0.391887,...,0.000000,0.000000,0.412784,0.000000,0.62479,0.000000,0.424281,0.000000,0.000000,0.262085
convincing evidence,1.005310,0.000000,0.000000,0.781201,0.000000,0.328080,0.216742,0.000000,0.000000,0.000000,...,0.144914,0.000000,0.396075,0.739154,0.78271,0.492442,0.000000,0.000000,0.000000,0.000000
sexual conduct,0.000000,0.000000,0.000000,0.793828,0.000000,0.000000,0.000000,0.286459,0.472569,0.000000,...,0.294117,0.266764,0.109573,0.000000,0.00000,0.000000,0.248552,0.000000,0.000000,0.000000
reasonable basis,0.405868,0.781201,0.793828,0.000000,0.000000,0.383564,0.118075,0.000000,0.509347,0.000000,...,0.151608,0.000000,0.000000,0.000000,0.00000,0.000000,0.113161,0.000000,0.000000,0.000000
contrary manifest,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.126883,0.000000,0.000000,...,0.073117,0.350503,0.000000,0.000000,0.00000,0.226489,0.000000,1.021598,1.372588,0.648097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sentence murder,0.000000,0.492442,0.000000,0.000000,0.226489,0.094805,0.256760,0.447243,0.066437,0.000000,...,0.818360,0.548261,0.000000,0.000000,0.00000,0.000000,0.000000,0.035331,0.086413,1.597066
giving refusing,0.424281,0.000000,0.248552,0.113161,0.000000,0.000000,0.381611,0.000000,0.000000,0.000000,...,0.000000,0.000000,1.563280,0.000000,0.90960,0.000000,0.000000,0.000000,0.000000,0.554312
tending prove,0.000000,0.000000,0.000000,0.000000,1.021598,0.000000,0.000000,0.000000,0.105485,0.000000,...,0.000000,1.181661,0.000000,0.000000,0.00000,0.035331,0.000000,0.000000,2.020048,0.000000
burglary larceny,0.000000,0.000000,0.000000,0.000000,1.372588,0.000000,0.000000,0.040416,0.000000,0.000000,...,0.474592,1.223750,0.000000,0.000000,0.00000,0.086413,0.000000,2.020048,0.000000,0.200827


In [None]:
max_cooccurrences(_pmi)