## POINTWISE MUTUAL INFORMATION - Bigrams Matrix


In [1]:
import spacy
import string
import re
import pandas as pd
import numpy as np
import seaborn as sns
import random
import pickle
from unidecode import unidecode
import nltk
nltk.download('wordnet')
nltk.download('words')
nltk.download('punkt')

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import skipgrams
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords 
from spacy.lang.en import English

from tqdm import tqdm
from gensim import corpora
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.models import CoherenceModel
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures,TrigramCollocationFinder, TrigramAssocMeasures

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import warnings
# Suppress DeprecationWarning messages for the remainder of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

## PMI - DRUG

In [3]:
# Reload the saved pickle file into `drug`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load files that come from a trusted source.
with open('drug.pkl', 'rb') as f:
    drug = pickle.load(f)

In [4]:
def get_pmi_value(word1, word2, matrix):
    """Look up the entry of `matrix` at row label `word1`, column `word2`.

    The result is a DataFrame (not a scalar): it keeps every row whose index
    label equals `word1` and the single column `word2`. It is empty when no
    row label matches.
    """
    row_mask = matrix.index == word1
    return matrix.loc[row_mask, [word2]]

def pmi(df, positive=True):
    """Compute the pointwise mutual information of a co-occurrence matrix.

    Each cell is ``log(observed / expected)`` (natural log), where
    ``expected = row_total * column_total / grand_total``.

    Parameters
    ----------
    df : DataFrame of non-negative co-occurrence counts.
    positive : if True (default), negative scores are clipped to 0
        (positive PMI).

    Returns
    -------
    A new object of the same shape and labels as `df`; `df` is not modified.
    Cells whose score is undefined -- ``log(0)`` for pairs that never
    co-occur, or ``0/0`` for rows/columns whose total is zero -- are set
    to 0.0.
    """
    cols = df.sum(axis=0)
    rows = df.sum(axis=1)
    total = cols.sum()
    # 'divide' covers log(0) and x/0; 'invalid' covers the 0/0 cells that
    # arise when a row, a column or the whole matrix sums to zero.
    with np.errstate(divide='ignore', invalid='ignore'):
        expected = np.outer(rows, cols) / total
        df_pmi = np.log(df / expected)
    # Reset every undefined score (-inf, +inf and NaN), not only the
    # infinities, so NaN never leaks into later max/idxmax calls.
    df_pmi[np.array(~np.isfinite(df_pmi))] = 0.0
    if positive:
        df_pmi[df_pmi < 0] = 0.0
    return df_pmi

def max_cooccurrences(df_column):
    """Return ``(label_of_maximum, maximum)`` for a Series or DataFrame.

    For a Series this is the index label of the largest value and the value
    itself. For a DataFrame both elements are Series keyed by column: the
    row label holding each column's maximum, and that maximum.
    """
    strongest_label = df_column.idxmax()
    strongest_value = df_column.max()
    return strongest_label, strongest_value

In [6]:
vectorizer = CountVectorizer(min_df=0.01,  ngram_range=(2,2))
X = vectorizer.fit_transform(drug.Text)

In [7]:
pmi_drug = pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [8]:
asint = pmi_drug.astype(int)
drug_pmi = asint.T.dot(asint)

In [9]:
drug_pmi.values[[np.arange(drug_pmi.shape[0])]*2] = 0

  drug_pmi.values[[np.arange(drug_pmi.shape[0])]*2] = 0


In [10]:
# Display the bigram co-occurrence count matrix (its diagonal was zeroed in the previous cell).
drug_pmi

Unnamed: 0,cocaine undercover,conviction felony,minimum sentence,received sentence,included offense,vacate sentence,guilty reasonable,reasonable doubt,evidence sufficient,sufficient sustain,...,unlawful heroin,evidence unable,presumption innocence,tried guilty,statute require,statute necessary,giving notice,demurrer sustained,guilty armed,different evidence
cocaine undercover,0,7,33,12,4,18,1,22,28,15,...,39,14,15,3,61,7,16,0,0,14
conviction felony,7,0,5,1,6,3,0,10,2,2,...,5,0,2,3,3,4,2,0,7,3
minimum sentence,33,5,0,38,3,17,24,60,24,64,...,120,17,17,21,64,11,29,43,16,20
received sentence,12,1,38,0,3,3,3,17,4,7,...,9,7,6,1,15,1,8,3,2,4
included offense,4,6,3,3,0,0,0,4,0,1,...,0,2,5,3,4,1,2,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
statute necessary,7,4,11,1,1,4,3,16,1,0,...,12,2,5,2,39,0,6,2,6,8
giving notice,16,2,29,8,2,12,9,16,17,10,...,34,16,17,10,46,6,0,8,3,7
demurrer sustained,0,0,43,3,0,1,8,15,13,16,...,0,0,0,0,2,2,8,0,1117,11
guilty armed,0,7,16,2,0,4,3,11,8,5,...,0,0,0,3,0,6,3,1117,0,9


In [11]:
# Convert the co-occurrence counts to positive PMI (negative scores are clipped to 0).
drug_occ = pmi(drug_pmi, positive=True)
drug_occ

Unnamed: 0,cocaine undercover,conviction felony,minimum sentence,received sentence,included offense,vacate sentence,guilty reasonable,reasonable doubt,evidence sufficient,sufficient sustain,...,unlawful heroin,evidence unable,presumption innocence,tried guilty,statute require,statute necessary,giving notice,demurrer sustained,guilty armed,different evidence
cocaine undercover,0.000000,0.586224,0.000000,0.301831,0.377331,0.632103,0.000000,0.000000,0.857328,0.114456,...,0.555792,0.102785,0.492235,0.000000,0.473293,0.101957,0.124287,0.000000,0.000000,0.000000
conviction felony,0.586224,0.000000,0.000000,0.000000,2.357859,0.415406,0.000000,0.366408,0.000000,0.000000,...,0.076731,0.000000,0.052395,0.715644,0.000000,1.117404,0.000000,0.000000,0.373166,0.000000
minimum sentence,0.000000,0.000000,0.000000,0.669874,0.000000,0.000000,0.175196,0.000000,0.000000,0.780653,...,0.895086,0.000000,0.000000,0.301856,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
received sentence,0.301831,0.000000,0.669874,0.000000,0.841322,0.000000,0.000000,0.073646,0.000000,0.103989,...,0.000000,0.161311,0.327618,0.000000,0.000000,0.000000,0.182813,0.000000,0.000000,0.000000
included offense,0.377331,2.357859,0.000000,0.841322,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.082660,1.319408,1.066367,0.000000,0.081832,0.000000,0.000000,0.000000,0.202424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
statute necessary,0.101957,1.117404,0.000000,0.000000,0.081832,0.218821,0.000000,0.352144,0.000000,0.000000,...,0.467933,0.000000,0.484419,0.000000,1.116776,0.000000,0.234253,0.000000,0.000000,0.348263
giving notice,0.124287,0.000000,0.000000,0.182813,0.000000,0.513085,0.265450,0.000000,0.644784,0.000000,...,0.705038,0.522763,0.903845,0.631002,0.477507,0.234253,0.000000,0.000000,0.000000,0.000000
demurrer sustained,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.425435,0.000000
guilty armed,0.000000,0.373166,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.425435,0.000000,0.000000


In [13]:
# For every bigram (column): the row label with the highest PMI, and that PMI value.
max_cooccurrences(drug_occ)

(cocaine undercover         manifestly erroneous
 conviction felony              included offense
 minimum sentence      improbable unsatisfactory
 received sentence                sought damages
 included offense            discretion evidence
                                 ...            
 statute necessary              offense evidence
 giving notice               seriousness offense
 demurrer sustained                 guilty armed
 guilty armed                 demurrer sustained
 different evidence        testimony prosecution
 Length: 961, dtype: object,
 cocaine undercover    1.991926
 conviction felony     2.357859
 minimum sentence      1.993971
 received sentence     2.114093
 included offense      4.506532
                         ...   
 statute necessary     2.163177
 giving notice         1.727877
 demurrer sustained    3.425435
 guilty armed          3.425435
 different evidence    3.054178
 Length: 961, dtype: float64)

## PMI - WEAPONS

In [14]:
# Reload the saved pickle file into `weapons`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load files that come from a trusted source.
with open('weapons.pkl', 'rb') as f:
    weapons = pickle.load(f)

In [15]:
vectorizer = CountVectorizer(min_df=0.01, ngram_range=(2,2))
X = vectorizer.fit_transform(weapons.Text)

In [16]:
pmi_weapons = pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [17]:
asint = pmi_weapons.astype(int)
weapons_pmi = asint.T.dot(asint)

In [18]:
weapons_pmi.values[[np.arange(weapons_pmi.shape[0])]*2] = 0

  weapons_pmi.values[[np.arange(weapons_pmi.shape[0])]*2] = 0


In [20]:
# Convert the co-occurrence counts to positive PMI (negative scores are clipped to 0).
weapons_occ = pmi(weapons_pmi, positive=True)
weapons_occ

Unnamed: 0,guilty degree,degree murder,proven guilty,guilty reasonable,reasonable doubt,pled guilty,knew knew,determine testimony,testimony evidence,evidence actually,...,later outside,closeness evidence,ineffective objective,simply stated,pocket knife,liable damages,evidence far,intent felony,begun recover,brought evidence
guilty degree,0.000000,0.000000,0.478187,1.020261,0.000000,1.344625,0.047875,0.031695,0.000000,0.417662,...,0.407047,0.346544,0.415702,0.063929,0.350928,0.265894,0.000000,0.000000,0.413592,0.000000
degree murder,0.000000,0.000000,0.000000,0.000000,0.751672,0.616070,0.237692,0.000000,0.000000,0.322642,...,0.516821,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
proven guilty,0.478187,0.000000,0.000000,1.197307,0.000000,0.107978,0.091389,0.000000,0.907721,0.343394,...,0.000000,0.000000,0.133216,0.135615,0.000000,0.337580,0.000000,0.000000,0.148805,0.149933
guilty reasonable,1.020261,0.000000,1.197307,0.000000,0.000000,0.984254,0.361530,0.527672,0.341613,0.000000,...,0.278869,0.000000,0.962972,0.000290,0.000000,0.000000,0.000000,0.000000,0.000000,0.738527
reasonable doubt,0.000000,0.751672,0.000000,0.000000,0.000000,0.043904,0.294015,0.000000,0.000000,0.101639,...,0.012097,0.000000,0.310720,0.000000,0.052605,0.000000,0.211119,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
liable damages,0.265894,0.000000,0.337580,0.000000,0.000000,0.000000,0.000000,0.178770,1.022331,0.734636,...,0.671905,0.000000,0.950543,0.798791,0.623166,0.000000,0.000000,0.000000,0.694198,0.200004
evidence far,0.000000,0.000000,0.000000,0.000000,0.211119,0.000000,0.655084,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.812543,0.000000,0.647235
intent felony,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.018548,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.812543,0.000000,0.000000,0.425398
begun recover,0.413592,0.000000,0.148805,0.000000,0.000000,0.000000,0.000000,0.000000,0.710954,0.000000,...,0.000000,0.000000,0.000000,0.022229,0.166127,0.694198,0.000000,0.000000,0.000000,0.624334


In [21]:
# For every bigram (column): the row label with the highest PMI, and that PMI value.
max_cooccurrences(weapons_occ)

(guilty degree                 decision admit
 degree murder         inconsistent statement
 proven guilty                    stated gave
 guilty reasonable             decision admit
 reasonable doubt             noted testimony
                               ...           
 liable damages       credibility credibility
 evidence far                   intent felony
 intent felony              evidence reversal
 begun recover          favorable prosecution
 brought evidence         irrelevant evidence
 Length: 1800, dtype: object,
 guilty degree        2.336831
 degree murder        2.856531
 proven guilty        3.177001
 guilty reasonable    1.999898
 reasonable doubt     1.629223
                        ...   
 liable damages       2.742677
 evidence far         3.812543
 intent felony        3.982414
 begun recover        2.697281
 brought evidence     1.592183
 Length: 1800, dtype: float64)

## PMI - FINANCE

In [62]:
# Reload the saved pickle file into `finance`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load files that come from a trusted source.
with open('finance.pkl', 'rb') as f:
    finance = pickle.load(f)

In [63]:
vectorizer = CountVectorizer(min_df=0.01,  ngram_range=(2,2))
X = vectorizer.fit_transform(finance.Text)

In [64]:
pmi_finance= pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [65]:
asint = pmi_finance.astype(int)
finance_pmi = asint.T.dot(asint)

In [66]:
finance_pmi.values[[np.arange(finance_pmi.shape[0])]*2] = 0

  finance_pmi.values[[np.arange(finance_pmi.shape[0])]*2] = 0


In [67]:
# Display the bigram co-occurrence count matrix (its diagonal was zeroed in the previous cell).
finance_pmi

Unnamed: 0,necessary necessary,month month,think sufficient,verdict damages,damages damages,verdict evidence,notwithstanding verdict,contrary manifest,manifest evidence,evidence decision,...,consideration testimony,plea demurrer,demurrer demurrer,absolutely void,giving refusing,prove evidence,brought assumpsit,semi annually,shall lawful,shall satisfaction
necessary necessary,0,20,5,13,6,3,3,4,10,4,...,5,33,13,6,31,6,3,3,7,5
month month,20,0,32,10,17,13,11,24,41,14,...,18,84,28,23,59,13,4,11,18,10
think sufficient,5,32,0,1,10,1,2,24,55,63,...,51,131,69,17,85,3,0,6,4,4
verdict damages,13,10,1,0,11,19,10,6,17,2,...,2,14,2,5,17,67,101,104,14,6
damages damages,6,17,10,11,0,18,10,10,24,0,...,4,9,7,5,7,18,21,13,9,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
prove evidence,6,13,3,67,18,13,10,24,28,3,...,7,5,2,21,18,0,27,94,13,3
brought assumpsit,3,4,0,101,21,18,6,1,13,2,...,2,10,9,1,7,27,0,57,21,2
semi annually,3,11,6,104,13,18,14,3,10,2,...,6,1,3,2,10,94,57,0,5,5
shall lawful,7,18,4,14,9,2,12,11,54,0,...,4,58,0,13,44,13,21,5,0,11


In [69]:
# For every bigram (column): the row label with the largest value, and that value.
# NOTE(review): finance_pmi holds raw co-occurrence counts (the output below
# is integer-typed); pmi() is not applied in this section, so this ranks by
# count rather than by PMI -- confirm that this is intended.
max_cooccurrences(finance_pmi)

(necessary necessary         brief argument
 month month            evidence contention
 think sufficient            shall continue
 verdict damages             brief argument
 damages damages             evidence shown
                               ...         
 prove evidence              brief argument
 brought assumpsit          receipt receipt
 semi annually               brief argument
 shall lawful                   breach duty
 shall satisfaction     evidence contention
 Length: 490, dtype: object,
 necessary necessary    105
 month month            409
 think sufficient       707
 verdict damages        163
 damages damages        315
                       ... 
 prove evidence         292
 brought assumpsit      185
 semi annually          144
 shall lawful           494
 shall satisfaction     200
 Length: 490, dtype: int32)

## PMI - SEXUAL

In [70]:
# Reload the saved pickle file into `sexual`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load files that come from a trusted source.
with open('sexual.pkl', 'rb') as f:
    sexual = pickle.load(f)

In [71]:
vectorizer = CountVectorizer(min_df=0.01 ,ngram_range=(2,2))
X = vectorizer.fit_transform(sexual.Text)

In [72]:
pmi= pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [73]:
asint = pmi.astype(int)
_pmi = asint.T.dot(asint)

In [74]:
_pmi.values[[np.arange(_pmi.shape[0])]*2] = 0

  _pmi.values[[np.arange(_pmi.shape[0])]*2] = 0


In [75]:
# Display the bigram co-occurrence count matrix (its diagonal was zeroed in the previous cell).
_pmi

Unnamed: 0,stated following,breach duty,notice stated,stated affirm,expressly stated,address argument,reject argument,insufficient establish,necessary establish,exercise discretion,...,statute unconstitutional,sufficient sustain,sustaining demurrer,testimony tending,duty ordinary,prove evidence,statute consideration,dismissal prejudice,clearly shown,effect effect
stated following,0,10,0,1,3,15,8,3,5,9,...,8,3,25,2,1,8,6,3,6,4
breach duty,10,0,0,0,7,6,10,12,7,12,...,9,3,41,0,7,32,6,1,13,54
notice stated,0,0,0,2,2,3,2,0,0,4,...,1,1,10,0,0,6,0,1,1,0
stated affirm,1,0,2,0,2,1,0,0,0,2,...,4,2,4,3,1,3,8,0,0,0
expressly stated,3,7,2,2,0,12,1,4,4,0,...,9,3,17,2,6,1,2,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
prove evidence,8,32,6,3,1,3,79,27,2,7,...,16,0,42,0,7,0,445,7,1,6882
statute consideration,6,6,0,8,2,19,32,25,2,0,...,23,0,22,0,0,445,0,3,0,1273
dismissal prejudice,3,1,1,0,3,6,0,1,0,0,...,4,3,16,0,1,7,3,0,3,7
clearly shown,6,13,1,0,2,12,8,3,1,1,...,9,2,26,0,0,1,0,3,0,1


In [76]:
# For every bigram (column): the row label with the largest value, and that value.
# NOTE(review): _pmi holds raw co-occurrence counts (the output below is
# integer-typed); pmi() is not applied in this section, so this ranks by
# count rather than by PMI -- confirm that this is intended.
max_cooccurrences(_pmi)

(stated following            want prosecution
 breach duty                       shall hold
 notice stated            testimony establish
 stated affirm                shall necessary
 expressly stated                  shall hold
                                 ...         
 prove evidence                 effect effect
 statute consideration    remainder remainder
 dismissal prejudice         want prosecution
 clearly shown                     shall hold
 effect effect                 prove evidence
 Length: 588, dtype: object,
 stated following          124
 breach duty               176
 notice stated             104
 stated affirm              38
 expressly stated          124
                          ... 
 prove evidence           6882
 statute consideration    1296
 dismissal prejudice        73
 clearly shown             285
 effect effect            6882
 Length: 588, dtype: int32)

## PMI - DIVORCE

In [77]:
# Reload the saved pickle file into `divorce`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load files that come from a trusted source.
with open('divorce.pkl', 'rb') as f:
    divorce = pickle.load(f)

In [78]:
vectorizer = CountVectorizer(min_df=0.01, ngram_range=(2,2))
X = vectorizer.fit_transform(divorce.Text)

In [79]:
pmi= pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [80]:
asint = pmi.astype(int)
_pmi = asint.T.dot(asint)

In [81]:
_pmi.values[[np.arange(_pmi.shape[0])]*2] = 0

  _pmi.values[[np.arange(_pmi.shape[0])]*2] = 0


In [82]:
# Display the bigram co-occurrence count matrix (its diagonal was zeroed in the previous cell).
_pmi

Unnamed: 0,according evidence,decision decision,decision evidence,evidence insufficient,evidence contrary,contrary manifest,manifest evidence,shall shall,ordinary meaning,affidavit affidavit,...,sustaining demurrer,clearly expressed,competent testimony,shall testator,entirely different,testator gave,hear determine,leaving leaving,making appropriation,appropriation provided
according evidence,0,2,1,2,1,0,0,0,1,4,...,4,2,1,0,4,0,3,0,4,0
decision decision,2,0,0,3,1,0,1,1,4,0,...,1,1,1,0,2,1,0,0,0,0
decision evidence,1,0,0,2,0,0,2,4,1,1,...,0,10,14,41,7,1,2,3,0,1
evidence insufficient,2,3,2,0,5,3,1,3,4,17,...,0,3,1,0,2,0,6,0,0,0
evidence contrary,1,1,0,5,0,0,2,3,8,1,...,0,5,0,1,2,1,1,6,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
testator gave,0,1,1,0,1,0,0,1,2,1,...,0,1,0,1,5,0,1,0,0,0
hear determine,3,0,2,6,1,2,2,0,7,1,...,7,1,0,0,2,1,0,2,0,0
leaving leaving,0,0,3,0,6,0,0,1,5,3,...,0,2,0,2,2,0,2,0,1,1
making appropriation,4,0,0,0,1,1,0,0,1,0,...,7,0,0,0,1,0,0,1,0,0


In [83]:
# For every bigram (column): the row label with the largest value, and that value.
# NOTE(review): _pmi holds raw co-occurrence counts (the output below is
# integer-typed); pmi() is not applied in this section, so this ranks by
# count rather than by PMI -- confirm that this is intended.
max_cooccurrences(_pmi)

(according evidence        sustained demurrer
 decision decision           affidavit stated
 decision evidence             shall testator
 evidence insufficient           void statute
 evidence contrary            statute require
                                  ...        
 testator gave               affidavit stated
 hear determine            sustained demurrer
 leaving leaving           testator remainder
 making appropriation        affidavit stated
 appropriation provided       shown testimony
 Length: 441, dtype: object,
 according evidence        65
 decision decision         17
 decision evidence         41
 evidence insufficient     72
 evidence contrary         26
                           ..
 testator gave             39
 hear determine            69
 leaving leaving           27
 making appropriation      22
 appropriation provided    80
 Length: 441, dtype: int32)

## PMI - BURGLARY

In [91]:
# Reload the saved pickle file into `burglary`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load files that come from a trusted source.
with open('burglary.pkl', 'rb') as f:
    burglary = pickle.load(f)

In [92]:
vectorizer = CountVectorizer(min_df=0.01,  ngram_range=(2,2))
X = vectorizer.fit_transform(burglary.Text)

In [93]:
pmi= pd.DataFrame(X.todense(), columns=vectorizer.vocabulary_)

In [94]:
asint = pmi.astype(int)
_pmi = asint.T.dot(asint)

In [95]:
_pmi.values[[np.arange(_pmi.shape[0])]*2] = 0

  _pmi.values[[np.arange(_pmi.shape[0])]*2] = 0


In [96]:
# Display the bigram co-occurrence count matrix (its diagonal was zeroed in the previous cell).
_pmi

Unnamed: 0,failure comply,convincing evidence,sexual conduct,reasonable basis,contrary manifest,manifest evidence,evidence believe,accordingly conclude,conduct offense,evidence reasonable,...,evidence failure,accused evidence,carefully considered,intent intent,penalty provided,sentence murder,giving refusing,tending prove,burglary larceny,guilty larceny
failure comply,0,11,22,10,2,5,28,8,11,12,...,5,6,37,61,29,8,12,16,10,13
convincing evidence,11,0,9,6,5,4,10,1,4,3,...,6,2,15,58,14,6,2,13,27,3
sexual conduct,22,9,0,41,21,15,53,39,55,19,...,47,53,76,90,27,12,28,164,124,23
reasonable basis,10,6,41,0,2,7,15,6,14,5,...,10,3,10,31,7,3,6,10,9,1
contrary manifest,2,5,21,2,0,0,0,15,9,9,...,17,26,2,20,2,14,0,410,344,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sentence murder,8,6,12,3,14,7,23,15,12,1,...,26,23,10,53,9,0,1,111,69,45
giving refusing,12,2,28,6,0,3,23,7,7,5,...,4,4,94,37,31,1,0,23,0,14
tending prove,16,13,164,10,410,3,8,91,165,33,...,143,573,1,68,1,111,23,0,6309,114
burglary larceny,10,27,124,9,344,0,10,78,26,24,...,144,353,1,53,1,69,0,6309,0,87


In [97]:
# For every bigram (column): the row label with the largest value, and that value.
# NOTE(review): _pmi holds raw co-occurrence counts (the output below is
# integer-typed); pmi() is not applied in this section, so this ranks by
# count rather than by PMI -- confirm that this is intended.
max_cooccurrences(_pmi)

(failure comply             reaching decision
 convincing evidence        reaching decision
 sexual conduct             reaching decision
 reasonable basis           reaching decision
 contrary manifest      conflicting testimony
                                ...          
 sentence murder               drawn evidence
 giving refusing            reaching decision
 tending prove          conflicting testimony
 burglary larceny       conflicting testimony
 guilty larceny         conflicting testimony
 Length: 767, dtype: object,
 failure comply           505
 convincing evidence      176
 sexual conduct          1179
 reasonable basis         334
 contrary manifest       2075
                        ...  
 sentence murder          440
 giving refusing          483
 tending prove          37746
 burglary larceny       15026
 guilty larceny           603
 Length: 767, dtype: int32)