In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
import os


In [4]:
corpus_path = r"D:\Files\preprocessed_data12.txt"
with open(corpus_path, 'r', encoding='utf-8') as file:
    corpus = file.readlines()


In [3]:
tokenized_corpus = [simple_preprocess(doc) for doc in corpus]


### Training the Model

In [4]:
tagged_data = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(tokenized_corpus)]


In [5]:
vector_size = 200
window = 5
min_count = 1
epochs = 20
model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, epochs=epochs)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)


#### In the above code I have taken vector size to 200 it can also be 100

In [6]:
model_save_path = os.path.join(os.path.dirname(corpus_path), 'doc2vec_model12_test')
model.save(model_save_path)

# --------------------------------------------

In [5]:
from os import listdir
from os.path import isfile, join
from gensim.models.doc2vec import Doc2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity


In [6]:
model_save_path = os.path.join(os.path.dirname(corpus_path), 'doc2vec_model12_test')
model = Doc2Vec.load(model_save_path)


In [7]:
def get_document_vector(model, doc_path):
    with open(doc_path, 'r', encoding='utf-8') as file:
        doc_text = file.read()
    return model.infer_vector(simple_preprocess(doc_text))


In [8]:
test_file = ['1992_47' , '1992_76',
             '1992_76' , '1992_182',
             '1972_11' , '1984_115',
             '1969_57' , '1980_91',
             '1959_151' , '1982_28',
             '1976_200' , '1959_151',
             '1985_114' , '1959_151',
             '1966_236' , '1967_267',
             '1961_34' , '1979_110',
             '1961_34' , '1987_37',
             '1992_47' , '1987_315',
             '1971_138' , '1992_47',
             '1992_47' , '1992_76',
             '1984_115' , '1987_315',
             '1983_129' , '1983_27',
             '1979_110' , '1953_28',
             '1963_170' , '1979_158',
             '1983_27' , '1983_37',
             '1983_27' , '1979_33',
             '1984_115' , '1981_49',
             '1979_110' , '1989_233',
             '1983_129' , '1976_176',
             '1971_111' , '1972_291',
             '1990_171' , '1988_88',
             '1972_31' , '1984_115',
             '1984_118' , '1971_336',
             '1961_232' , '1987_380',
             '1964_25' , '1955_79',
             '1976_43' , '1985_257',
             '1987_154' , '1964_144',
             '1973_186' , '1986_218',
             '1990_96' , '1990_171',
             '1958_3' , '1992_144',
             '1979_158' , '1965_111',
             '1962_303' , '1972_291',
             '1987_37' , '1989_233',
             '1953_40' , '1953_24',
             '1966_154' , '1976_43',
             '1953_24' , '1957_52',
             '1984_115' , '1971_49',
             '1980_221' , '1984_115',
             '1980_39' , '1969_324',
             '1991_48' , '1987_189',
             '1979_104' , '1979_110',
             '1985_113' , '1969_324',
             '1979_33' , '1979_110',
             '1968_197' , '1972_62',
             '1992_47' , '1984_115',
             '1991_12' , '1985_113',
             '1983_37' , '1979_33'
            ]
legal_score = [0]*14 + [1]*1 + [2]*5 + [3]*1 + [5]*12 + [7]*7 + [8]*2 + [9]*4 + [10]*4
testing_file = [(test_file[i]+'.txt',test_file[i+1]+'.txt') for i in range(0,len(test_file),2)]

In [9]:
import os
import gensim
import numpy as np
import pandas as pd
from numpy.linalg import norm


  from pandas.core import (


In [10]:
def cosim(vec1, vec2): 
    cos = np.dot(vec1, vec2) / (norm(vec1) * norm(vec2))
    return cos

In [11]:
df = pd.DataFrame(columns=['Case_1', 'Case_2', 'Cos_Similarity'])


In [12]:
base_dir = r"D:\Files\DataSet"


for doc1, doc2 in testing_file:
    year1 = doc1.split("_")[0]
    year2 = doc2.split("_")[0]

    
    doc1_path = os.path.join(base_dir, year1, doc1)
    doc2_path = os.path.join(base_dir, year2, doc2)

    with open(doc1_path, 'rb') as f:
        text1 = f.read().decode('utf-8', errors='ignore')
    with open(doc2_path, 'rb') as f:
        text2 = f.read().decode('utf-8', errors='ignore')


    vector1 = model.infer_vector(text1.split())
    vector2 = model.infer_vector(text2.split())

    sim = cosim(vector1, vector2)

    df.loc[len(df)] = [doc1, doc2, sim]

In [13]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [14]:
df

Unnamed: 0,Case_1,Case_2,Cos_Similarity
0,1992_47.txt,1992_76.txt,0.277961
1,1992_76.txt,1992_182.txt,0.279482
2,1972_11.txt,1984_115.txt,0.121031
3,1969_57.txt,1980_91.txt,0.366385
4,1959_151.txt,1982_28.txt,0.386851
5,1976_200.txt,1959_151.txt,0.2381
6,1985_114.txt,1959_151.txt,0.33172
7,1966_236.txt,1967_267.txt,0.380725
8,1961_34.txt,1979_110.txt,0.299225
9,1961_34.txt,1987_37.txt,0.335277


In [15]:
print(df)

          Case_1        Case_2  Cos_Similarity
0    1992_47.txt   1992_76.txt        0.277961
1    1992_76.txt  1992_182.txt        0.279482
2    1972_11.txt  1984_115.txt        0.121031
3    1969_57.txt   1980_91.txt        0.366385
4   1959_151.txt   1982_28.txt        0.386851
5   1976_200.txt  1959_151.txt        0.238100
6   1985_114.txt  1959_151.txt        0.331720
7   1966_236.txt  1967_267.txt        0.380725
8    1961_34.txt  1979_110.txt        0.299225
9    1961_34.txt   1987_37.txt        0.335277
10   1992_47.txt  1987_315.txt        0.384198
11  1971_138.txt   1992_47.txt        0.487341
12   1992_47.txt   1992_76.txt        0.258591
13  1984_115.txt  1987_315.txt        0.418575
14  1983_129.txt   1983_27.txt        0.642085
15  1979_110.txt   1953_28.txt        0.305152
16  1963_170.txt  1979_158.txt        0.469809
17   1983_27.txt   1983_37.txt        0.606873
18   1983_27.txt   1979_33.txt        0.538635
19  1984_115.txt   1981_49.txt        0.572566
20  1979_110.

In [16]:
legal_score

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 2,
 2,
 2,
 2,
 3,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10]

In [17]:
x = legal_score
y = df['Cos_Similarity']
np.corrcoef(x,y)[0][1]

0.6031118873417262

In [18]:
df['LSE'] = legal_score 

In [19]:
df['Cos_Similarity_class'] = np.where(df['Cos_Similarity']>.5,1,0)
df['LSE_class'] = np.where(df['LSE']>5,1,0)

In [20]:
df

Unnamed: 0,Case_1,Case_2,Cos_Similarity,LSE,Cos_Similarity_class,LSE_class
0,1992_47.txt,1992_76.txt,0.277961,0,0,0
1,1992_76.txt,1992_182.txt,0.279482,0,0,0
2,1972_11.txt,1984_115.txt,0.121031,0,0,0
3,1969_57.txt,1980_91.txt,0.366385,0,0,0
4,1959_151.txt,1982_28.txt,0.386851,0,0,0
5,1976_200.txt,1959_151.txt,0.2381,0,0,0
6,1985_114.txt,1959_151.txt,0.33172,0,0,0
7,1966_236.txt,1967_267.txt,0.380725,0,0,0
8,1961_34.txt,1979_110.txt,0.299225,0,0,0
9,1961_34.txt,1987_37.txt,0.335277,0,0,0


In [21]:
from sklearn.metrics import accuracy_score,classification_report

In [22]:
y_true = df['LSE_class']
y_pred = df['Cos_Similarity_class']
accuracy_score(y_true, y_pred)

0.78

In [23]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.82      0.83        33
           1       0.67      0.71      0.69        17

    accuracy                           0.78        50
   macro avg       0.76      0.76      0.76        50
weighted avg       0.78      0.78      0.78        50



In [24]:
df.to_excel('Doc2vec_table.xlsx', index=False)

In [37]:
report = classification_report(y_true, y_pred, output_dict=True)

# Convert report to dataframe
df = pd.DataFrame(report).transpose()

# Save dataframe to Excel
df.to_excel("classification_report.xlsx", sheet_name="classification_report")