In [1]:
import glob
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
filenames = glob.glob('corpus-20090418/*')
filenames[:3]

['corpus-20090418/g2pA_taskd.txt',
 'corpus-20090418/g2pC_taska.txt',
 'corpus-20090418/g1pA_taske.txt']

In [3]:
content = [open(filename,encoding='latin-1').read() for filename in filenames]

In [4]:
content[:2]

[" In probability theory, Bayes' theorem (often called Bayes' law after Rev Thomas Bayes) relates the conditional and marginal probabilities of two random events. It is often used to compute posterior probabilities given observations. For example, a patient may be observed to have certain symptoms. Bayes' theorem can be used to compute the probability that a proposed diagnosis is correct, given that observation.\nAs a formal theorem, Bayes' theorem is valid in all common interpretations of probability. However, it plays a central role in the debate around the foundations of statistics: frequentist and Bayesian interpretations disagree about the ways in which probabilities should be assigned in applications. \nSuppose there is a co-ed school having 60% boys and 40% girls as students. The girl students wear trousers or skirts in equal numbers; the boys all wear trousers. An observer sees a (random) student from a distance; all they can see is that this student is wearing trousers. What i

In [5]:
df = pd.DataFrame({
    'filename': filenames,
    'content': content
})
df
# df.shape

Unnamed: 0,filename,content
0,corpus-20090418/g2pA_taskd.txt,"In probability theory, Bayes' theorem (often ..."
1,corpus-20090418/g2pC_taska.txt,Inheritance is a way to form new classes (inst...
2,corpus-20090418/g1pA_taske.txt,Dynamic programming is an algorithmic techniqu...
3,corpus-20090418/g0pE_taske.txt,dynamic programming is a method of solving pro...
4,corpus-20090418/g4pB_taskb.txt,Page rank algorithm is used to determine a web...
...,...,...
95,corpus-20090418/g0pA_taske.txt,Dynamic Programming is an algorithm design tec...
96,corpus-20090418/g1pA_taskd.txt,Bayes' theorem relates the conditional and mar...
97,corpus-20090418/g0pB_taskb.txt,PageRank (PR) refers to both the concept and t...
98,corpus-20090418/g4pD_taskb.txt,PageRank is a probability distribution used to...


In [6]:
df.filename = df.filename.str.replace("corpus-20090418/","")

In [7]:
df

Unnamed: 0,filename,content
0,g2pA_taskd.txt,"In probability theory, Bayes' theorem (often ..."
1,g2pC_taska.txt,Inheritance is a way to form new classes (inst...
2,g1pA_taske.txt,Dynamic programming is an algorithmic techniqu...
3,g0pE_taske.txt,dynamic programming is a method of solving pro...
4,g4pB_taskb.txt,Page rank algorithm is used to determine a web...
...,...,...
95,g0pA_taske.txt,Dynamic Programming is an algorithm design tec...
96,g1pA_taskd.txt,Bayes' theorem relates the conditional and mar...
97,g0pB_taskb.txt,PageRank (PR) refers to both the concept and t...
98,g4pD_taskb.txt,PageRank is a probability distribution used to...


In [8]:
vectorizer = CountVectorizer()

# vectorize = CountVectorizer(binary=True) # only YES or NOW

matrix = vectorizer.fit_transform(df.content)

words_df = pd.DataFrame(matrix.toarray(),
            columns=vectorizer.get_feature_names(),
            index=df.filename)

words_df.head()

Unnamed: 0_level_0,10,15,1702â,1761,1940s,1953,1967,1982,2005,2007,...,yang,year,years,yn,yo,you,your,yours,yourself,zero
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
g2pA_taskd.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
g2pC_taska.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
g1pA_taske.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
g0pE_taske.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
g4pB_taskb.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# words_df.index = filenames

In [10]:
words_df.head()


Unnamed: 0_level_0,10,15,1702â,1761,1940s,1953,1967,1982,2005,2007,...,yang,year,years,yn,yo,you,your,yours,yourself,zero
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
g2pA_taskd.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
g2pC_taska.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
g1pA_taske.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
g0pE_taske.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
g4pB_taskb.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Compute the similarities using the word count
similarities = cosine_similarity(matrix)

# Make a fancy colored dataframe about it
similarity = pd.DataFrame(similarities, index=df.filename, columns=df.filename)

similarity.style.background_gradient(axis=None, high=0.25)

filename,g2pA_taskd.txt,g2pC_taska.txt,g1pA_taske.txt,g0pE_taske.txt,g4pB_taskb.txt,g3pA_taskd.txt,g0pA_taska.txt,g3pC_taskc.txt,g0pD_taskd.txt,g3pA_taska.txt,g3pA_taske.txt,g3pC_taske.txt,g1pA_taskb.txt,g1pD_taske.txt,g0pD_taskb.txt,g3pA_taskb.txt,g1pD_taskc.txt,g0pD_taske.txt,g4pE_taske.txt,g4pD_taskc.txt,g4pB_taska.txt,orig_taskc.txt,g0pC_taskb.txt,g1pA_taska.txt,g0pB_taska.txt,g2pA_taska.txt,g4pC_taske.txt,g2pE_taskc.txt,g0pB_taskc.txt,g1pB_taskb.txt,g4pC_taskc.txt,g0pA_taskc.txt,g0pB_taske.txt,g0pD_taskc.txt,orig_taske.txt,g1pB_taskc.txt,g4pB_taskd.txt,g4pE_taskd.txt,g2pB_taska.txt,g4pC_taskb.txt,orig_taskb.txt,g1pB_taska.txt,g4pB_taske.txt,g2pE_taska.txt,g2pC_taske.txt,g1pB_taskd.txt,orig_taska.txt,g0pE_taskc.txt,g2pA_taske.txt,g3pC_taska.txt,g2pE_taskd.txt,g3pB_taskb.txt,g0pC_taskc.txt,g3pA_taskc.txt,g4pD_taskd.txt,g0pC_taska.txt,g0pA_taskd.txt,g4pC_taska.txt,g2pC_taskc.txt,g2pE_taske.txt,g1pD_taskd.txt,g2pC_taskb.txt,g3pC_taskb.txt,g4pE_taska.txt,g4pE_taskc.txt,g0pE_taskd.txt,g0pC_taskd.txt,g2pC_taskd.txt,g2pB_taskb.txt,g0pB_taskd.txt,g3pB_taske.txt,g2pE_taskb.txt,orig_taskd.txt,g3pB_taskc.txt,g0pC_taske.txt,g3pB_taskd.txt,g2pA_taskc.txt,g4pB_taskc.txt,g0pA_taskb.txt,g3pC_taskd.txt,g1pB_taske.txt,g4pD_taske.txt,g0pD_taska.txt,g2pB_taske.txt,g4pD_taska.txt,g1pA_taskc.txt,g2pB_taskd.txt,g1pD_taska.txt,g3pB_taska.txt,g1pD_taskb.txt,g4pE_taskb.txt,g4pC_taskd.txt,g0pE_taskb.txt,g2pB_taskc.txt,g0pE_taska.txt,g0pA_taske.txt,g1pA_taskd.txt,g0pB_taskb.txt,g4pD_taskb.txt,g2pA_taskb.txt
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1
g2pA_taskd.txt,1.0,0.47358,0.496586,0.513807,0.565915,0.778648,0.576924,0.508678,0.324398,0.617814,0.612772,0.524031,0.607979,0.581789,0.557124,0.62762,0.540505,0.430111,0.549538,0.633778,0.461038,0.617145,0.542856,0.466509,0.554124,0.469765,0.679217,0.388366,0.625838,0.575306,0.460831,0.624887,0.626392,0.655616,0.624552,0.626211,0.7712,0.558716,0.540703,0.673198,0.621143,0.546697,0.447728,0.554105,0.570302,0.751807,0.561044,0.507748,0.57385,0.453506,0.713273,0.612083,0.646283,0.670411,0.661095,0.561659,0.752688,0.544479,0.613682,0.577652,0.616353,0.593541,0.584627,0.494708,0.57813,0.372549,0.683589,0.888428,0.613484,0.798602,0.592063,0.65324,0.778346,0.628192,0.627155,0.681108,0.607071,0.682271,0.652001,0.759145,0.630289,0.583362,0.53064,0.62848,0.673168,0.564111,0.77454,0.481532,0.537938,0.696303,0.571499,0.78371,0.435521,0.559363,0.546393,0.413753,0.71428,0.606694,0.626838,0.628633
g2pC_taska.txt,0.47358,1.0,0.409469,0.402117,0.521896,0.477802,0.540583,0.340955,0.256571,0.652719,0.466825,0.411157,0.511751,0.462548,0.41558,0.471154,0.355311,0.34731,0.464539,0.433657,0.557591,0.409042,0.465691,0.645894,0.478507,0.510023,0.513256,0.313512,0.428523,0.464005,0.393368,0.438894,0.431553,0.47473,0.49695,0.457243,0.455452,0.403795,0.509119,0.43173,0.484204,0.541921,0.401499,0.795544,0.46142,0.481132,0.848437,0.342439,0.486684,0.718851,0.536618,0.514645,0.455754,0.441852,0.516625,0.761324,0.418755,0.858831,0.442981,0.459242,0.423952,0.462885,0.437134,0.493138,0.403319,0.240836,0.420487,0.453115,0.422945,0.497905,0.455502,0.493043,0.476924,0.475496,0.476071,0.411253,0.423325,0.493295,0.46054,0.460423,0.494454,0.50468,0.825993,0.472696,0.721145,0.429963,0.476098,0.810323,0.579473,0.441455,0.474719,0.484795,0.367811,0.43777,0.843372,0.382179,0.471113,0.451817,0.438883,0.479676
g1pA_taske.txt,0.496586,0.409469,1.0,0.440128,0.481822,0.494969,0.446367,0.355121,0.322104,0.538725,0.701747,0.481178,0.528083,0.775315,0.361617,0.509228,0.449684,0.515451,0.539415,0.470977,0.471025,0.407611,0.449388,0.498642,0.453638,0.450882,0.5553,0.403614,0.419338,0.46191,0.401542,0.441501,0.53659,0.486685,0.614943,0.437192,0.473958,0.486717,0.515659,0.464496,0.473086,0.504483,0.561978,0.552458,0.667406,0.41716,0.488457,0.322822,0.509703,0.414266,0.48239,0.54061,0.448655,0.477088,0.521987,0.512355,0.51529,0.469764,0.410941,0.567343,0.422556,0.467984,0.419688,0.444436,0.356957,0.31272,0.476626,0.485107,0.576527,0.504346,0.553051,0.457694,0.496368,0.514576,0.533428,0.389692,0.407515,0.49091,0.425991,0.473628,0.675107,0.629637,0.457988,0.572648,0.487414,0.413216,0.516832,0.463868,0.520489,0.45162,0.490837,0.49886,0.318584,0.446993,0.472854,0.646888,0.448149,0.454088,0.413002,0.419348
g0pE_taske.txt,0.513807,0.402117,0.440128,1.0,0.45494,0.479975,0.42657,0.345807,0.256776,0.516914,0.610066,0.934129,0.468351,0.55162,0.456004,0.477266,0.398885,0.424918,0.647145,0.485905,0.420653,0.490172,0.464221,0.412444,0.467488,0.44031,0.754177,0.346594,0.502986,0.440471,0.398966,0.487533,0.666841,0.510417,0.684873,0.502971,0.441498,0.416767,0.430027,0.515924,0.507015,0.485625,0.594445,0.474248,0.611943,0.504001,0.480749,0.380304,0.617191,0.435278,0.544005,0.473218,0.471974,0.531212,0.488568,0.465061,0.452945,0.482208,0.472124,0.533581,0.424785,0.469975,0.433956,0.445512,0.44383,0.228755,0.407938,0.512745,0.425643,0.501641,0.859683,0.537699,0.479093,0.496124,0.841069,0.478412,0.481786,0.545982,0.513988,0.473127,0.773226,0.693666,0.474927,0.783802,0.535421,0.464749,0.504294,0.442314,0.442237,0.516309,0.450432,0.496185,0.364069,0.486758,0.482112,0.526433,0.514871,0.448868,0.49354,0.491347
g4pB_taskb.txt,0.565915,0.521896,0.481822,0.45494,1.0,0.559925,0.545916,0.513958,0.246114,0.617523,0.631895,0.456302,0.699589,0.580607,0.612635,0.741247,0.545262,0.341768,0.628159,0.620848,0.520519,0.552293,0.702692,0.543938,0.55767,0.524712,0.667012,0.402434,0.583424,0.731833,0.580054,0.55853,0.600539,0.577442,0.620971,0.639043,0.518704,0.505632,0.523494,0.642272,0.733066,0.582938,0.500391,0.609929,0.551199,0.617094,0.589719,0.426427,0.610069,0.47521,0.677967,0.690982,0.656449,0.608939,0.598929,0.584085,0.464437,0.591641,0.605072,0.570926,0.546386,0.778066,0.673621,0.406976,0.583576,0.242178,0.557653,0.533218,0.591722,0.607966,0.571061,0.674961,0.559879,0.622759,0.55319,0.56276,0.553459,0.639344,0.708951,0.522426,0.612172,0.625479,0.550851,0.617848,0.679257,0.573556,0.568866,0.580768,0.597526,0.628459,0.637595,0.577771,0.553707,0.61978,0.583878,0.43012,0.635217,0.705103,0.621128,0.731403
g3pA_taskd.txt,0.778648,0.477802,0.494969,0.479975,0.559925,1.0,0.509716,0.427434,0.40631,0.621315,0.620462,0.515332,0.595428,0.554244,0.537665,0.586083,0.51005,0.398836,0.569624,0.563328,0.498941,0.612943,0.527277,0.474009,0.540362,0.501946,0.639614,0.416551,0.636994,0.537208,0.495373,0.630713,0.533097,0.646454,0.582,0.625862,0.916487,0.601604,0.496945,0.591968,0.599209,0.542862,0.453686,0.590438,0.523423,0.828134,0.554876,0.491737,0.51849,0.439404,0.754074,0.573176,0.552603,0.631582,0.790557,0.57448,0.717337,0.547657,0.58212,0.540114,0.688986,0.553056,0.502352,0.508262,0.539188,0.400779,0.912449,0.846618,0.512238,0.87585,0.549077,0.64141,0.998898,0.606751,0.581562,0.777589,0.618314,0.673421,0.604549,0.80321,0.630798,0.578164,0.513485,0.581137,0.631752,0.568604,0.96506,0.539415,0.521667,0.618763,0.573132,0.991907,0.470802,0.521506,0.5524,0.428778,0.847539,0.526391,0.623392,0.604382
g0pA_taska.txt,0.576924,0.540583,0.446367,0.42657,0.545916,0.509716,1.0,0.465712,0.173047,0.697649,0.590341,0.449118,0.51492,0.482756,0.468011,0.543313,0.498879,0.322235,0.546877,0.567169,0.570521,0.516498,0.527581,0.571793,0.623242,0.570528,0.592388,0.392968,0.556806,0.504078,0.509209,0.515246,0.547727,0.545214,0.557412,0.581079,0.471945,0.535149,0.68065,0.548832,0.566793,0.640766,0.433312,0.623285,0.496301,0.511672,0.574161,0.439363,0.483826,0.46146,0.557143,0.505703,0.581277,0.582064,0.537191,0.621271,0.496952,0.576163,0.555289,0.50082,0.50633,0.515499,0.570669,0.543476,0.512313,0.195566,0.468197,0.525408,0.438097,0.569005,0.484519,0.584963,0.510516,0.540254,0.518753,0.505809,0.509181,0.575385,0.587237,0.471207,0.555643,0.53612,0.562488,0.563957,0.674183,0.544149,0.512502,0.566215,0.648926,0.515438,0.533145,0.5246,0.395115,0.516392,0.572311,0.365522,0.575901,0.526775,0.536643,0.544944
g3pC_taskc.txt,0.508678,0.340955,0.355121,0.345807,0.513958,0.427434,0.465712,1.0,0.131924,0.461654,0.471329,0.358733,0.489182,0.453951,0.44132,0.550216,0.650528,0.274287,0.490588,0.568153,0.358067,0.58834,0.477005,0.399195,0.468925,0.456872,0.554721,0.327151,0.604917,0.490516,0.476326,0.581549,0.554795,0.561731,0.535697,0.598953,0.398819,0.496894,0.444005,0.519635,0.543676,0.42021,0.396303,0.383999,0.462127,0.410548,0.38065,0.503898,0.492973,0.285022,0.438124,0.476753,0.638242,0.647897,0.395683,0.433927,0.402362,0.379427,0.62999,0.4943,0.419162,0.510672,0.572523,0.33226,0.608722,0.182167,0.398557,0.416148,0.42991,0.494629,0.419162,0.515098,0.426649,0.565309,0.453025,0.450621,0.579414,0.599788,0.530391,0.368808,0.484695,0.469385,0.342574,0.544576,0.50474,0.546425,0.432177,0.329117,0.421957,0.520403,0.475936,0.446118,0.357091,0.567389,0.370406,0.312598,0.474943,0.534169,0.48189,0.547105
g0pD_taskd.txt,0.324398,0.256571,0.322104,0.256776,0.246114,0.40631,0.173047,0.131924,1.0,0.3279,0.306234,0.283473,0.257703,0.304177,0.247797,0.217602,0.219883,0.230341,0.221562,0.191097,0.260341,0.277408,0.210219,0.302003,0.222931,0.196933,0.292473,0.20628,0.284335,0.220678,0.152204,0.292466,0.186695,0.316557,0.249002,0.247289,0.350895,0.247797,0.217285,0.26714,0.23877,0.248847,0.216542,0.375718,0.235173,0.354495,0.324199,0.244989,0.211307,0.265908,0.327196,0.304411,0.208084,0.254515,0.407052,0.32119,0.364174,0.309363,0.211591,0.233794,0.285094,0.240432,0.160954,0.314489,0.194996,0.458162,0.357176,0.376219,0.269097,0.327011,0.277592,0.267987,0.405563,0.26782,0.2717,0.322623,0.302899,0.306226,0.214163,0.344913,0.300511,0.290449,0.307971,0.248536,0.311545,0.228728,0.423756,0.316794,0.274795,0.285055,0.236413,0.40153,0.25779,0.198009,0.329741,0.211933,0.35155,0.171905,0.273338,0.250498
g3pA_taska.txt,0.617814,0.652719,0.538725,0.516914,0.617523,0.621315,0.697649,0.461654,0.3279,1.0,0.682158,0.533325,0.579223,0.592307,0.532805,0.580171,0.533196,0.394132,0.632089,0.581911,0.654044,0.592329,0.569526,0.692518,0.658958,0.597514,0.671482,0.43939,0.629746,0.543407,0.526894,0.608108,0.595498,0.630601,0.611587,0.635676,0.548451,0.548256,0.651296,0.631346,0.604071,0.677389,0.483548,0.734831,0.586507,0.625836,0.703704,0.492767,0.55992,0.589613,0.645625,0.612033,0.624144,0.632074,0.640875,0.742598,0.579935,0.700623,0.610309,0.561167,0.569122,0.575386,0.573335,0.669097,0.556038,0.241355,0.58295,0.626301,0.533415,0.618631,0.582673,0.65568,0.620174,0.609585,0.610002,0.597269,0.600265,0.664596,0.609883,0.565583,0.671949,0.609593,0.684236,0.626405,0.74779,0.591608,0.631844,0.666685,0.698309,0.585145,0.581779,0.637399,0.47669,0.557486,0.704448,0.453223,0.664343,0.538104,0.601399,0.60863


In [12]:
similarity['g3pC_taske.txt'].sort_values()

filename
g0pE_taskd.txt    0.246926
g0pD_taskd.txt    0.283473
g3pC_taskc.txt    0.358733
g0pE_taskb.txt    0.373503
g2pE_taskc.txt    0.383618
                    ...   
g2pB_taske.txt    0.779113
g3pB_taske.txt    0.820455
g0pC_taske.txt    0.825347
g0pE_taske.txt    0.934129
g3pC_taske.txt    1.000000
Name: g3pC_taske.txt, Length: 100, dtype: float64

In [13]:
words_df.loc[['g3pC_taske.txt','g0pE_taske.txt']]

Unnamed: 0_level_0,10,15,1702â,1761,1940s,1953,1967,1982,2005,2007,...,yang,year,years,yn,yo,you,your,yours,yourself,zero
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
g3pC_taske.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
g0pE_taske.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
words_df.loc[['g3pC_taske.txt','g0pE_taske.txt']].T

filename,g3pC_taske.txt,g0pE_taske.txt
10,0,0
15,0,0
1702â,0,0
1761,0,0
1940s,0,0
...,...,...
you,0,0
your,0,0
yours,0,0
yourself,0,0


In [15]:
comparing = words_df.loc[['g3pC_taske.txt','g0pE_taske.txt']].T.replace(0,np.nan).dropna()

comparing.head()

filename,g3pC_taske.txt,g0pE_taske.txt
acceptable,1.0,1.0
action,1.0,2.0
algorithm,1.0,1.0
all,1.0,1.0
an,3.0,3.0


# Melting similiarity table and sorting

In [16]:
colunas = len(similarity.columns)
indices = len(similarity.index)

print(colunas,indices)


100 100


In [20]:
similarity['file'] = similarity.index

similarity.melt(id_vars='file',value_name='total')

Unnamed: 0,file,filename,total
0,g2pA_taskd.txt,g2pA_taskd.txt,1.000000
1,g2pC_taska.txt,g2pA_taskd.txt,0.473580
2,g1pA_taske.txt,g2pA_taskd.txt,0.496586
3,g0pE_taske.txt,g2pA_taskd.txt,0.513807
4,g4pB_taskb.txt,g2pA_taskd.txt,0.565915
...,...,...,...
9995,g0pA_taske.txt,g2pA_taskb.txt,0.405048
9996,g1pA_taskd.txt,g2pA_taskb.txt,0.689278
9997,g0pB_taskb.txt,g2pA_taskb.txt,0.779769
9998,g4pD_taskb.txt,g2pA_taskb.txt,0.807536


In [26]:
simdf = similarity.melt(id_vars='file',value_name='total')

rank = simdf[simdf['file'] != simdf['filename']]

# rank['hash'] = rank['file'] + rank['filename']



# rank.style.background_gradient(axis=None, high=0.25)

In [27]:
rank['hash'] = rank.apply(lambda row: ''.join(sorted([row['file'], row['filename']])), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rank['hash'] = rank.apply(lambda row: ''.join(sorted([row['file'], row['filename']])), axis=1)


In [28]:
rank.drop_duplicates('hash').sort_values(by='total',ascending=False)

Unnamed: 0,file,filename,total,hash
572,orig_taskd.txt,g3pA_taskd.txt,0.998898,g3pA_taskd.txtorig_taskd.txt
4657,g4pC_taska.txt,orig_taska.txt,0.991982,g4pC_taska.txtorig_taska.txt
591,g4pC_taskd.txt,g3pA_taskd.txt,0.991907,g3pA_taskd.txtg4pC_taskd.txt
7291,g4pC_taskd.txt,orig_taskd.txt,0.990085,g4pC_taskd.txtorig_taskd.txt
4694,g0pE_taska.txt,orig_taska.txt,0.988922,g0pE_taska.txtorig_taska.txt
...,...,...,...,...
830,g4pC_taskc.txt,g0pD_taskd.txt,0.152204,g0pD_taskd.txtg4pC_taskc.txt
6592,g0pE_taskb.txt,g0pE_taskd.txt,0.150918,g0pE_taskb.txtg0pE_taskd.txt
3065,g0pE_taskd.txt,g4pC_taskc.txt,0.149153,g0pE_taskd.txtg4pC_taskc.txt
2765,g0pE_taskd.txt,g2pE_taskc.txt,0.134231,g0pE_taskd.txtg2pE_taskc.txt


# Let's vectorizer ngrams!

In [None]:
vectorizer = CountVectorizer(ngram_range=(3,6))

matrix = vectorizer.fit_transform(df.content)

words_df = pd.DataFrame(matrix.toarray(),
            columns=vectorizer.get_feature_names(),
            index=df.filename)

words_df.head()

# Compute the similarities using the word count
similarities = cosine_similarity(matrix)

# Make a fancy colored dataframe about it
similarity = pd.DataFrame(similarities, index=df.filename, columns=df.filename)

similarity.style.background_gradient(axis=None, high=0.25)

filename,g2pA_taskd.txt,g2pC_taska.txt,g1pA_taske.txt,g0pE_taske.txt,g4pB_taskb.txt,g3pA_taskd.txt,g0pA_taska.txt,g3pC_taskc.txt,g0pD_taskd.txt,g3pA_taska.txt,g3pA_taske.txt,g3pC_taske.txt,g1pA_taskb.txt,g1pD_taske.txt,g0pD_taskb.txt,g3pA_taskb.txt,g1pD_taskc.txt,g0pD_taske.txt,g4pE_taske.txt,g4pD_taskc.txt,g4pB_taska.txt,orig_taskc.txt,g0pC_taskb.txt,g1pA_taska.txt,g0pB_taska.txt,g2pA_taska.txt,g4pC_taske.txt,g2pE_taskc.txt,g0pB_taskc.txt,g1pB_taskb.txt,g4pC_taskc.txt,g0pA_taskc.txt,g0pB_taske.txt,g0pD_taskc.txt,orig_taske.txt,g1pB_taskc.txt,g4pB_taskd.txt,g4pE_taskd.txt,g2pB_taska.txt,g4pC_taskb.txt,orig_taskb.txt,g1pB_taska.txt,g4pB_taske.txt,g2pE_taska.txt,g2pC_taske.txt,g1pB_taskd.txt,orig_taska.txt,g0pE_taskc.txt,g2pA_taske.txt,g3pC_taska.txt,g2pE_taskd.txt,g3pB_taskb.txt,g0pC_taskc.txt,g3pA_taskc.txt,g4pD_taskd.txt,g0pC_taska.txt,g0pA_taskd.txt,g4pC_taska.txt,g2pC_taskc.txt,g2pE_taske.txt,g1pD_taskd.txt,g2pC_taskb.txt,g3pC_taskb.txt,g4pE_taska.txt,g4pE_taskc.txt,g0pE_taskd.txt,g0pC_taskd.txt,g2pC_taskd.txt,g2pB_taskb.txt,g0pB_taskd.txt,g3pB_taske.txt,g2pE_taskb.txt,orig_taskd.txt,g3pB_taskc.txt,g0pC_taske.txt,g3pB_taskd.txt,g2pA_taskc.txt,g4pB_taskc.txt,g0pA_taskb.txt,g3pC_taskd.txt,g1pB_taske.txt,g4pD_taske.txt,g0pD_taska.txt,g2pB_taske.txt,g4pD_taska.txt,g1pA_taskc.txt,g2pB_taskd.txt,g1pD_taska.txt,g3pB_taska.txt,g1pD_taskb.txt,g4pE_taskb.txt,g4pC_taskd.txt,g0pE_taskb.txt,g2pB_taskc.txt,g0pE_taska.txt,g0pA_taske.txt,g1pA_taskd.txt,g0pB_taskb.txt,g4pD_taskb.txt,g2pA_taskb.txt
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1
g2pA_taskd.txt,1.0,0.0,0.0,0.0,0.0,0.331233,0.0,0.004205,0.0,0.0,0.0,0.0,0.0,0.003808,0.0,0.0,0.00375,0.0,0.000874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002562,0.0,0.0,0.001929,0.0,0.0,0.004415,0.0,0.001789,0.0,0.095816,0.0,0.000845,0.002113,0.0,0.0,0.00219,0.0,0.003713,0.092744,0.0,0.0,0.0,0.0,0.006474,0.0,0.00196,0.0,0.006279,0.0,0.107088,0.0,0.0,0.001015,0.019545,0.002049,0.002977,0.0,0.0,0.008523,0.363686,0.230531,0.0,0.048292,0.003509,0.0,0.318227,0.0,0.0,0.116942,0.0,0.0,0.0,0.033231,0.0,0.0,0.0,0.002457,0.0,0.002713,0.215051,0.0,0.002809,0.0,0.0,0.266426,0.0,0.000953,0.0,0.0,0.134066,0.003482,0.0,0.0
g2pC_taska.txt,0.0,1.0,0.0,0.0,0.0,0.0,0.001201,0.0,0.0,0.0,0.0,0.0,0.002578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001241,0.0,0.0,0.0,0.0,0.002196,0.0,0.0,0.0,0.0,0.0,0.0,0.001185,0.0,0.0008,0.0,0.0,0.0,0.0,0.0,0.0,0.001427,0.000979,0.189977,0.002491,0.0,0.505423,0.0,0.0,0.258204,0.0,0.0,0.0,0.0,0.0,0.055204,0.0,0.520978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.332251,0.0,0.037927,0.0,0.0,0.055439,0.001256,0.0,0.0,0.0,0.0,0.0,0.494756,0.0,0.0,0.0,0.0,0.0
g1pA_taske.txt,0.0,0.0,1.0,0.003856,0.0,0.0011,0.0,0.001917,0.0,0.0,0.009465,0.009429,0.0,0.014324,0.0,0.0,0.0,0.0,0.00598,0.0,0.0,0.0,0.001332,0.003847,0.0,0.0,0.009346,0.0,0.0,0.0,0.0,0.0,0.013286,0.0,0.009787,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007987,0.000998,0.024127,0.0,0.0,0.0,0.008803,0.0,0.001476,0.0,0.0,0.001007,0.001227,0.0,0.0,0.0,0.0,0.001389,0.0,0.0,0.0,0.001276,0.0,0.0,0.0,0.001143,0.001292,0.001348,0.0112,0.0,0.001094,0.0,0.009164,0.0,0.0,0.0,0.0,0.0,0.01264,0.005891,0.0,0.008964,0.0,0.0,0.001172,0.0,0.0,0.0,0.0,0.001098,0.0,0.0,0.0,0.013175,0.001183,0.001191,0.0,0.0
g0pE_taske.txt,0.0,0.0,0.003856,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001734,0.356441,0.0,0.015264,0.0,0.0,0.0,0.0,0.014018,0.0,0.0,0.0,0.0,0.003758,0.0,0.0,0.297858,0.0,0.0,0.0,0.0,0.0,0.090254,0.0,0.412291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267709,0.0,0.020467,0.0,0.0,0.0,0.003685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002035,0.0,0.0,0.0,0.00187,0.0,0.0,0.0,0.0,0.0,0.0,0.632966,0.0,0.0,0.0,0.286463,0.0,0.0,0.0,0.0,0.0,0.435216,0.075953,0.0,0.530298,0.0,0.0,0.0,0.00184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007722,0.0,0.0,0.0,0.0
g4pB_taskb.txt,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.001249,0.0,0.0,0.003772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001148,0.001274,0.0,0.0,0.0,0.0,0.0,0.002018,0.003787,0.0,0.00116,0.0,0.004023,0.0,0.0,0.0,0.0,0.0,0.013829,0.0,0.001392,0.0,0.000955,0.0,0.0,0.0,0.0,0.0,0.0,0.001412,0.003065,0.001283,0.001927,0.0,0.0,0.0,0.0,0.0,0.0,0.001505,0.0,0.002598,0.0,0.001135,0.0,0.001451,0.002187,0.001237,0.0,0.0,0.0,0.0,0.001189,0.0,0.003092,0.003444,0.00365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002367,0.0,0.0,0.0,0.00109,0.0,0.0,0.0,0.001248,0.0,0.0,0.0,0.0,0.001189,0.0
g3pA_taskd.txt,0.331233,0.0,0.0011,0.0,0.0,1.0,0.0,0.0,0.0,0.001091,0.000989,0.0,0.0,0.003265,0.0,0.001091,0.005717,0.0,0.001,0.0,0.0,0.0,0.0,0.001072,0.000908,0.0,0.002929,0.0,0.0,0.0,0.0,0.0,0.005047,0.003514,0.002045,0.0,0.342859,0.0,0.000966,0.0,0.00272,0.0,0.002503,0.0,0.005307,0.117553,0.0,0.0,0.0,0.0,0.002467,0.0,0.0,0.000841,0.004102,0.0,0.094602,0.0,0.0,0.001161,0.03286,0.0,0.003404,0.0,0.0,0.005847,0.60592,0.254957,0.0,0.27154,0.004012,0.0,0.968936,0.0,0.0,0.144495,0.0,0.0,0.004337,0.068384,0.0,0.002954,0.0,0.002809,0.0,0.003101,0.664098,0.0,0.003212,0.000952,0.002882,0.877066,0.0,0.00109,0.0,0.0,0.30456,0.000995,0.0,0.002896
g0pA_taska.txt,0.0,0.001201,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.004859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001177,0.0,0.0,0.002386,0.010111,0.001041,0.0,0.0,0.0,0.002455,0.009092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01829,0.0,0.0,0.014885,0.0,0.005573,0.0,0.0,0.001985,0.004063,0.0,0.00156,0.0,0.0,0.0,0.0,0.0,0.002436,0.0,0.002046,0.0,0.0,0.0,0.0,0.0,0.002375,0.0,0.0,0.0,0.0,0.0,0.002508,0.0,0.002751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006539,0.0,0.001199,0.0,0.0,0.001169,0.004767,0.0,0.0,0.0,0.0,0.0,0.002054,0.0,0.0,0.0,0.0,0.0
g3pC_taskc.txt,0.004205,0.0,0.001917,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.003795,0.0,0.0,0.01744,0.0,0.001742,0.003404,0.0,0.012237,0.0,0.0,0.0,0.0,0.003404,0.0,0.012288,0.0,0.007119,0.015891,0.00352,0.010209,0.002377,0.012041,0.0,0.0,0.0,0.0,0.0,0.0,0.001455,0.0,0.00185,0.0,0.0,0.006363,0.001832,0.0,0.0,0.001556,0.007814,0.016137,0.0,0.0,0.0,0.0,0.016935,0.002024,0.0,0.0,0.0,0.0,0.022458,0.0,0.0,0.0,0.001883,0.0,0.002331,0.0,0.0,0.02716,0.002225,0.0,0.008741,0.011115,0.0,0.0,0.001535,0.0,0.0,0.003265,0.0,0.007209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003799,0.0,0.0,0.0,0.0,0.0,0.0
g0pD_taskd.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
g3pA_taska.txt,0.0,0.0,0.0,0.0,0.001249,0.001091,0.004859,0.0,0.0,1.0,0.0,0.0,0.001304,0.0,0.0,0.001295,0.001697,0.0,0.002373,0.003478,0.017575,0.0,0.001321,0.012725,0.00647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001171,0.0,0.0,0.0,0.001434,0.0,0.008659,0.0,0.003962,0.00126,0.0,0.011644,0.0,0.0,0.0183,0.0,0.0,0.00133,0.0,0.0,0.01039,0.001321,0.012002,0.0,0.0,0.0,0.0,0.001347,0.005065,0.0,0.0,0.00301,0.001134,0.001282,0.001338,0.0,0.0,0.001085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001169,0.005578,0.0,0.001279,0.0,0.001163,0.0,0.007625,0.0,0.0,0.001089,0.0,0.0,0.012047,0.0,0.001174,0.0,0.0,0.0


In [None]:
words_df.loc[['g3pC_taske.txt','g0pE_taske.txt']].T.replace(0,np.nan).dropna()

filename,g3pC_taske.txt,g0pE_taske.txt
action that is,1.0,1.0
an acceptable plan,1.0,1.0
an exhibition is,1.0,1.0
an exhibition is sometimes,1.0,1.0
an exhibition is sometimes called,1.0,1.0
...,...,...
thus the program is the,1.0,1.0
thus the program is the optimal,1.0,1.0
to computer programming,1.0,1.0
to computer programming at,1.0,1.0
