# Reading Files And Storing them in a list

In [1]:
import os
folderpath = r"C:\Users\Real_\OneDrive\Documents\Articles"
filepaths = [os.path.join(folderpath,name) for name in os.listdir(folderpath)]
files = []
for path in filepaths:
    with open(path, 'r',encoding = 'ISO-8859-1') as f:
        file = f.readlines()
        files.append(file)

In [2]:
files

[['Anarchism is a political philosophy that advocates stateless societies often defined as self-governed voluntary institutions, "ANARCHISM, a social philosophy that rejects authoritarian government and maintains that voluntary institutions are best suited to express man\\\'s natural social tendencies." George Woodcock. "Anarchism" at The Encyclopedia of Philosophy "In a society developed on these lines, the voluntary associations which already now begin to cover all the fields of human activity would take a still greater extension so as to substitute themselves for the state in all its functions." Peter Kropotkin. "Anarchism" from the EncyclopÃ\x83Â¦dia Britannica "Anarchism." The Shorter Routledge Encyclopedia of Philosophy. 2005. p. 14 "Anarchism is the view that a society without the state, or government, is both possible and desirable." Sheehan, Sean. Anarchism, London: Reaktion Books Ltd., 2004. p. 85 but that several authors have defined as more specific institutions based on no

# Removing punctuations

In [3]:
all_data=" ".join(map(str,files))

In [4]:
all_data

'[\'Anarchism is a political philosophy that advocates stateless societies often defined as self-governed voluntary institutions, "ANARCHISM, a social philosophy that rejects authoritarian government and maintains that voluntary institutions are best suited to express man\\\\\\\'s natural social tendencies." George Woodcock. "Anarchism" at The Encyclopedia of Philosophy "In a society developed on these lines, the voluntary associations which already now begin to cover all the fields of human activity would take a still greater extension so as to substitute themselves for the state in all its functions." Peter Kropotkin. "Anarchism" from the EncyclopÃ\\x83Â¦dia Britannica "Anarchism." The Shorter Routledge Encyclopedia of Philosophy. 2005. p. 14 "Anarchism is the view that a society without the state, or government, is both possible and desirable." Sheehan, Sean. Anarchism, London: Reaktion Books Ltd., 2004. p. 85 but that several authors have defined as more specific institutions based

In [5]:
import re
data_after = re.sub(r'[^\w\s]', '', all_data)

In [6]:
data_after

'Anarchism is a political philosophy that advocates stateless societies often defined as selfgoverned voluntary institutions ANARCHISM a social philosophy that rejects authoritarian government and maintains that voluntary institutions are best suited to express mans natural social tendencies George Woodcock Anarchism at The Encyclopedia of Philosophy In a society developed on these lines the voluntary associations which already now begin to cover all the fields of human activity would take a still greater extension so as to substitute themselves for the state in all its functions Peter Kropotkin Anarchism from the EncyclopÃx83Âdia Britannica Anarchism The Shorter Routledge Encyclopedia of Philosophy 2005 p 14 Anarchism is the view that a society without the state or government is both possible and desirable Sheehan Sean Anarchism London Reaktion Books Ltd 2004 p 85 but that several authors have defined as more specific institutions based on nonhierarchical free associations as many ana

# Removing Stop Words

In [7]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize


text_tokens = word_tokenize(data_after)

remove_stopWords = [word for word in text_tokens if not word in stopwords.words()]

print(remove_stopWords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Real_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Real_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Anarchism', 'political', 'philosophy', 'advocates', 'stateless', 'societies', 'often', 'defined', 'selfgoverned', 'voluntary', 'institutions', 'ANARCHISM', 'social', 'philosophy', 'rejects', 'authoritarian', 'government', 'maintains', 'voluntary', 'institutions', 'best', 'suited', 'express', 'mans', 'natural', 'social', 'tendencies', 'George', 'Woodcock', 'Anarchism', 'The', 'Encyclopedia', 'Philosophy', 'In', 'society', 'developed', 'lines', 'voluntary', 'associations', 'already', 'begin', 'cover', 'fields', 'human', 'activity', 'would', 'still', 'greater', 'extension', 'substitute', 'state', 'functions', 'Peter', 'Kropotkin', 'Anarchism', 'EncyclopÃx83Âdia', 'Britannica', 'Anarchism', 'The', 'Shorter', 'Routledge', 'Encyclopedia', 'Philosophy', '2005', 'p', '14', 'Anarchism', 'view', 'society', 'without', 'state', 'government', 'possible', 'desirable', 'Sheehan', 'Sean', 'Anarchism', 'London', 'Reaktion', 'Books', 'Ltd', '2004', 'p', '85', 'several', 'authors', 'defined', 'specific

# Applying K-Means Clustering

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from warnings import simplefilter
simplefilter(action = 'ignore',category = FutureWarning)


vectorizer = TfidfVectorizer(stop_words = 'english')
X = vectorizer.fit_transform(remove_stopWords)

true_k =5
model = KMeans(n_clusters = true_k,init = 'k-means++',max_iter = 100,n_init = 1)
model.fit(X)
print("Top terms per cluster :")
order_centroids = model.cluster_centers_.argsort()[:,::-1]
terms = vectorizer.get_feature_names()
for i in range (true_k):
    print("Cluster %d: " % i)
    for j in order_centroids[i,:10]:
        print("%s " % terms[j])

Top terms per cluster :
Cluster 0: 
london 
ãx80x94 
fields 
demands 
desirable 
destroy 
developed 
education 
encyclopedia 
encyclopãx83âdia 
Cluster 1: 
anarchism 
philosophy 
government 
authority 
society 
institutions 
social 
voluntary 
anarchists 
ãx80x94 
Cluster 2: 
state 
ãx80x94 
faq 
demands 
desirable 
destroy 
developed 
education 
encyclopedia 
encyclopãx83âdia 
Cluster 3: 
hierarchical 
ãx80x94 
fields 
demands 
desirable 
destroy 
developed 
education 
encyclopedia 
encyclopãx83âdia 
Cluster 4: 
hierarchy 
ãx80x94 
fields 
demands 
desirable 
destroy 
developed 
education 
encyclopedia 
encyclopãx83âdia 


In [9]:
print('Guessing: ')
Y = vectorizer.transform(['restraint'])
guessing = model.predict(Y)
print(guessing)

Guessing: 
[1]
