In [1]:
# Importing libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import numpy as np
from collections import Counter

In [2]:
# English stop-word list from NLTK, held as a set for fast membership tests in clean()
stop = set(stopwords.words('english'))
# Individual punctuation characters that clean() deletes from the text
exclude = set(string.punctuation) 
# Shared lemmatizer instance used by clean()
lemma = WordNetLemmatizer()

In [3]:
# Cleaning the text sentences so that punctuation marks, stop words & digits are removed  
def clean(doc):
    """Normalise one sentence into a list of lemmatised tokens.

    Steps, in order: lower-case the text and drop whitespace-separated tokens
    found in ``stop``; delete every character found in ``exclude``; lemmatise
    each remaining token with ``lemma``; remove runs of digits.

    Stop words are filtered before punctuation is stripped, so a token such
    as "the," is compared against ``stop`` with its comma still attached.

    Returns the surviving tokens as a list of strings.
    """
    kept_words = []
    for token in doc.lower().split():
        if token not in stop:
            kept_words.append(token)
    without_punctuation = ''.join(
        char for char in " ".join(kept_words) if char not in exclude
    )
    lemmas = [lemma.lemmatize(token) for token in without_punctuation.split()]
    digit_free = re.sub(r"\d+", "", " ".join(lemmas))
    return digit_free.split()

In [5]:
# Announce the three topic classes used in this demo.
print(
    "There are 10 sentences of following three classes on which K-NN classification and K-means clustering"
    " is performed : \n1. Cricket \n2. Artificial Intelligence \n3. Chemistry"
)

There are 10 sentences of following three classes on which K-NN classification and K-means clustering is performed : 
1. Cricket 
2. Artificial Intelligence 
3. Chemistry


In [6]:
# Training corpus file; the loading cell reads it one sentence per line
path = "Sentences.txt"

In [7]:
# Read the training corpus line by line, normalise each sentence with clean()
# and store it as one space-joined string per input line.
train_clean_sentences = []
# The context manager closes the file as soon as reading finishes; previously
# the handle stayed open for the lifetime of the kernel.
with open(path, 'r') as fp:
    for line in fp:
        # Blank lines are kept (as empty strings) so row positions still match
        # the line numbers of the input file.
        train_clean_sentences.append(' '.join(clean(line.strip())))

In [42]:
# Count the distinct whitespace-separated tokens across all cleaned training sentences.
vocab = " ".join(train_clean_sentences)
vocab_size = len(set(vocab.split()))
vocab_size

252

In [8]:
# TF-IDF vectorizer with its own English stop-word filtering, applied on top
# of the NLTK-based filtering already done in clean()
vectorizer = TfidfVectorizer(stop_words='english')
# Learn the vocabulary from the cleaned training sentences and build their TF-IDF matrix
X = vectorizer.fit_transform(train_clean_sentences)

In [9]:
# True labels for the 30 training sentences: three consecutive blocks of ten,
# labelled 0.0, 1.0 and 2.0 (a float array, exactly as np.zeros would give).
y_train = np.repeat([0.0, 1.0, 2.0], 10)

In [10]:
# Display the label vector to check the three blocks of ten
y_train

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])

In [11]:
# Fit a K-NN classifier (supervised classification, not clustering) on the
# TF-IDF matrix X with the labels in y_train; n_neighbors is set to 5.
modelknn = KNeighborsClassifier(n_neighbors=5)
modelknn.fit(X,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [12]:
# Clustering the 30 training sentences with K-means (unsupervised: y_train is not used here).
# Three clusters are requested, with k-means++ initialisation, max_iter=200 and n_init=100.
# NOTE(review): no random_state is passed, so the cluster numbering may differ
# between runs — confirm before relying on specific cluster ids.
modelkmeans = KMeans(n_clusters=3, init='k-means++', max_iter=200, n_init=100)
modelkmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=200,
    n_clusters=3, n_init=100, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [13]:
# Held-out sentences that will be classified by K-NN and assigned to K-means clusters.
test_sentences = [
    "Chemical compunds are used for preparing bombs based on some reactions",
    "Cricket is a boring game where the batsman only enjoys the game",
    "Machine learning is an area of Artificial intelligence",
]

In [14]:
# Normalise the test sentences the same way as the training sentences:
# clean() then re-join the tokens with single spaces. clean() already removes
# digits, so the extra digit-stripping pass that used to follow was a no-op
# and has been dropped.
test_clean_sentence = [' '.join(clean(sentence)) for sentence in test_sentences]

# transform (not fit_transform): reuse the vocabulary and IDF weights learned
# from the training sentences so test vectors share the training feature space.
Test = vectorizer.transform(test_clean_sentence)

In [15]:
# NOTE: despite its name, this list is not the ground truth for test_sentences.
# The KNN report cell indexes it with the predicted class id, so it acts as a
# class-id -> class-name lookup (0 = Cricket, 1 = AI, 2 = Chemistry).
true_test_labels = ['Cricket','AI','Chemistry']
# Predicted class ids from the fitted K-NN classifier
predicted_labels_knn = modelknn.predict(Test)
# Assigned cluster ids from the fitted K-means model; these are cluster
# indices, not class ids (the K-means report cell prints the mapping)
predicted_labels_kmeans = modelkmeans.predict(Test)

In [21]:
# List the three test sentences before showing the predictions.
print(
    "\nBelow 3 sentences will be predicted against the learned nieghbourhood and learned clusters:\n1. ",
    test_sentences[0],
    "\n2. ",
    test_sentences[1],
    "\n3. ",
    test_sentences[2],
)


Below 3 sentences will be predicted against the learned nieghbourhood and learned clusters:
1.  Chemical compunds are used for preparing bombs based on some reactions 
2.  Cricket is a boring game where the batsman only enjoys the game 
3.  Machine learning is an area of Artificial intelligence


In [23]:
print("\n-------------------------------PREDICTIONS BY KNN------------------------------------------")
# KNN returns float class ids (y_train is a float array), so each one is cast
# to an integer before indexing the class-name list. The builtin int() is used
# because the np.int alias was deprecated in NumPy 1.20 and removed in 1.24.
print("\n",test_sentences[0],":",true_test_labels[int(predicted_labels_knn[0])],\
       "\n",test_sentences[1],":",true_test_labels[int(predicted_labels_knn[1])],\
       "\n",test_sentences[2],":",true_test_labels[int(predicted_labels_knn[2])])


-------------------------------PREDICTIONS BY KNN------------------------------------------

 Chemical compunds are used for preparing bombs based on some reactions : Chemistry 
 Cricket is a boring game where the batsman only enjoys the game : Cricket 
 Machine learning is an area of Artificial intelligence : AI


In [24]:
print("\n-------------------------------PREDICTIONS BY K-Means--------------------------------------")
# For each block of ten training sentences (one block per class), report the
# cluster id that occurs most often among that block's K-means labels.
for heading, start in (
    ("\nIndex of Cricket cluster : ", 0),
    ("Index of Artificial Intelligence cluster : ", 10),
    ("Index of Chemistry cluster : ", 20),
):
    print(heading, Counter(modelkmeans.labels_[start:start + 10]).most_common(1)[0][0])
# Then show the cluster id assigned to each of the three test sentences.
kmeans_report = []
for position in range(3):
    kmeans_report += ["\n", test_sentences[position], ":", predicted_labels_kmeans[position]]
print(*kmeans_report)


-------------------------------PREDICTIONS BY K-Means--------------------------------------

Index of Cricket cluster :  2
Index of Artificial Intelligence cluster :  0
Index of Chemistry cluster :  1

 Chemical compunds are used for preparing bombs based on some reactions : 1 
 Cricket is a boring game where the batsman only enjoys the game : 2 
 Machine learning is an area of Artificial intelligence : 0


# END............

In [90]:
# Dense copy of the test TF-IDF matrix (one row per test sentence, one column per feature)
tarray = Test.toarray()
tarray.shape

(3, 223)

In [94]:
# Print "<column index>, <weight>" for every feature of the first test sentence.
for i, weight in enumerate(tarray[0]):
    print("{0}, {1}".format(i, weight))

0, 0.0
1, 0.0
2, 0.0
3, 0.0
4, 0.0
5, 0.0
6, 0.0
7, 0.0
8, 0.0
9, 0.0
10, 0.0
11, 0.0
12, 0.0
13, 0.0
14, 0.0
15, 0.0
16, 0.0
17, 0.0
18, 0.0
19, 0.0
20, 0.0
21, 0.0
22, 0.0
23, 0.0
24, 0.0
25, 0.0
26, 0.0
27, 0.0
28, 0.0
29, 0.0
30, 0.0
31, 0.0
32, 0.0
33, 0.0
34, 0.0
35, 0.0
36, 0.0
37, 0.0
38, 0.0
39, 0.0
40, 0.0
41, 0.6209517720912319
42, 0.0
43, 0.0
44, 0.0
45, 0.0
46, 0.0
47, 0.0
48, 0.0
49, 0.0
50, 0.0
51, 0.0
52, 0.0
53, 0.0
54, 0.0
55, 0.0
56, 0.0
57, 0.0
58, 0.0
59, 0.0
60, 0.0
61, 0.0
62, 0.0
63, 0.0
64, 0.0
65, 0.0
66, 0.0
67, 0.0
68, 0.0
69, 0.0
70, 0.0
71, 0.0
72, 0.0
73, 0.0
74, 0.0
75, 0.0
76, 0.0
77, 0.0
78, 0.0
79, 0.0
80, 0.0
81, 0.0
82, 0.0
83, 0.0
84, 0.0
85, 0.0
86, 0.0
87, 0.0
88, 0.0
89, 0.0
90, 0.0
91, 0.0
92, 0.0
93, 0.0
94, 0.0
95, 0.0
96, 0.0
97, 0.0
98, 0.0
99, 0.0
100, 0.0
101, 0.0
102, 0.0
103, 0.0
104, 0.0
105, 0.0
106, 0.0
107, 0.0
108, 0.0
109, 0.0
110, 0.0
111, 0.0
112, 0.0
113, 0.0
114, 0.0
115, 0.0
116, 0.0
117, 0.0
118, 0.0
119, 0.0
120, 0.0
121, 0

In [87]:
# K-means cluster id predicted for the first test sentence
xx = predicted_labels_kmeans[0]
xx

1

In [33]:
# Cluster id assigned to each training sentence, followed by how many labels there are
print(modelkmeans.labels_)
print(len(modelkmeans.labels_))
# Bare last expression: displays the fitted estimator itself
modelkmeans

[2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 1 1 1]
30


0

In [53]:
# Rank the feature columns of every cluster centroid: argsort() orders the
# column indices by ascending centroid weight, and [:, ::-1] reverses each row
# so the highest-weighted columns come first.
order_centroids = modelkmeans.cluster_centers_.argsort()[:, ::-1]

In [58]:
# Shape of the ranking matrix, then the ten highest-weighted column indices for cluster 0
print(order_centroids.shape)
print(order_centroids[0][:10])

(3, 223)
[  7 108 120 109 202 101  50 115 169  12]


In [55]:
# Vocabulary terms of the fitted vectorizer, used to turn column indices into words.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when it exists and fall back on older releases.
# .tolist() keeps `terms` a plain list of Python strings in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()

In [56]:
# Number of vocabulary terms, then the first ten of them
print(len(terms))
print(terms[:10])

223
['ability', 'access', 'action', 'advance', 'advanced', 'advancement', 'agent', 'ai', 'allows', 'analytical']


In [62]:
# For each of the three clusters, print the ten terms with the largest
# centroid weight on one comma-terminated line.
for cluster_id in range(3):
    print("\nCluster {0}------------------------------".format(cluster_id))
    top_columns = order_centroids[cluster_id, :10]
    print("".join("{0},".format(terms[column]) for column in top_columns), end="")


Cluster 0------------------------------
ai,intelligence,machine,intelligent,success,human,computer,learning,research,artificial,
Cluster 1------------------------------
property,chemical,element,atom,substance,chemistry,science,compound,reaction,called,
Cluster 2------------------------------
team,run,ball,inning,bat,score,batsman,cricket,opponent,main,

In [68]:
# Full column ranking (highest centroid weight first) for cluster 0
order_centroids[0]

array([  7, 108, 120, 109, 202, 101,  50, 115, 169,  12,  79,   6, 144,
        38, 127,   2, 113, 210,  95, 209, 188,  83, 212,  40, 137,  82,
        10,  72,  81, 105,  98,  37, 159, 123, 141,  91,   8,  66,  60,
       166,   0, 160,  84, 218, 181,  96,   5, 211, 167,  16,  35, 192,
        53,  71,  55, 177, 198, 132,   9, 161,  45, 111, 207, 184, 173,
       186,  75, 114,  85, 145,  58, 118, 194,   1,  59,   4,   3, 216,
       125, 221, 171, 128, 178, 182, 170,  65, 200, 189, 191,  87, 217,
       134, 133, 140, 154, 163, 136,  90, 112,  52,  43,  33,  28,  11,
        19,  18,  51,  17,  15, 199,  14,  13,  54, 208, 197, 213,  29,
        56,  57, 214, 215, 219, 196,  61, 220,  62,  63,  64,  49,  48,
        47,  20,  27,  30,  31,  32,  26, 195,  25,  24,  34, 206,  23,
        36, 205, 204,  39, 203,  41,  42,  22,  44,  21, 201,  46, 162,
        80,  67, 165, 117, 119, 121, 122, 172, 124, 126, 129, 130, 131,
       135, 168, 138, 139, 142,  68, 143, 164, 146, 147, 148, 14

In [69]:
# Full column ranking (highest centroid weight first) for cluster 1
order_centroids[1]

array([162,  41,  74,  13, 201,  42, 177,  49, 165,  33,  72,  67, 215,
       203, 122, 222, 135, 106,  93,  48,  39,  31, 149, 197, 126,  36,
        94,  32, 103,  26, 148, 198,  27, 102, 104, 213,  51, 168,  64,
       143,  25, 146, 116,  73, 110, 172,  78, 142, 156,  80, 157, 193,
       187,  47, 117, 164, 183,  23, 132,  62, 219, 204,  14,  19,  54,
       190,  55,  90, 137,  43,  52,  81,   6,  79,   7,   8,  77,  46,
         5,  75,   9,  10,  34,  11,  12,  76,  82,  70,   4,   3,   2,
        83,  84,  85,  86,  87,  88,  89,  38,   1,  71,  69,  30,  58,
        22,  50,  45,  21,  20,  44,  53,  24,  18,  28,  56,  57,  59,
        91,  17,  60,  61,  29,  37,  63,  16,  65,  66,  15,  40,  68,
        35, 111,  92, 189, 186, 185, 184, 182, 181, 180, 179, 178, 176,
       175, 174, 173, 171, 170, 169, 167, 166, 163, 161, 188, 191, 159,
       192, 220, 218, 217, 216, 214, 212, 211, 210, 209, 208, 207, 206,
       205, 202, 200, 199, 196, 195, 194, 160, 158,  95, 124, 12

## test 3 --- Good one

In [95]:
# Vocabulary terms of the fitted vectorizer, later used as DataFrame column names.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when it exists and fall back on older releases.
# .tolist() keeps `features_all` a plain list of Python strings in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    features_all = vectorizer.get_feature_names_out().tolist()
else:
    features_all = vectorizer.get_feature_names()

In [98]:
# Sanity-check the feature-name container: its type, its length and the first ten names
print(type(features_all))
print(len(features_all))
print(features_all[:10])

<class 'list'>
223
['ability', 'access', 'action', 'advance', 'advanced', 'advancement', 'agent', 'ai', 'allows', 'analytical']


In [100]:
#create data frame for saving it
import pandas as pd

In [117]:
# Empty frame with one column per vocabulary term; later cells add the rows
df = pd.DataFrame(columns=features_all)
df

Unnamed: 0,ability,access,action,advance,advanced,advancement,agent,ai,allows,analytical,...,topic,toss,transformation,truly,uncharged,understand,unit,using,verifiable,wanted


In [118]:
# Centroid coordinates of the fitted K-means model (one row per cluster)
cluster_coordinates = modelkmeans.cluster_centers_
cluster_coordinates.shape

(3, 223)

In [119]:
#cluster_coordinates[0]

In [120]:
# Store each cluster centroid as a row of df, using the cluster index as the row label.
for row_num, centroid in enumerate(cluster_coordinates):
    df.loc[row_num] = centroid

In [121]:
# Display the frame after the centroid rows have been added
df

Unnamed: 0,ability,access,action,advance,advanced,advancement,agent,ai,allows,analytical,...,topic,toss,transformation,truly,uncharged,understand,unit,using,verifiable,wanted
0,0.026667,0.022093,0.056713,0.022093,0.022093,0.026473,0.056713,0.105568,0.02828,0.022192,...,0.0,0.0,0.0,0.021861,0.020859,0.026667,0.0,0.0,0.021861,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.032521,0.0,0.04367,0.0,0.0,0.0,0.030275,0.0,0.0,0.039948
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.032034,0.0,0.0,0.0,0.0,0.0,0.031332,0.0,0.0


In [123]:
# Current number of rows in df; the next cell uses it as the first row label for the test vectors
test_start_index = df.shape[0]
test_start_index

3

In [124]:
# Append the dense test-sentence vectors to df, labelling the rows from
# test_start_index onwards so they sit after the rows already present.
for row_label, test_vector in enumerate(tarray, start=test_start_index):
    df.loc[row_label] = test_vector

In [125]:
# Display the frame after the test-sentence rows have been added
df

Unnamed: 0,ability,access,action,advance,advanced,advancement,agent,ai,allows,analytical,...,topic,toss,transformation,truly,uncharged,understand,unit,using,verifiable,wanted
0,0.026667,0.022093,0.056713,0.022093,0.022093,0.026473,0.056713,0.105568,0.02828,0.022192,...,0.0,0.0,0.0,0.021861,0.020859,0.026667,0.0,0.0,0.021861,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.032521,0.0,0.04367,0.0,0.0,0.0,0.030275,0.0,0.0,0.039948
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.032034,0.0,0.0,0.0,0.0,0.0,0.031332,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [135]:
# Row labelled 0 of df, i.e. the centroid of cluster 0, as a Series indexed by term
single_prediction_result = df.loc[0]

In [161]:
# The five terms with the largest weight in the selected row
single_prediction_result.sort_values(ascending=False)[:5]

ai              0.105568
intelligence    0.102370
machine         0.098083
intelligent     0.074862
success         0.069901
Name: 0, dtype: float64

### good one test ends here

In [140]:
def greater_vals(val):
    """Return ``val`` when it is strictly positive, otherwise ``None``."""
    return val if val > 0 else None

In [143]:
# Keep only the strictly positive weights (the rest become missing values),
# then flag which terms still have a value.
greaters = single_prediction_result.apply(greater_vals)
pd.notna(greaters)

ability            True
access             True
action             True
advance            True
advanced           True
advancement        True
agent              True
ai                 True
allows             True
analytical         True
animal             True
apiece            False
artificial         True
atom              False
atomic            False
attempting        False
autonomous         True
bail              False
ball              False
basic             False
bat               False
batsman           False
batting           False
bear              False
begin             False
behaviour         False
biology           False
bond              False
boundary          False
bowled            False
                  ...  
state             False
statistical        True
strike            False
striker           False
structure         False
study              True
stump             False
subproblems        True
substance         False
success            True
suitable        