# scikit-learn's TF-IDF vectorizer and KMeans clustering to analyze a set of documents and cluster them into two groups

In [1]:
document = ['This is the most beautiful place in the world.',
            'This man has more skills to show in cricket than any other game.',
            'Hi there! how was your ladakh trip last month?',
            'There was a player who had scored 200+ runs in single cricket innings in his career.',
            'I have got the opportunity to travel to Paris next year for my internship.',
            'May be he is better than you in batting but you are much better than him in bowling.',
            'That was really a great day for me when I was there at Lavasa for the whole night.',
            'That’s exactly I wanted to become, a highest ratting batsmen ever with top scores.',
            'Does it really matter wether you go to Thailand or Goa, its just you have spend your holidays.',
            'Why don’t you go to Switzerland next year for your 25th Wedding anniversary?',
            'Travel is fatal to prejudice, bigotry, and narrow mindedness., and many of our people need it sorely on these accounts.',
            'Stop worrying about the potholes in the road and enjoy the journey.',
            'No cricket team in the world depends on one or two players. The team always plays to win.',
            'Cricket is a team game. If you want fame for yourself, go play an individual game.',
            'Because in the end, you won’t remember the time you spent working in the office or mowing your lawn. Climb that goddamn mountain.',
            'Isn’t cricket supposed to be a team sport? I feel people should decide first whether cricket is a team game or an individual sport.']

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

In [3]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(document)

In [4]:
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=1)
model.fit(X)

In [6]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
terms

array(['200', '25th', 'accounts', 'anniversary', 'batsmen', 'batting',
       'beautiful', 'better', 'bigotry', 'bowling', 'career', 'climb',
       'cricket', 'day', 'decide', 'depends', 'does', 'don', 'end',
       'enjoy', 'exactly', 'fame', 'fatal', 'feel', 'game', 'goa',
       'goddamn', 'got', 'great', 'hi', 'highest', 'holidays',
       'individual', 'innings', 'internship', 'isn', 'journey', 'just',
       'ladakh', 'lavasa', 'lawn', 'man', 'matter', 'mindedness', 'month',
       'mountain', 'mowing', 'narrow', 'need', 'night', 'office',
       'opportunity', 'paris', 'people', 'place', 'play', 'player',
       'players', 'plays', 'potholes', 'prejudice', 'ratting', 'really',
       'remember', 'road', 'runs', 'scored', 'scores', 'single', 'skills',
       'sorely', 'spend', 'spent', 'sport', 'stop', 'supposed',
       'switzerland', 'team', 'thailand', 'time', 'travel', 'trip',
       'want', 'wanted', 'wedding', 'wether', 'win', 'won', 'working',
       'world', 'worrying', 

In [7]:
for i in range(true_k):
    print('Cluster %d:' % i),
    for ind in order_centroids[i, :10]:
        print(' %s'%terms[ind])
    print('----------------------')

Cluster 0:
 scores
 exactly
 batsmen
 ratting
 wanted
 highest
 holidays
 game
 goa
 goddamn
----------------------
Cluster 1:
 cricket
 team
 game
 world
 better
 year
 really
 travel
 place
 beautiful
----------------------


In [8]:
print('Prediction')
X = vectorizer.transform(['Nothing is easy in cricket.it`s a complicated sport game'])
predicted = model.predict(X)
print(predicted)

Prediction
[1]


In [9]:
print('Prediction')
X = vectorizer.transform(['my travel to india was awful last month'])
predicted = model.predict(X)
print(predicted)

Prediction
[1]
