In [67]:
# make vocabulary

import pandas as pd
import numpy as np
from collections import Counter
import pickle


In [35]:
def save_obj(obj,name):

    with open ('data/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
        
def load_obj (name):
    
    with open ('data/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)
        

In [7]:
df = pd.read_csv('data/data_preprocessed.csv', usecols=['description'])


In [207]:
def make_vocabulary(df):
    
    # vocabulary dict
    voc = {}
    
    # term_id initialized to 0
    cnt = 0
    
    for i in range(len(df)):
        
        # get the list 
        word_list = set(df.iloc[i].description.split())
        
        for word in word_list:
            if word not in voc:
                voc[word] = cnt 
                cnt += 1
                
    save_obj(voc,'vocabulary')
    
    return
        

In [264]:
df = pd.read_csv('data/data_preprocessed.csv', usecols=['description'])

In [265]:
make_vocabulary(df)

In [266]:
voc = load_obj('vocabulary')

In [267]:
from math import log
from time import time

In [270]:
start = time()
# make tfidf numpy matrix

# initialize a numpy matrix to zero
n = len(df) # number of announcements
tot_words = len(voc) # number of words
s = (n, tot_words) # nrow and ncol
TF_matrix = np.zeros(s)

# call the vocabulary
voc = load_obj('vocabulary')

# words in all the announcements (with repetitions, max 1 word for each announcement)
total_words_occurrencies = []

# for each announcement
for i in range(len(df)):
    
    # empty words_list
    words_list = []
    # create the list from the description (stemmed) string
    words_list = (df.iloc[i].description).split()
    
    # map all the words to their vocabulary term_id  
    words_list = list(map(lambda x: voc[x], words_list))
    
    # the term_id is the index pointing the columns of TF matrix, IDF array and TFIDF matrix
    # use counter to count the TF with a dictionary
    counter_dict = Counter(words_list)
    
    # for each term_id in the dict, put into the current announcement (row(i))
    # the values of the occurrencies pointed by the array
    for term_id in counter_dict.keys():
        TF_matrix[i][term_id] = counter_dict[term_id]
    
    # set the words (max 1 word for each announcement!) and add it the main list
    total_words_occurrencies += list(set(words_list))

# Count the occurrencies 
counter_dict = Counter(total_words_occurrencies)

# initialize IDF_array
IDF_array = np.zeros(tot_words)

# for each term_id, fill IDF_array[term_id] with its occurrencies in counter_dict[term_id]
for term_id in counter_dict.keys():
    IDF_array[term_id] = log(n/counter_dict[term_id])

# broadcast operation: we can multiply each TFIDF row element wise with IDF_array
TFIDF_matrix = TF_matrix*IDF_array


end = time()
print(end-start)

6.712216138839722


In [271]:
TFIDF_matrix

array([[ 4.83628191, 42.74926987,  0.90182941, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.90182941, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  9.24300115,
         9.24300115,  0.        ],
       [ 0.        ,  0.        ,  1.80365881, ...,  0.        ,
         0.        , 18.48600231]])

In [246]:
len(TFIDF_matrix)

10332

In [248]:
len(TFIDF_matrix[2])

25040

In [None]:
TFIDF_matrix

## Copy of the same code above with only few modifies, just to understand

In [249]:
# EXAMPLE TO SHOW
df = pd.DataFrame(columns=['description'], data = [['hello my name'],['my Michele my']])
df

Unnamed: 0,description
0,hello my name
1,my Michele my


In [250]:
make_vocabulary(df)

In [251]:
voc = load_obj('vocabulary')

In [252]:
voc

{'name': 0, 'hello': 1, 'my': 2, 'Michele': 3}

In [263]:
# make tfidf numpy matrix

# initialize a numpy matrix to zero
n = len(df) # number of announcements
tot_words = len(voc) # number of words
s = (n, tot_words) # nrow and ncol
TF_matrix = np.zeros(s)

# call the vocabulary
voc = load_obj('vocabulary')

# words in all the announcements (with repetitions, max 1 word for each announcement)
total_words_occurrencies = []

# for each announcement
for i in range(len(df)):
    
    # empty words_list
    words_list = []
    # create the list from the description (stemmed) string
    words_list = (df.iloc[i].description).split()
    
    # map all the words to their vocabulary term_id  
    words_list = list(map(lambda x: voc[x], words_list))
    
    # the term_id is the index pointing the columns of TF matrix, IDF array and TFIDF matrix
    # use counter to count the TF with a dictionary
    counter_dict = Counter(words_list)
    
    # for each term_id in the dict, put into the current announcement (row(i))
    # the values of the occurrencies pointed by the array
    for term_id in counter_dict.keys():
        TF_matrix[i][term_id] = counter_dict[term_id]
    
    # set the words (max 1 word for each announcement!) and add it the main list
    total_words_occurrencies += list(set(words_list))

# Count the occurrencies 
counter_dict = Counter(total_words_occurrencies)

# initialize IDF_array
IDF_array = np.zeros(tot_words)
IDF_only_occurrencies_in_all_documents = np.zeros(tot_words) # JUST TO SHOW!!!


# for each term_id, fill IDF_array[term_id] with its occurrencies in counter_dict[term_id]
for term_id in counter_dict.keys():
    IDF_array[term_id] = log(n/counter_dict[term_id])
    IDF_only_occurrencies_in_all_documents[term_id] = counter_dict[term_id]
# broadcast operation: we can multiply each TFIDF row element wise with IDF_array
TFIDF_matrix = TF_matrix*IDF_array




In [257]:
df

Unnamed: 0,description
0,hello my name
1,my Michele my


In [261]:
list(voc.keys())

['name', 'hello', 'my', 'Michele']

In [258]:
TF_matrix

array([[1., 1., 1., 0.],
       [0., 0., 2., 1.]])

In [259]:
# variable created only to show how it works
IDF_only_occurrencies_in_all_documents

array([1., 1., 2., 1.])

In [236]:
# to show how broadcast works (and that we can do TF*IDF)
TFIDF_fake_matrix = TF_matrix*IDF_only_occurrencies_in_all_documents 
TFIDF_fake_matrix

array([[1., 2., 1., 0.],
       [0., 4., 0., 1.]])

In [204]:
IDF_array #real IDF obtained with log(n/occurrencies in all documents)

array([0.69314718, 0.        , 0.69314718, 0.69314718])

In [237]:
TFIDF_matrix # real TFIDF obtained with real algorithm

array([[0.69314718, 0.        , 0.69314718, 0.        ],
       [0.        , 0.        , 0.        , 0.69314718]])

In [272]:
pd.DataFrame(TFIDF_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25030,25031,25032,25033,25034,25035,25036,25037,25038,25039
0,4.836282,42.74927,0.901829,2.426265,4.812184,3.234188,2.792531,3.949696,1.953391,4.520277,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.00000,0.000000,2.426265,0.000000,0.000000,0.000000,3.949696,1.953391,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.953391,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.953391,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [273]:
# Import required packages
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt



In [275]:
tfidf_dataframe = pd.DataFrame(TFIDF_matrix)

In [277]:
mms = MinMaxScaler()
mms.fit(tfidf_dataframe)
data_transformed = mms.transform(tfidf_dataframe)

In [None]:
start = time()
Sum_of_squared_distances = []
K = range(1,500)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(data_transformed)
    Sum_of_squared_distances.append(km.inertia_)
end = time()
print(end-start)

In [None]:
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.grid()
plt.show()