# TF VS TF-IDF GLOVE

#12S18022 Alex Conro Manuel
#12S18035 Angeline Naomi Christina Sinaga 

**Load Dataset**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/spam (1).csv", encoding="latin-1")

dataset = dataset.dropna(how="any", axis=1)
dataset.columns = ['target', 'message']

dataset.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Count Message Len 

In [None]:
dataset['message_len'] = dataset['message'].apply(lambda x: len(x.split(' ')))
dataset.head()

Unnamed: 0,target,message,message_len
0,ham,"Go until jurong point, crazy.. Available only ...",20
1,ham,Ok lar... Joking wif u oni...,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28
3,ham,U dun say so early hor... U c already then say...,11
4,ham,"Nah I don't think he goes to usf, he lives aro...",13


In [None]:
max(dataset['message_len'])

171

**Exploratory Data Analysis**

In [None]:
balance_counts = dataset.groupby('target')['target'].agg('count').values
balance_counts

array([4825,  747])

In [None]:
import plotly.graph_objs as go
fig = go.Figure()
fig.add_trace(go.Bar(
    x=['ham'],
    y=[balance_counts[0]],
    name='ham',
    text=[balance_counts[0]],
    textposition='auto',
   
))
fig.add_trace(go.Bar(
    x=['spam'],
    y=[balance_counts[1]],
    name='spam',
    text=[balance_counts[1]],
    textposition='auto',
    
))
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Dataset distribution by target</span>'
)
fig.show()

In [None]:
ham_dataset = dataset[dataset['target'] == 'ham']['message_len'].value_counts().sort_index()
spam_dataset = dataset[dataset['target'] == 'spam']['message_len'].value_counts().sort_index()

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=ham_dataset.index,
    y=ham_dataset.values,
    name='ham',
    fill='tozeroy',
    
))
fig.add_trace(go.Scatter(
    x=spam_dataset.index,
    y=spam_dataset.values,
    name='spam',
    fill='tozeroy',
    
))
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Data Roles in Different Fields</span>'
)
fig.update_xaxes(range=[0, 70])
fig.show()

**Data Preprocessing**

Case Folding

In [None]:

def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [None]:
import re
import string
dataset['message_clean'] = dataset['message'].apply(clean_text)
dataset.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entry in a wkly comp to win fa cup final...
3,ham,U dun say so early hor... U c already then say...,11,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah i dont think he goes to usf he lives aroun...


**Tokenisasi**

In [None]:
def tokenize(message_clean):
    token = nltk.word_tokenize(message_clean)
    return token

In [None]:

import nltk
nltk.download('punkt')
dataset['tokenized'] = dataset['message_clean'].apply(tokenize)

print('Tokenizing Result : \n') 
print(dataset['tokenized'].head())
print('\n\n\n')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Tokenizing Result : 

0    [go, jurong, point, crazi, avail, bugi, n, gre...
1                            [ok, lar, joke, wif, oni]
2    [free, entri, wkli, comp, win, fa, cup, final,...
3                 [dun, say, earli, hor, alreadi, say]
4    [nah, dont, think, goe, usf, live, around, tho...
Name: tokenized, dtype: object






Stopwords

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def remove_stop_words(message_clean):
    removed_stop_words = []
    for review in message_clean:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in stop_words])
        )
    return removed_stop_words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
dataset['stopwords_removed'] = dataset['tokenized'].apply(remove_stop_words)

print('Stopwords Result : \n') 
print(dataset['stopwords_removed'].head())
print('\n\n\n')

Stopwords Result : 

0    [go, jurong, point, crazi, avail, bugi, n, gre...
1                            [ok, lar, joke, wif, oni]
2    [free, entri, wkli, comp, win, fa, cup, final,...
3                 [dun, say, earli, hor, alreadi, say]
4    [nah, dont, think, goe, usf, live, around, tho...
Name: stopwords_removed, dtype: object






**Stemming**

In [None]:
# create stemmer
from nltk.stem.porter import PorterStemmer
    
stemmer = PorterStemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in dataset['stopwords_removed']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document] 



[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m
cheap : cheap
perhap : perhap
thatû÷ : thatû÷
silli : silli
isnû÷t : isnû÷t
uv : uv
mutat : mutat
sunscreen : sunscreen
essenti : essenti
theseday : theseday
lunchyou : lunchyou
onlinewhi : onlinewhi
pic : pic
aiyo : aiyo
fast : fast
workin : workin
bao : bao
sugardad : sugardad
ahge : ahg
meim : meim
browni : browni
geeeee : geeeee
bare : bare
ninish : ninish
icki : icki
american : american
freek : freek
callin : callin
jen : jen
oooh : oooh
ridden : ridden
ey : ey
gym : gym
whatev : whatev
daddi : daddi
dick : dick
missi : missi
yar : yar
mum : mum
sch : sch
clean : clean
lab : lab
goggl : goggl
door : door
arngd : arngd
marriag : marriag
walkin : walkin
unfortunt : unfortunt
snake : snake
bite : bite
danc : danc
frnt : frnt
sayin : sayin
izzit : izzit
textand : textand
exwif : exwif
kid : kid
jjc : jjc
tendenc : tendenc
headach : headach
jazz : jazz
yogasana : yogasana
em : em
meiv : meiv
gotani : gotani
sr

In [None]:
dataset['message_clean'] = dataset['message_clean'].apply(stemm_text)
dataset.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entri wkli comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,11,dun say earli hor alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goe usf live around though


In [None]:
pip install swifter



In [None]:
dataset.to_csv('Preprocessing12.csv', index=False)

**Feature Extraction (TF-IDF)**

Vectorization

In [None]:
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
x = dataset['message_clean']
y = dataset['target']


print(len(x), len(y))

5572 5572


In [None]:
# Split into train and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y,  test_size=0.2)
x_train

4860                                    nokia phone lovli
4334    sound like manki scous boy stevelik travel da ...
4374    ur tonex subscript renew charg å£ choos  poli ...
1163    new theori argument win situat lose person don...
2770                                   problem talk later
                              ...                        
1515                                wonder youll get text
2971                    sari need tim bollox hurt lot tol
1777    buy space invad  chanc  win orig arcad game co...
441                                        suppos wake gt
3710                              sorri pa dont knw ru pa
Name: message_clean, Length: 4457, dtype: object

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# instantiate the vectorizer
vect = CountVectorizer()
vect.fit(x_train)

CountVectorizer()

In [None]:
# Use the trained to create a document-term matrix from train and test sets
x_train_dtm = vect.transform(x_train)
x_test_dtm = vect.transform(x_test)

Tunning CountVectorizer

In [None]:
vect_tunned = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.1, max_df=0.7, max_features=100)

**TF-IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()

tfidf_transformer.fit(x_train_dtm)
x_train_tfidf = tfidf_transformer.transform(x_train_dtm)

x_train_tfidf

<4457x5950 sparse matrix of type '<class 'numpy.float64'>'
	with 34834 stored elements in Compressed Sparse Row format>

**Word Embeddings: GloVe**

In [None]:
text = dataset['message_clean']
target = dataset['target']

In [76]:
def calc_TF(message_clean):
    # Counts the number of times the word appears in review
    TF_dict = {}
    for term in message_clean:
        if term in TF_dict:
            TF_dict[term] += 1
        else:
            TF_dict[term] = 1
    # Computes tf for each word
    for term in TF_dict:
        TF_dict[term] = TF_dict[term] / len(target)
    return TF_dict

dataset["TF_dict"] = dataset['message_clean'].apply(calc_TF)

dataset["TF_dict"].head()

0    {'g': 0.0008973438621679827, 'o': 0.0010768126...
1    {'o': 0.0005384063173007897, 'k': 0.0003589375...
2    {'f': 0.0007178750897343862, 'r': 0.0008973438...
3    {'d': 0.0003589375448671931, 'u': 0.0001794687...
4    {'n': 0.0007178750897343862, 'a': 0.0003589375...
Name: TF_dict, dtype: object

In [None]:
index = 90

print('%20s' % "term", "\t", "TF\n")
for key in dataset["TF_dict"][index]:
    print('%20s' % key, "\t", dataset["TF_dict"][index][key])

                term 	 TF

                   y 	 0.0003589375448671931
                   e 	 0.0005384063173007897
                   a 	 0.0005384063173007897
                   h 	 0.0007178750897343862
                     	 0.001256281407035176
                   d 	 0.0003589375448671931
                   o 	 0.0008973438621679827
                   n 	 0.0003589375448671931
                    	 0.0003589375448671931
                   û 	 0.0003589375448671931
                   ÷ 	 0.0003589375448671931
                   t 	 0.0008973438621679827
                   s 	 0.0005384063173007897
                   c 	 0.0005384063173007897
                   l 	 0.0005384063173007897
                   u 	 0.00017946877243359656
                   m 	 0.00017946877243359656


In [None]:
def calc_DF(tfDict):
    count_DF = {}
    # Run through each document's tf dictionary and increment countDict's (term, doc) pair
    for document in tfDict:
        for term in document:
            if term in count_DF:
                count_DF[term] += 1
            else:
                count_DF[term] = 1
    return count_DF

DF = calc_DF(dataset["TF_dict"])
DF

{'\r': 2,
 ' ': 5469,
 'a': 4876,
 'b': 2229,
 'c': 3525,
 'd': 3617,
 'e': 5048,
 'f': 2051,
 'g': 3156,
 'h': 3730,
 'i': 4575,
 'j': 481,
 'k': 2780,
 'l': 4245,
 'm': 3493,
 'n': 4430,
 'o': 4771,
 'p': 2989,
 'q': 193,
 'r': 4413,
 's': 4138,
 't': 4767,
 'u': 3229,
 'v': 1826,
 'w': 2879,
 'x': 891,
 'y': 2296,
 'z': 367,
 '\x89': 51,
 '\x8b': 1,
 '\x8e': 1,
 '£': 257,
 '©': 1,
 'ª': 2,
 '¬': 2,
 '´': 1,
 'á': 1,
 'â': 1,
 'ä': 1,
 'å': 288,
 'ì': 139,
 'ï': 52,
 'ð': 3,
 'ò': 8,
 'ó': 4,
 'ô': 2,
 'õ': 27,
 'ö': 1,
 '÷': 29,
 'û': 51}

In [None]:
import numpy as np
n_document = len(dataset)

def calc_IDF(__n_document, __DF):
    IDF_Dict = {}
    for term in __DF:
        IDF_Dict[term] = np.log(__n_document / (__DF[term] + 1))
    return IDF_Dict
  
#Stores the idf dictionary
IDF = calc_IDF(n_document, DF)

In [None]:
#calc TF-IDF
def calc_TF_IDF(TF):
    TF_IDF_Dict = {}
    #For each word in the review, we multiply its tf and its idf.
    for key in TF:
        TF_IDF_Dict[key] = TF[key] * IDF[key]
    return TF_IDF_Dict

#Stores the TF-IDF Series
dataset["TF-IDF_dict"] = dataset["TF_dict"].apply(calc_TF_IDF)

In [None]:
# Check TF-IDF result
index = 1

print('%20s' % "term", "\t", '%10s' % "TF", "\t", '%20s' % "TF-IDF\n")
for key in dataset["TF-IDF_dict"][index]:
    print('%20s' % key, "\t", dataset["TF_dict"][index][key] ,"\t" , dataset["TF-IDF_dict"][index][key])
    
TF_IDF = pd.DataFrame(dataset, columns=['term', 'rank'])

                term 	         TF 	              TF-IDF

                   o 	 0.0005384063173007897 	 8.34468153348293e-05
                   k 	 0.0003589375448671931 	 0.0002494413067716194
                     	 0.0007178750897343862 	 1.326305777722148e-05
                   l 	 0.00017946877243359656 	 4.8775429403534634e-05
                   a 	 0.00017946877243359656 	 2.3909508110129883e-05
                   r 	 0.00017946877243359656 	 4.181133310894016e-05
                   j 	 0.00017946877243359656 	 0.0004392615256369519
                   e 	 0.00017946877243359656 	 1.7689133532085555e-05
                   w 	 0.00017946877243359656 	 0.00011844288617546747
                   i 	 0.0003589375448671931 	 7.068514064611003e-05
                   f 	 0.00017946877243359656 	 0.00017927837914735034
                   n 	 0.00017946877243359656 	 4.1121457950503746e-05


In [None]:
# sort descending by value for DF dictionary 
sorted_DF = sorted(DF.items(), key=lambda kv: kv[1], reverse=True)[:50]

# Create a list of unique words from sorted dictionay `sorted_DF`
unique_term = [item[0] for item in sorted_DF]

def calc_TF_IDF_Vec(__TF_IDF_Dict):
    TF_IDF_vector = [0.0] * len(unique_term)

    # For each unique word, if it is in the review, store its TF-IDF value.
    for i, term in enumerate(unique_term):
        if term in __TF_IDF_Dict:
            TF_IDF_vector[i] = __TF_IDF_Dict[term]
    return TF_IDF_vector

dataset["TF_IDF_Vec"] = dataset["TF-IDF_dict"].apply(calc_TF_IDF_Vec)

print("print first row matrix TF_IDF_Vec Series\n")
print(dataset["TF_IDF_Vec"][0])

print("\nmatrix size : ", len(dataset["TF_IDF_Vec"][0]))

print first row matrix TF_IDF_Vec Series

[4.973646666458055e-05, 7.075653412834222e-05, 0.00016736655677090917, 0.0001668936306696586, 0.00013983051521402606, 0.00017671285161527506, 0.00016448583180201498, 0.00020905666554470078, 0.00014632628821060388, 0.0, 0.0, 7.75004789921852e-05, 0.00016424622108466644, 8.375930441595377e-05, 0.0002935778456656465, 0.0005098096252168922, 0.00011171584144202517, 0.00023688577235093494, 0.0, 0.0, 0.00032869794344778607, 0.0003585567582947007, 0.00020012181954804192, 0.0, 0.0004392615256369519, 0.000487693179599922, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

matrix size :  50


In [None]:
# Convert Series to List
TF_IDF_Vec_List = np.array(dataset["TF_IDF_Vec"].to_list())

# Sum element vector in axis=0 
sums = TF_IDF_Vec_List.sum(axis=0)

data = []

for col, term in enumerate(unique_term):
    data.append((term, sums[col]))
    
ranking = pd.DataFrame(data, columns=['term', 'rank'])
ranking.sort_values('rank', ascending=False)

Unnamed: 0,term,rank
16,p,0.654431
12,c,0.641874
8,l,0.636714
11,d,0.602721
14,u,0.599682
9,s,0.597854
13,m,0.58263
20,b,0.581631
21,f,0.579966
15,g,0.577308
