In [107]:
import pandas as pd
import numpy as np
import re
import nltk
from collections import Counter
import scipy.sparse as sp
from numpy.linalg import norm
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bravo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [108]:
corpus = [
   "So there is no way for me to plug it in here in the US unless I go by a converter.",
   "Good case, Excellent value.",
   "Great for the jawbone.",
   "Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!",
   "The mic is great.",
   "I have to jiggle the plug to get it to line up right to get decent volume.",
   "If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.",
   "If you are Razr owner...you must have this!",
   "Needless to say, I wasted my money.",
   "What a waste of money and time!.",
]


In [109]:
cleaned_corpus = [re.sub(r'[^a-zA-Z0-9\s]', '', sentence) for sentence in corpus] # Menghapus semua karakter non-alphanumeric dari setiap string dalam corpus.
cleaned_corpus = [sentence.lower().strip() for sentence in cleaned_corpus] # Menghilangkan Whitespace dan menjadikan teks lowercase.

for i in cleaned_corpus :
    print(i)

so there is no way for me to plug it in here in the us unless i go by a converter
good case excellent value
great for the jawbone
tied to charger for conversations lasting more than 45 minutesmajor problems
the mic is great
i have to jiggle the plug to get it to line up right to get decent volume
if you have several dozen or several hundred contacts then imagine the fun of sending each of them one by one
if you are razr owneryou must have this
needless to say i wasted my money
what a waste of money and time


In [110]:
stemmer = PorterStemmer()

import nltk
nltk.download('punkt')

stemmed_corpus = []
for sentence in corpus:
    words = word_tokenize(sentence)
    stemmed_words = [stemmer.stem(word) for word in words]
    stemmed_sentence = ' '.join(stemmed_words)
    stemmed_corpus.append(stemmed_sentence)

for i, stemmed_sentence in enumerate(stemmed_corpus):
    print(f"Sentence {i+1}: {stemmed_sentence}")

Sentence 1: so there is no way for me to plug it in here in the us unless i go by a convert .
Sentence 2: good case , excel valu .
Sentence 3: great for the jawbon .
Sentence 4: tie to charger for convers last more than 45 minutes.major problem ! !
Sentence 5: the mic is great .
Sentence 6: i have to jiggl the plug to get it to line up right to get decent volum .
Sentence 7: if you have sever dozen or sever hundr contact , then imagin the fun of send each of them one by one .
Sentence 8: if you are razr owner ... you must have thi !
Sentence 9: needless to say , i wast my money .
Sentence 10: what a wast of money and time ! .


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bravo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [111]:
tokens = [nltk.word_tokenize(sentence) for sentence in cleaned_corpus]
for i in tokens :
    print(i)

['so', 'there', 'is', 'no', 'way', 'for', 'me', 'to', 'plug', 'it', 'in', 'here', 'in', 'the', 'us', 'unless', 'i', 'go', 'by', 'a', 'converter']
['good', 'case', 'excellent', 'value']
['great', 'for', 'the', 'jawbone']
['tied', 'to', 'charger', 'for', 'conversations', 'lasting', 'more', 'than', '45', 'minutesmajor', 'problems']
['the', 'mic', 'is', 'great']
['i', 'have', 'to', 'jiggle', 'the', 'plug', 'to', 'get', 'it', 'to', 'line', 'up', 'right', 'to', 'get', 'decent', 'volume']
['if', 'you', 'have', 'several', 'dozen', 'or', 'several', 'hundred', 'contacts', 'then', 'imagine', 'the', 'fun', 'of', 'sending', 'each', 'of', 'them', 'one', 'by', 'one']
['if', 'you', 'are', 'razr', 'owneryou', 'must', 'have', 'this']
['needless', 'to', 'say', 'i', 'wasted', 'my', 'money']
['what', 'a', 'waste', 'of', 'money', 'and', 'time']


In [112]:
stop_words = nltk.corpus.stopwords.words('english')
filtered_tokens = [[word for word in sentence if word.lower() not in stop_words] for sentence in tokens]

filtered_text = [' '.join(sentence) for sentence in filtered_tokens]

for i in filtered_text :
    print(i)

way plug us unless go converter
good case excellent value
great jawbone
tied charger conversations lasting 45 minutesmajor problems
mic great
jiggle plug get line right get decent volume
several dozen several hundred contacts imagine fun sending one one
razr owneryou must
needless say wasted money
waste money time


In [113]:
class TFIDF(object):

    def __init__(self, corpus):
        self.corpus = corpus
        self.norm_corpus  = None

    def __normalize_corpus(self, d):
        stop_words = nltk.corpus.stopwords.words('english')
        d = re.sub(r'[^a-zA-Z0-9\s]', '', d, re.I|re.A)
        d = d.lower().strip()
        tks = nltk.word_tokenize(d)
        f_tks = [t for t in tks if t not in stop_words]
        return ' '.join(f_tks)

    def preprocessing_text(self):
        n_c = np.vectorize(self.__normalize_corpus)
        self.norm_corpus = n_c(self.corpus)

    def tf(self):
        words_array = [doc.split() for doc in self.norm_corpus]
        words = list(set([word for words in words_array for word in words]))
        features_dict = {w:0 for w in words}
        tf = []
        for doc in self.norm_corpus:
            bowf_doc = Counter(doc.split())
            all_f = Counter(features_dict)
            bowf_doc.update(all_f)
            tf.append(bowf_doc)
        return pd.DataFrame(tf)

    def df(self, tf):
        features_names = list(tf.columns)
        df = np.diff(sp.csc_matrix(tf, copy=True).indptr)
        return df

    def idf(self, df):
        N = 1 + len(self.norm_corpus)
        idf = (1.0 + np.log(float(N) / df))
        idf_d = sp.spdiags(idf, diags= 0, m=len(df), n= len(df)).todense()
        return idf, idf_d

    def tfidf(self, tf, idf):
        tf = np.array(tf, dtype='float64')
        print("TF : ", tf)
        print("IDF : ", idf)
        tfidf = tf * idf
        print("TFIDF : ", tfidf)
        norms = norm(tfidf , axis=1)
        print("Normal : ", norms)
        return (tfidf / norms[:,None])

In [114]:
obj = TFIDF(corpus)
obj.preprocessing_text()

tf = obj.tf()

In [115]:
df = obj.df(tf)
df = pd.DataFrame(df, index=tf.columns, columns=['df'])

In [116]:
from tabulate import tabulate

# Data TF-IDF
data = [
    ["no", 3.397895273, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    ["good", 0, 3.397895273, 0, 0, 0, 0, 0, 0, 0, 0],
    ["great", 0, 0, 3.48490665, 0, 3.48490665, 0, 0, 0, 0, 0],
    ["tied", 0, 0, 0, 3.397895273, 0, 0, 0, 0, 0, 0],
    ["mic", 0, 0, 0, 0, 3.397895273, 0, 0, 0, 0, 0],
    ["jiggle", 0, 0, 0, 0, 0, 3.397895273, 0, 0, 0, 0],
    ["several", 0, 0, 0, 0, 0, 0, 6.9698133, 0, 0, 0],
    ["razr", 0, 0, 0, 0, 0, 0, 0, 3.397895273, 0, 0],
    ["needless", 0, 0, 0, 0, 0, 0, 0, 0, 3.397895273, 0],
    ["what", 0, 0, 0, 0, 0, 0, 0, 0, 0, 3.397895273],
    ["way", 3.397895273, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    ["case", 0, 3.397895273, 0, 0, 0, 0, 0, 0, 0, 0],
    ["jawbone", 0, 0, 3.397895273, 0, 0, 0, 0, 0, 0, 0],
    ["charger", 0, 0, 0, 3.397895273, 0, 0, 0, 0, 0, 0],
    ["plug", 3.48490665, 0, 0, 0, 0, 3.48490665, 0, 0, 0, 0],
    ["dozen", 0, 0, 0, 0, 0, 0, 3.397895273, 0, 0, 0],
    ["owner", 0, 0, 0, 0, 0, 0, 0, 3.397895273, 0, 0],
    ["say", 0, 0, 0, 0, 0, 0, 0, 0, 3.397895273, 0],
    ["waste", 0, 0, 0, 0, 0, 0, 0, 0, 3.48490665, 3.48490665],
    ["excellent", 0, 3.397895273, 0, 0, 0, 0, 0, 0, 0, 0],
    ["conversations", 0, 0, 0, 3.397895273, 0, 0, 0, 0, 0, 0],
    ["get", 0, 0, 0, 0, 0, 6.9698133, 0, 0, 0, 0],
    ["must", 0, 0, 0, 0, 0, 0, 0, 3.397895273, 0, 0],
    ["wasted", 0, 0, 0, 0, 0, 0, 0, 0, 3.397895273, 0],
    ["money", 0, 0, 0, 0, 0, 0, 0, 0, 3.48490665, 3.48490665],
    ["here", 3.397895273, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    ["value", 0, 3.397895273, 0, 0, 0, 0, 0, 0, 0, 0],
    ["lasting", 0, 0, 0, 3.397895273, 0, 0, 0, 0, 0, 0],
    ["line", 0, 0, 0, 0, 0, 3.397895273, 0, 0, 0, 0],
    ["hundred", 0, 0, 0, 0, 0, 0, 3.397895273, 0, 0, 0],
    ["and", 0, 0, 0, 0, 0, 0, 0, 0, 0, 3.397895273],
    ["us", 3.48490665, 0, 0, 0, 0, 0, 0, 3.48490665, 0, 0],
    ["more", 0, 0, 0, 3.397895273, 0, 0, 0, 0, 0, 0],
    ["right", 0, 0, 0, 0, 0, 3.397895273, 0, 0, 0, 0],
    ["contacts", 0, 0, 0, 0, 0, 0, 3.397895273, 0, 0, 0],
    ["time", 0, 0, 0, 0, 0, 0, 0, 0, 0, 3.397895273],
    ["unless", 3.397895273, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    ["minutes", 0, 0, 0, 3.397895273, 0, 0, 0, 0, 0, 0],
    ["imagine", 0, 0, 0, 0, 0, 0, 3.397895273, 0, 0, 0],
    ["go", 3.48490665, 3.48490665, 0, 0, 0, 0, 0, 0, 0, 0],
    ["major", 0, 0, 0, 3.397895273, 0, 0, 0, 0, 0, 0],
    ["decent", 0, 0, 0, 0, 0, 3.397895273, 0, 0, 0, 0],
    ["fun", 0, 0, 0, 0, 0, 0, 3.397895273, 0, 0, 0],
    ["converter", 3.397895273, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    ["problems", 0, 0, 0, 3.397895273, 0, 0, 0, 0, 0, 0],
    ["volume", 0, 0, 0, 0, 0, 3.397895273, 0, 0, 0, 0],
    ["sending", 0, 0, 0, 0, 0, 0, 3.397895273, 0, 0, 0],
    ["one", 0, 0, 3.708050201, 0, 0, 0, 7.416100402, 0, 3.708050201, 3.708050201],
]

# Headers
headers = ["Term", "Doc 1", "Doc 2", "Doc 3", "Doc 4", "Doc 5", "Doc 6", "Doc 7", "Doc 8", "Doc 9", "Doc 10"]

# Mencetak dengan tabulate
print(tabulate(data, headers=headers, tablefmt="grid"))


+---------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+
| Term          |   Doc 1 |   Doc 2 |   Doc 3 |   Doc 4 |   Doc 5 |   Doc 6 |   Doc 7 |   Doc 8 |   Doc 9 |   Doc 10 |
| no            | 3.3979  | 0       | 0       |  0      | 0       | 0       | 0       | 0       | 0       |  0       |
+---------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+
| good          | 0       | 3.3979  | 0       |  0      | 0       | 0       | 0       | 0       | 0       |  0       |
+---------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+
| great         | 0       | 0       | 3.48491 |  0      | 3.48491 | 0       | 0       | 0       | 0       |  0       |
+---------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+
| tied          | 0       | 0       | 0       | 

In [117]:
from tabulate import tabulate

# Data Covariant
data_covariant = [
    ["Doc", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10"],
    ["Doc 1", 3.503173839, -4.831682224, 3.503173839, -4.831682224, 3.503173839, -4.831682224, 3.503173839, -4.831682224, 3.503173839, -4.831682224],
    ["Doc 2", -7.939217015, -6.05379296, -7.939217015, -6.05379296, -7.939217015, -6.05379296, -7.939217015, -6.05379296, -7.939217015, -6.05379296],
    ["Doc 3", -7.939217015, -6.05379296, -7.939217015, -6.05379296, -7.939217015, -6.05379296, -7.939217015, -6.05379296, -7.939217015, -6.05379296],
    ["Doc 4", -9.399149719, -7.167017389, -9.399149719, -7.167017389, -9.399149719, -7.167017389, -9.399149719, -7.167017389, -9.399149719, -7.167017389],
    ["Doc 5", -4.382010785, -3.341360489, -4.382010785, -3.341360489, -4.382010785, -3.341360489, -4.382010785, -3.341360489, -4.382010785, -3.341360489],
    ["Doc 6", -1.85636371, -6.26240726, -1.85636371, -6.26240726, -1.85636371, -6.26240726, -1.85636371, -6.26240726, -1.85636371, -6.26240726],
    ["Doc 7", -8.680958066, -6.619383591, -8.680958066, -6.619383591, -8.680958066, -6.619383591, -8.680958066, -6.619383591, -8.680958066, -6.619383591],
    ["Doc 8", 3.503173839, -4.831682224, 3.503173839, -4.831682224, 3.503173839, -4.831682224, 3.503173839, -4.831682224, 3.503173839, -4.831682224],
    ["Doc 9", -7.939217015, -6.05379296, -7.939217015, -6.05379296, -7.939217015, -6.05379296, -7.939217015, -6.05379296, -7.939217015, -6.05379296],
    ["Doc 10", -7.939217015, -6.05379296, -7.939217015, -6.05379296, -7.939217015, -6.05379296, -7.939217015, -6.05379296, -7.939217015, -6.05379296]
]

# Mencetak dengan tabulate
print(tabulate(data_covariant, headers="firstrow", tablefmt="grid"))


+--------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
| Doc    |      PC1 |      PC2 |      PC3 |      PC4 |      PC5 |      PC6 |      PC7 |      PC8 |      PC9 |     PC10 |
| Doc 1  |  3.50317 | -4.83168 |  3.50317 | -4.83168 |  3.50317 | -4.83168 |  3.50317 | -4.83168 |  3.50317 | -4.83168 |
+--------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
| Doc 2  | -7.93922 | -6.05379 | -7.93922 | -6.05379 | -7.93922 | -6.05379 | -7.93922 | -6.05379 | -7.93922 | -6.05379 |
+--------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
| Doc 3  | -7.93922 | -6.05379 | -7.93922 | -6.05379 | -7.93922 | -6.05379 | -7.93922 | -6.05379 | -7.93922 | -6.05379 |
+--------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
| Doc 4  | -9.39915 | -7.16702 |

In [None]:
from tabulate import tabulate

# Data Term Frequency
data_tf = [
    ["Term", "Doc 1", "Doc 2", "Doc 3", "Doc 4", "Doc 5", "Doc 6", "Doc 7", "Doc 8", "Doc 9", "Doc 10", "Value"],
    ["no", 3.397895273, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.074508831],
    ["good", 0, 3.397895273, 0, 0, 0, 0, 0, 0, 0, 0, 1.074508831],
    ["tied", 0, 0, 0, 3.397895273, 0, 0, 0, 0, 0, 0, 1.074508831],
    ["mic", 0, 0, 0, 0, 3.397895273, 0, 0, 0, 0, 0, 1.074508831],
    ["jiggle", 0, 0, 0, 0, 0, 3.397895273, 0, 0, 0, 0, 1.074508831]
]

# Mencetak dengan tabulate
print(tabulate(data_tf, headers="firstrow", tablefmt="grid"))
