In [1]:
def to_lowercase(text):
    return text.lower()

# Contoh penggunaan
sample_text = "Sihabudin going to campus."
print(to_lowercase(sample_text))


sihabudin going to campus.


In [2]:
import string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Contoh penggunaan
sample_text = "Sihab, stay here plese!"
print(remove_punctuation(sample_text))


Sihab stay here plese


In [3]:
import re

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# Contoh penggunaan
sample_text = "Dzikri can speak 2 languages, Indonesia and English."
print(remove_numbers(sample_text))


Dzikri can speak  languages, Indonesia and English.


In [4]:
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')

def tokenize(text):
    return word_tokenize(text)

# Contoh penggunaan
sample_text = "I like machine learning course."
print(tokenize(sample_text))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['I', 'like', 'machine', 'learning', 'course', '.']


In [5]:
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in stop_words]

# Contoh penggunaan
sample_text = "I choose Informatics Engineering for my major."
tokenized_text = tokenize(sample_text)
print(remove_stopwords(tokenized_text))


['I', 'choose', 'Informatics', 'Engineering', 'major', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
from nltk.stem import PorterStemmer

def stem_words(words):
    ps = PorterStemmer()
    return [ps.stem(word) for word in words]

# Contoh penggunaan
sample_text = "Dont forget my name!."
tokenized_text = tokenize(sample_text)
filtered_words = remove_stopwords(tokenized_text)
print(stem_words(filtered_words))


['dont', 'forget', 'name', '!', '.']


In [7]:
sentence1 = "I love Harry Potter movies"
sentence2 = "Harmonie Granger is so beatifull in Harry Potter movies"
sentence3 = "in Harry Potter movies Harmonie so smart too"

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
docs = [sentence1, sentence2, sentence3]
print(docs)

['I love Harry Potter movies', 'Harmonie Granger is so beatifull in Harry Potter movies', 'in Harry Potter movies Harmonie so smart too']


In [9]:
#Mendefinisikan dan menyesuaikan count vectorizer pada dokumen.

vec = CountVectorizer()
X = vec.fit_transform(docs)
#Mengonversi vektor pada DataFrame menggunakan pandas

df = pd.DataFrame(X.toarray(),
    columns=vec.get_feature_names_out())
df.head()

Unnamed: 0,beatifull,granger,harmonie,harry,in,is,love,movies,potter,smart,so,too
0,0,0,0,1,0,0,1,1,1,0,0,0
1,1,1,1,1,1,1,0,1,1,0,1,0
2,0,0,1,1,1,0,0,1,1,1,1,1


In [10]:
import numpy as np
from collections import Counter
from math import log

# Tiga dokumen dalam korpus
documents = [
    "Severus Snape is so danger",
    "Harry Potter can kill Severus Snape"
    ]

# Preprocessing: Lowercasing and tokenizing
tokenized_documents = [doc.lower().split() for doc in documents]

# Menghitung TF
def compute_tf(tokenized_doc):
    tf_dict = {}
    term_count = Counter(tokenized_doc)
    total_terms = len(tokenized_doc)
    for term, count in term_count.items():
        tf_dict[term] = count / total_terms
    return tf_dict

tf_list = [compute_tf(doc) for doc in tokenized_documents]

print("Term Frequency (TF):")
for idx, tf in enumerate(tf_list):
    print(f"Document {idx + 1} TF:")
    for term, score in tf.items():
        print(f"    {term}: {score:.4f}")


Term Frequency (TF):
Document 1 TF:
    severus: 0.2000
    snape: 0.2000
    is: 0.2000
    so: 0.2000
    danger: 0.2000
Document 2 TF:
    harry: 0.1667
    potter: 0.1667
    can: 0.1667
    kill: 0.1667
    severus: 0.1667
    snape: 0.1667


In [11]:
# Menghitung IDF
def compute_idf(tokenized_docs):
    idf_dict = {}
    total_docs = len(tokenized_docs)
    all_terms = set(term for doc in tokenized_docs for term in doc)
    for term in all_terms:
        doc_containing_term = sum(1 for doc in tokenized_docs if term in doc)
        idf_dict[term] = log(total_docs / (1 + doc_containing_term)) + 1
    return idf_dict

idf_dict = compute_idf(tokenized_documents)

print("\nInverse Document Frequency (IDF):")
for term, score in idf_dict.items():
    print(f"    {term}: {score:.4f}")



Inverse Document Frequency (IDF):
    danger: 1.0000
    harry: 1.0000
    so: 1.0000
    potter: 1.0000
    snape: 0.5945
    can: 1.0000
    kill: 1.0000
    is: 1.0000
    severus: 0.5945


In [12]:
# Menghitung TF-IDF
def compute_tfidf(tf_list, idf_dict):
    tfidf_list = []
    for tf in tf_list:
        tfidf_dict = {}
        for term, tf_value in tf.items():
            tfidf_dict[term] = tf_value * idf_dict.get(term, 0)
        tfidf_list.append(tfidf_dict)
    return tfidf_list

tfidf_list = compute_tfidf(tf_list, idf_dict)

print("\nTF-IDF:")
for idx, tfidf in enumerate(tfidf_list):
    print(f"Document {idx + 1} TF-IDF:")
    for term, score in tfidf.items():
        print(f"    {term}: {score:.4f}")


TF-IDF:
Document 1 TF-IDF:
    severus: 0.1189
    snape: 0.1189
    is: 0.2000
    so: 0.2000
    danger: 0.2000
Document 2 TF-IDF:
    harry: 0.1667
    potter: 0.1667
    can: 0.1667
    kill: 0.1667
    severus: 0.0991
    snape: 0.0991


In [13]:
from gensim.models import Word2Vec
import numpy as np

corpus = [
    'Harry Potter is my favorite one character in Harry Potter movies.',
    'Harmonie is my second favorite character in Harry Potter movies.',
    'Ron Weasles is my third favorite character in Harry Potter movies..'
]

sentences = [doc.split() for doc in corpus]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

def document_vector(doc):
    return np.mean([model.wv[word] for word in doc.split() if word in model.wv], axis=0)

doc_vectors = [document_vector(doc) for doc in corpus]
print(doc_vectors)


[array([-1.66641141e-03,  2.16215174e-03,  1.32544164e-03,  1.41314091e-03,
        6.08421920e-04, -1.92189845e-03,  3.26976390e-03,  4.17150371e-03,
       -2.51867925e-03, -3.46732140e-03,  1.98364211e-03, -2.18229182e-03,
       -2.32981960e-03,  1.10365823e-03, -4.92488965e-04, -9.39447928e-05,
        4.30912664e-03,  2.27175048e-03, -3.49220261e-03, -4.25478024e-03,
        1.34532363e-03, -4.49883402e-04,  5.71221160e-03, -1.06209423e-03,
        7.26817671e-05,  6.84585655e-04,  2.29056939e-04,  2.18429137e-03,
       -2.61831866e-03,  1.76909124e-03,  1.32741581e-03, -1.67309691e-03,
       -1.51058717e-04, -5.31912269e-03, -7.29974301e-04,  3.60981561e-04,
        3.51012964e-03,  3.40464787e-04,  1.57709917e-04,  1.55780511e-03,
        2.73082382e-03, -1.83449755e-03, -3.04894522e-03,  1.19402306e-04,
        2.51020421e-03,  2.13022064e-03, -1.43106515e-03,  3.36136931e-04,
        1.38517015e-03,  1.12066616e-03,  5.59474400e-04, -1.98119925e-03,
       -1.89652244e-04, 

In [14]:
!pip install numpy pandas scikit-learn




Klasifikasi teks dengan Machine Learning.
This dataset is a collection newsgroup documents. The 20 newsgroups collection has become a popular data set for experiments in text applications of machine learning techniques, such as text classification and text clustering.


In [15]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

# 1. Mengumpulkan data
newsgroups = fetch_20newsgroups(subset='all')

# 2. Preprocessing data
# Tidak perlu preprocessing khusus karena kita akan menggunakan TfidfVectorizer

# 3. Membagi data menjadi training dan testing set
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.25, random_state=42)

# 4. Melatih model
# Membuat pipeline yang mencakup TfidfVectorizer dan MultinomialNB
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Melatih model menggunakan training set
model.fit(X_train, y_train)

# 5. Mengevaluasi model
# Prediksi pada testing set
y_pred = model.predict(X_test)

# Evaluasi kinerja model
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(metrics.classification_report(y_test, y_pred, target_names=newsgroups.target_names))

# Confusion Matrix
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, y_pred))


Accuracy: 0.8425297113752123
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.88      0.72      0.79       198
           comp.graphics       0.86      0.79      0.82       245
 comp.os.ms-windows.misc       0.88      0.83      0.85       242
comp.sys.ibm.pc.hardware       0.66      0.86      0.75       238
   comp.sys.mac.hardware       0.95      0.84      0.89       250
          comp.windows.x       0.96      0.80      0.87       260
            misc.forsale       0.96      0.66      0.78       241
               rec.autos       0.89      0.93      0.91       244
         rec.motorcycles       0.91      0.95      0.93       219
      rec.sport.baseball       0.96      0.94      0.95       261
        rec.sport.hockey       0.90      0.98      0.94       245
               sci.crypt       0.78      0.98      0.87       251
         sci.electronics       0.92      0.80      0.86       249
                 sci.me