<a href="https://colab.research.google.com/github/muhajirakbarhsb/NLP_class_2023/blob/main/Class_Meeting_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Representation

## OneHot Encoding

In [3]:
from sklearn.preprocessing import OneHotEncoder
import itertools

# two example documents
docs = ["cat", "dog", "bat", "ate"]

# Split documents into tokens
tokens_docs = [doc.split(" ") for doc in docs]

# Convert list of token-lists to one flat list of tokens
# and then create a dictionary that maps word to id of word
all_tokens = itertools.chain.from_iterable(tokens_docs)
word_to_id = {token: idx for idx, token in enumerate(set(all_tokens))}

# Convert token lists to token-id lists
token_ids = [[word_to_id[token] for token in tokens_doc] for tokens_doc in tokens_docs]

# Convert list of token-id lists to one-hot representation
vec = OneHotEncoder(categories="auto")
X = vec.fit_transform(token_ids)

# Convert the one-hot encoded vectors back to text representations
inverse_mapping = {idx: token for token, idx in word_to_id.items()}

# Display the original text representations
for i, row in enumerate(X.toarray()):
    print(f"Original Text: {docs[i]}")
    print("One-Hot Encoded Vector:", row)
    decoded_text = [inverse_mapping[idx] for idx, val in enumerate(row) if val == 1]
    print("Decoded Text:", " ".join(decoded_text))
    print()


Original Text: cat
One-Hot Encoded Vector: [0. 0. 1. 0.]
Decoded Text: cat

Original Text: dog
One-Hot Encoded Vector: [0. 0. 0. 1.]
Decoded Text: dog

Original Text: bat
One-Hot Encoded Vector: [0. 1. 0. 0.]
Decoded Text: bat

Original Text: ate
One-Hot Encoded Vector: [1. 0. 0. 0.]
Decoded Text: ate



In [4]:
tokens_docs

[['cat'], ['dog'], ['bat'], ['ate']]

In [5]:
word_to_id

{'ate': 0, 'bat': 1, 'cat': 2, 'dog': 3}

## Bag of Words countVectorizer


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
text = ["i love nlp. nlp is so cool"]
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
print(vectorizer.vocabulary_)
# Output: {'love': 2, 'nlp': 3, 'is': 1, 'so': 4, 'cool': 0}
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape) # Output: (1, 5)
print(vector.toarray())

{'love': 2, 'nlp': 3, 'is': 1, 'so': 4, 'cool': 0}
(1, 5)
[[1 1 1 2 1]]


## TF-IDF

In [7]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re


paragraph = """The news mentioned here is fake. Audience do not encourage fake news. Fake news is false or misleading"""

sentences = nltk.sent_tokenize(paragraph)

lemmatizer = WordNetLemmatizer()

corpus = []

# Lemmatization
for i in range(len(sentences)):
    sent = re.sub('[^a-zA-Z]', ' ', sentences[i])
    sent = sent.lower()
    sent= sent.split()
    sent = [lemmatizer.lemmatize(word) for word in sent if not word in set(stopwords.words('english'))]
    sent = ' '.join(sent)
    corpus.append(sent)


print(corpus)


words_unique = []
for i in range(len(corpus)):
    unique = nltk.word_tokenize(corpus[i])
    words_unique.append(unique)

print(words_unique)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alharidt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alharidt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alharidt\AppData\Roaming\nltk_data...


['news mentioned fake', 'audience encourage fake news', 'fake news false misleading']
[['news', 'mentioned', 'fake'], ['audience', 'encourage', 'fake', 'news'], ['fake', 'news', 'false', 'misleading']]


In [8]:
# Creating the TF-IDF model
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
independentFeatures_tfIDF = tfidf.fit_transform(corpus).toarray()
tfidf_df = pd.DataFrame(data=independentFeatures_tfIDF, columns=tfidf.get_feature_names_out())

# Add an index column to represent the documents
tfidf_df.index = [f"Document {i+1}" for i in range(len(corpus))]

# Display the TF-IDF DataFrame
print(tfidf_df)

            audience  encourage      fake     false  mentioned  misleading  \
Document 1  0.000000   0.000000  0.453295  0.000000   0.767495    0.000000   
Document 2  0.608845   0.608845  0.359594  0.000000   0.000000    0.000000   
Document 3  0.000000   0.000000  0.359594  0.608845   0.000000    0.608845   

                news  
Document 1  0.453295  
Document 2  0.359594  
Document 3  0.359594  


## n-gram

In [9]:
import re
from nltk.util import ngrams

def generate_ngrams(text, n):
    # Tokenize the text into words
    words = re.findall(r'\w+', text)

    # Generate n-grams using NLTK's ngrams function
    ngrams_list = list(ngrams(words, n))

    return ngrams_list

# Example text
text = "This is an example sentence for generating n-grams."

# Generate bi-grams (2-grams)
bigrams = generate_ngrams(text, 2)
print("Bi-grams:")
for gram in bigrams:
    print(gram)

# Generate tri-grams (3-grams)
trigrams = generate_ngrams(text, 3)
print("\nTri-grams:")
for gram in trigrams:
    print(gram)

Bi-grams:
('This', 'is')
('is', 'an')
('an', 'example')
('example', 'sentence')
('sentence', 'for')
('for', 'generating')
('generating', 'n')
('n', 'grams')

Tri-grams:
('This', 'is', 'an')
('is', 'an', 'example')
('an', 'example', 'sentence')
('example', 'sentence', 'for')
('sentence', 'for', 'generating')
('for', 'generating', 'n')
('generating', 'n', 'grams')


## Load Data

In [10]:
import pandas as pd
import numpy as np

In [13]:
df = pd.read_csv('clean_review_all.csv')
df.head(10)

Unnamed: 0,at,content,score,userName,contentp,contentp_clean,text_length
0,2023-11-19 08:08:17,Paketnya mahal2 dan data internetnya juga boro...,1,Wildan Cell,paketnya mahal dan data internetnya juga boros...,paket mahal data internetnya boros pokok rekom...,72
1,2023-11-19 00:02:23,Ulasan lama : Parah ini kenapa ga bisa login d...,4,Deyubi kyubi,ulasan lama parah ini kenapa ga bisa login di ...,ulas lama parah kenapa bisa login aplikasi upd...,73
2,2023-11-18 22:02:21,Susah banget mau login ke aplikasi sampe jengkel,1,Asyraf Kaylani,susah banget mau login ke aplikasi sampe jengkel,susah banget mau login aplikasi jengkel,39
3,2023-11-18 18:21:34,Cukup bagus tapi sayang tidak ada menu blokir ...,5,Ardi Ardian,cukup bagus tapi sayang tidak ada menu blokir ...,cukup bagus sayang ada menu blokir buat lain m...,60
4,2023-11-18 17:02:54,"Kencang dan stabil tapi mahaaallll,.. dimurahi...",3,Bagas Eksanudin Aziz,kencang dan stabil tapi mahaaallll dimurahin d...,kencang stabil mahaaallll dimurahin sedikit ka...,70
5,2023-11-18 12:13:53,Sekarang sering ilang ilangan sinyal e,4,Ajeng Tyas,sekarang sering ilang ilangan sinyal,sekarang sering ilang ilangan sinyal,36
6,2023-11-18 10:56:10,"waspada sama orbit, aktifin paket. sampai 2 ha...",1,Sahwana Shugra,waspada sama orbit aktifin paket sampai hari b...,waspada sama orbit aktifin paket hari aktif la...,222
7,2023-11-18 10:36:19,Akun dan imei orbit tidak terdaftar sendiri pa...,1,Ilhan Mansiz,akun dan imei orbit tidak terdaftar sendiri pa...,akun imei orbit daftar sendiri padahal langgan...,59
8,2023-11-18 07:39:59,😲😮😧 larang² paketane om 🤦,2,Putro Ragil,larang² paketane om,larang paketane om,18
9,2023-11-17 19:51:10,"Produk gagal, jatingan tidak stabil.",1,Jonny Arung,produk gagal jatingan tidak stabil,produk gagal jatingan stabil,28


In [14]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from nltk.util import ngrams

def text_representation(df, text_column, method='one-hot', ngram_range=(1, 1)):
    """
    Apply various text representation techniques to a DataFrame.

    Parameters:
    - df: DataFrame with 'date' and 'text' columns.
    - text_column: Name of the text column.
    - method: Text representation method ('one-hot', 'bag-of-words', 'ngram', 'countvectorize', 'tfidf').
    - ngram_range: Tuple specifying the n-gram range (e.g., (1, 1) for unigrams, (1, 2) for unigrams and bigrams).

    Returns:
    - Transformed DataFrame.
    """

    if method == 'one-hot':
        mlb = MultiLabelBinarizer()
        words = [text.split() for text in df[text_column]]
        one_hot_encoded = mlb.fit_transform(words)
        one_hot_df = pd.DataFrame(one_hot_encoded, columns=mlb.classes_)
        return pd.concat([df, one_hot_df], axis=1)

    elif method == 'bag-of-words':
        vectorizer = CountVectorizer()
        bow = vectorizer.fit_transform(df[text_column])
        bow_df = pd.DataFrame(bow.toarray(), columns=vectorizer.get_feature_names_out())
        return pd.concat([df, bow_df], axis=1)

    elif method == 'ngram':
        vectorizer = CountVectorizer(ngram_range=ngram_range)
        ngram_features = vectorizer.fit_transform(df[text_column])
        ngram_df = pd.DataFrame(ngram_features.toarray(), columns=vectorizer.get_feature_names_out())
        return pd.concat([df, ngram_df], axis=1)

    elif method == 'countvectorize':
        vectorizer = CountVectorizer()
        count_vectorized = vectorizer.fit_transform(df[text_column])
        countvectorize_df = pd.DataFrame(count_vectorized.toarray(), columns=vectorizer.get_feature_names_out())
        return pd.concat([df, countvectorize_df], axis=1)

    elif method == 'tfidf':
        vectorizer = TfidfVectorizer()
        tfidf = vectorizer.fit_transform(df[text_column])
        tfidf_df = pd.DataFrame(tfidf.toarray(), columns=vectorizer.get_feature_names_out())
        return pd.concat([df, tfidf_df], axis=1)

    else:
        raise ValueError("Invalid method. Supported methods are 'one-hot', 'bag-of-words', 'ngram', 'countvectorize', and 'tfidf'.")

# Example usage:
# Assuming you have a DataFrame called 'df' with 'date' and 'text' columns
# transformed_df = text_representation(df, 'text', method='one-hot')


In [16]:
# Example usage:
# Assuming you have a DataFrame called 'df' with 'date' and 'text' columns
transformed_df = text_representation(df, 'contentp_clean', method='tfidf')

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [17]:
transformed_df

NameError: name 'transformed_df' is not defined

In [None]:
# Calculate the sparsity of the TF-IDF matrix
tfidf_df =  transformed_df.drop(['at'	,'content','score','userName','contentp','contentp_clean'], axis=1)
sparsity = 1.0 - (np.count_nonzero(tfidf_df) / tfidf_df.size)

print(f"Sparsity of the TF-IDF matrix: {sparsity:.4f}")

Sparsity of the TF-IDF matrix: 0.9819
