## Tugas 1

In [45]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [46]:
# Load the dataset
file_path = 'Tugas 1_NIM Genap_Modul 4.csv'
df = pd.read_csv(file_path)

text_column = 'text'  # Replace with the actual column name containing text
label_column = 'label'  # Replace with the actual label column name

# Show the first few rows of the dataset
df.head()

Unnamed: 0,label,text
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...
1,FAKE,A four-minute-long video of a woman criticisin...
2,FAKE,"Republic Poll, a fake Twitter account imitatin..."
3,REAL,"Delhi teen finds place on UN green list, turns..."
4,REAL,Delhi: A high-level meeting underway at reside...


In [47]:
# Initialize the stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andif\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Preprocessing

In [48]:
# 1. Case Folding
def case_folding(text):
    if isinstance(text, str):  # Check if the value is a string
        return text.lower()
    else:
        return ""  # Return an empty string for non-string values

In [49]:
# 2. Data Cleaning (remove unwanted characters)
def clean_text(text):
    # Remove HTML tags, URLs, mentions (@user), hashtags (#), emojis, digits, and punctuation
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)  # Remove emojis
    return text

In [50]:
# 3. Tokenization
def tokenize(text):
    return text.split()

In [51]:
# 4. Stopwords Removal
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('indonesian'))  # Change to 'english' for English stopwords
    return [word for word in tokens if word not in stop_words]

In [52]:
# 5. Stemming and Lemmatization
def stem_and_lemmatize(tokens, use_stemming=True):
    if use_stemming:
        return [stemmer.stem(word) for word in tokens]
    else:
        return [lemmatizer.lemmatize(word) for word in tokens]

In [53]:
# 6. Manual Padding/Truncating
def padding_truncating(tokens, max_len=100):
    # If the sequence is shorter than max_len, pad with 'PAD' tokens
    if len(tokens) < max_len:
        tokens += ['PAD'] * (max_len - len(tokens))
    # If the sequence is longer than max_len, truncate it
    elif len(tokens) > max_len:
        tokens = tokens[:max_len]
    return tokens

In [54]:
# # 7. Text-to-Numeric Transformation (Bag of Words or TF-IDF)
# def text_to_numeric(tokens, method='bow'):
#     if method == 'bow':
#         vectorizer = CountVectorizer()
#         return vectorizer.fit_transform([' '.join(tokens)]).toarray()
#     elif method == 'tfidf':
#         vectorizer = TfidfVectorizer()
#         return vectorizer.fit_transform([' '.join(tokens)]).toarray()

In [55]:
# Apply all steps to the dataset
def preprocess_text(text, use_stemming=True):
    text = case_folding(text)  # Step 1: Case folding
    text = clean_text(text)  # Step 2: Data cleaning
    tokens = tokenize(text)  # Step 3: Tokenization
    tokens = remove_stopwords(tokens)  # Step 4: Stopwords removal
    tokens = stem_and_lemmatize(tokens, use_stemming)  # Step 5: Stemming/Lemmatization
    return tokens

In [56]:
# Apply preprocessing to each row in the dataset
df['processed_text'] = df[text_column].apply(preprocess_text)

In [57]:
# Example of Padding/Truncating and Text-to-Numeric Transformation
# Assuming 'processed_text' column contains tokenized text
df['padded_text'] = df['processed_text'].apply(lambda x: padding_truncating(x, max_len=100))

In [58]:
# # Example of text-to-numeric transformation using TF-IDF
# df['numeric_tfidf'] = df['processed_text'].apply(lambda x: text_to_numeric(x, method='tfidf'))

In [59]:
# 8. Label Encoding
def encode_labels(df, label_column):
    encoder = LabelEncoder()
    df[label_column] = encoder.fit_transform(df[label_column])
    return df

# Models Building

In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [61]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [62]:
from sklearn.model_selection import train_test_split

X = df['processed_text'].apply(lambda x: ' '.join(x))  # Join tokens back into text for TF-IDF
y = df['label']  # The encoded label column

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 1. Text Vectorization using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Set max features to limit the number of tokens
X_train = tfidf_vectorizer.fit_transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)

# 1. Logistic Regression

In [63]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)

print_score(lr_clf, X_train, y_train, X_test, y_test, train=True)
print_score(lr_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 98.97%
_______________________________________________
CLASSIFICATION REPORT:
                  FAKE         REAL  accuracy    macro avg  weighted avg
precision     0.997689     0.981707  0.989655     0.989698      0.989784
recall        0.981804     0.997676  0.989655     0.989740      0.989655
f1-score      0.989683     0.989627  0.989655     0.989655      0.989655
support    1319.000000  1291.000000  0.989655  2610.000000   2610.000000
_______________________________________________
Confusion Matrix: 
 [[1295   24]
 [   3 1288]]

Test Result:
Accuracy Score: 98.39%
_______________________________________________
CLASSIFICATION REPORT:
                 FAKE        REAL  accuracy    macro avg  weighted avg
precision    0.996324    0.972174  0.983914     0.984249      0.984216
recall       0.971326    0.996435  0.983914     0.983881      0.983914
f1-score     0.983666    0.984155  0.983914     0.983910      0.983911
support    558.000000  561.000000  0.983

In [64]:
test_score = accuracy_score(y_test, lr_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, lr_clf.predict(X_train)) * 100

results_df = pd.DataFrame(data=[["Logistic Regression", train_score, test_score]], columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
results_df

Unnamed: 0,Model,Training Accuracy %,Testing Accuracy %
0,Logistic Regression,98.965517,98.391421


# 2. K-nearest neighbors

In [65]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

print_score(knn_clf, X_train, y_train, X_test, y_test, train=True)
print_score(knn_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 96.32%
_______________________________________________
CLASSIFICATION REPORT:
                  FAKE         REAL  accuracy    macro avg  weighted avg
precision     0.937098     0.993394  0.963218     0.965246      0.964944
recall        0.993935     0.931836  0.963218     0.962885      0.963218
f1-score      0.964680     0.961631  0.963218     0.963155      0.963172
support    1319.000000  1291.000000  0.963218  2610.000000   2610.000000
_______________________________________________
Confusion Matrix: 
 [[1311    8]
 [  88 1203]]

Test Result:
Accuracy Score: 95.26%
_______________________________________________
CLASSIFICATION REPORT:
                 FAKE        REAL  accuracy    macro avg  weighted avg
precision    0.924370    0.984733  0.952636     0.954551      0.954632
recall       0.985663    0.919786  0.952636     0.952725      0.952636
f1-score     0.954033    0.951152  0.952636     0.952593      0.952589
support    558.000000  561.000000  0.952

In [66]:
test_score = accuracy_score(y_test, knn_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, knn_clf.predict(X_train)) * 100

results_df_2 = pd.DataFrame(data=[["K-nearest neighbors", train_score, test_score]], 
                          columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
results_df = pd.concat([results_df, results_df_2], ignore_index=True)
results_df

Unnamed: 0,Model,Training Accuracy %,Testing Accuracy %
0,Logistic Regression,98.965517,98.391421
1,K-nearest neighbors,96.321839,95.263628


# 3. Support Vector Machine

In [67]:
from sklearn.svm import SVC


svm_clf = SVC(kernel='rbf', gamma=0.1, C=1.0)
svm_clf.fit(X_train, y_train)

print_score(svm_clf, X_train, y_train, X_test, y_test, train=True)
print_score(svm_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 98.81%
_______________________________________________
CLASSIFICATION REPORT:
                  FAKE         REAL  accuracy    macro avg  weighted avg
precision     0.996148     0.980183  0.988123     0.988165      0.988251
recall        0.980288     0.996127  0.988123     0.988208      0.988123
f1-score      0.988154     0.988091  0.988123     0.988123      0.988123
support    1319.000000  1291.000000  0.988123  2610.000000   2610.000000
_______________________________________________
Confusion Matrix: 
 [[1293   26]
 [   5 1286]]

Test Result:
Accuracy Score: 98.57%
_______________________________________________
CLASSIFICATION REPORT:
                 FAKE        REAL  accuracy    macro avg  weighted avg
precision    0.994526    0.977233  0.985702     0.985879      0.985856
recall       0.976703    0.994652  0.985702     0.985677      0.985702
f1-score     0.985533    0.985866  0.985702     0.985700      0.985700
support    558.000000  561.000000  0.985

In [68]:
test_score = accuracy_score(y_test, svm_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, svm_clf.predict(X_train)) * 100

results_df_2 = pd.DataFrame(data=[["Support Vector Machine", train_score, test_score]], 
                          columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
results_df = pd.concat([results_df, results_df_2], ignore_index=True)
results_df

Unnamed: 0,Model,Training Accuracy %,Testing Accuracy %
0,Logistic Regression,98.965517,98.391421
1,K-nearest neighbors,96.321839,95.263628
2,Support Vector Machine,98.812261,98.570152


# 4. Decision Tree Classifier

In [69]:
from sklearn.tree import DecisionTreeClassifier


tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

print_score(tree_clf, X_train, y_train, X_test, y_test, train=True)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 99.92%
_______________________________________________
CLASSIFICATION REPORT:
                  FAKE         REAL  accuracy    macro avg  weighted avg
precision     0.998486     1.000000  0.999234     0.999243      0.999235
recall        1.000000     0.998451  0.999234     0.999225      0.999234
f1-score      0.999242     0.999225  0.999234     0.999234      0.999234
support    1319.000000  1291.000000  0.999234  2610.000000   2610.000000
_______________________________________________
Confusion Matrix: 
 [[1319    0]
 [   2 1289]]

Test Result:
Accuracy Score: 99.37%
_______________________________________________
CLASSIFICATION REPORT:
                 FAKE        REAL  accuracy    macro avg  weighted avg
precision    0.992844    0.994643  0.993744     0.993744      0.993746
recall       0.994624    0.992870  0.993744     0.993747      0.993744
f1-score     0.993733    0.993756  0.993744     0.993744      0.993744
support    558.000000  561.000000  0.993

In [70]:
test_score = accuracy_score(y_test, tree_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, tree_clf.predict(X_train)) * 100

results_df_2 = pd.DataFrame(data=[["Decision Tree Classifier", train_score, test_score]], 
                          columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
results_df = pd.concat([results_df, results_df_2], ignore_index=True)
results_df



Unnamed: 0,Model,Training Accuracy %,Testing Accuracy %
0,Logistic Regression,98.965517,98.391421
1,K-nearest neighbors,96.321839,95.263628
2,Support Vector Machine,98.812261,98.570152
3,Decision Tree Classifier,99.923372,99.374441


#  5. Random Forest

In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf_clf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf_clf.fit(X_train, y_train)

print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 99.92%
_______________________________________________
CLASSIFICATION REPORT:
                  FAKE         REAL  accuracy    macro avg  weighted avg
precision     0.998486     1.000000  0.999234     0.999243      0.999235
recall        1.000000     0.998451  0.999234     0.999225      0.999234
f1-score      0.999242     0.999225  0.999234     0.999234      0.999234
support    1319.000000  1291.000000  0.999234  2610.000000   2610.000000
_______________________________________________
Confusion Matrix: 
 [[1319    0]
 [   2 1289]]

Test Result:
Accuracy Score: 98.93%
_______________________________________________
CLASSIFICATION REPORT:
                 FAKE        REAL  accuracy    macro avg  weighted avg
precision    0.978947    1.000000  0.989276     0.989474      0.989502
recall       1.000000    0.978610  0.989276     0.989305      0.989276
f1-score     0.989362    0.989189  0.989276     0.989275      0.989275
support    558.000000  561.000000  0.989

In [72]:
test_score = accuracy_score(y_test, rf_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, rf_clf.predict(X_train)) * 100

results_df_2 = pd.DataFrame(data=[["Random Forest Classifier", train_score, test_score]], 
                          columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
results_df = pd.concat([results_df, results_df_2], ignore_index=True)
results_df



Unnamed: 0,Model,Training Accuracy %,Testing Accuracy %
0,Logistic Regression,98.965517,98.391421
1,K-nearest neighbors,96.321839,95.263628
2,Support Vector Machine,98.812261,98.570152
3,Decision Tree Classifier,99.923372,99.374441
4,Random Forest Classifier,99.923372,98.927614
