In [None]:
import pandas as pd

def reduce_csv_to_1000_rows(input_csv, output_csv):
    # Read the CSV file
    df = pd.read_csv(input_csv)

    # If the dataset has more than 1000 rows, randomly sample 1000 rows
    if len(df) > 1000:
        df_reduced = df.sample(n=1000, random_state=42)  # random_state for reproducibility
    else:
        df_reduced = df  # if there are fewer than 1000 rows, keep all rows

    # Save the reduced dataset to a new CSV file
    df_reduced.to_csv(output_csv, index=False)

    print(f"Data reduced and saved to {output_csv}")

# Example usage
input_csv = 'cleaned_data.csv'  # Replace with your input CSV file path
output_csv = '1000_rows_data.csv'  # Replace with your desired output file path
reduce_csv_to_1000_rows(input_csv, output_csv)


In [None]:
import pandas as pd
import ast
from datetime import datetime, timezone

df = pd.read_csv('reduced_output_file.csv')  # Replace 'your_file.csv' with your actual file path

# Create DataFrame
post_data = pd.DataFrame(df)

# Step 1: Combine Title, Text, and Comments Data into separate rows

# Convert Comments Data from string format to list
post_data['Comments Data'] = post_data['Comments Data'].apply(ast.literal_eval)

# Prepare the DataFrame for expanding to multiple rows
expanded_data = []

# Add the Title and Text as separate rows
for _, row in post_data.iterrows():
    # Add the title as a row
    expanded_data.append({'Text': row['Title'], 'Category': 'No Slang', 'Date': datetime.fromtimestamp(row['Timestamp'], tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), 'Subreddit': row['Subreddit']})
    
    # Add the text (if available)
    if row['Text']:
        expanded_data.append({'Text': row['Text'], 'Category': 'No Slang', 'Date': datetime.fromtimestamp(row['Timestamp'], tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), 'Subreddit': row['Subreddit']})
    
    # Add each comment as a separate row
    for comment in row['Comments Data']:
        expanded_data.append({'Text': comment[0], 'Category': 'No Slang', 'Date': datetime.fromtimestamp(comment[2], tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), 'Subreddit': row['Subreddit']})

# Create the expanded DataFrame
expanded_df = pd.DataFrame(expanded_data)

expanded_df.to_csv('cleaned_data.csv', index=False)  # Saves the cleaned data to a new file

print(expanded_df)



In [8]:
import pandas as pd

# Load the CSV file
file_path = "1000_rows_data.csv"
df = pd.read_csv(file_path)

# Display basic info and the first few rows
df_info = df.info()
df_head = df.head()

df_info, df_head

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       992 non-null    object
 1   Category   1000 non-null   object
 2   Date       1000 non-null   object
 3   Subreddit  1000 non-null   object
dtypes: object(4)
memory usage: 31.4+ KB


(None,
                                                 Text  Category  \
 0  This man is a cut above the rest, outdoor or i...  No Slang   
 1                                      Why not four?  No Slang   
 2                                          MAGA baby  No Slang   
 3  I doubt even ChatGPT would come up with a stor...  No Slang   
 4  I feel that's kinda true but at the same time ...  No Slang   
 
                   Date  Subreddit  
 0  2025-03-18 12:36:49     soccer  
 1  2025-03-15 16:30:40      funny  
 2  2025-03-18 04:43:41  AskReddit  
 3  2025-03-18 12:48:15     gaming  
 4  2025-03-18 12:43:17     soccer  )

In [9]:
import numpy as np
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK resources (if not already installed)
nltk.download('punkt')
nltk.download('stopwords')

# Salin dataframe untuk diolah
df_cleaned = df.copy()

# --- Step 1: Data Cleaning ---
# Hapus baris dengan Text yang kosong
df_cleaned = df_cleaned.dropna(subset=['Text'])

# Hapus duplikat (jika ada)
df_cleaned = df_cleaned.drop_duplicates()

# Hapus URL, simbol aneh, dan lowercase
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # hapus URL
    text = re.sub(r"[^a-z\s]", "", text)  # hapus semua non-huruf
    text = re.sub(r"\s+", " ", text).strip()  # hapus spasi berlebih
    return text

df_cleaned["Cleaned_Text"] = df_cleaned["Text"].apply(clean_text)

# --- Step 2: Tokenization & Stopwords Removal ---
# Tokenisasi dan menghapus stopwords
stop_words = set(stopwords.words("english"))  # set of common stopwords

def tokenize_and_remove_stopwords(text):
    # Tokenisasi
    tokens = word_tokenize(text)
    # Hapus stopwords
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return " ".join(filtered_tokens)

df_cleaned["Processed_Text"] = df_cleaned["Cleaned_Text"].apply(tokenize_and_remove_stopwords)

# --- Step 3: Data Transformation ---
# Ubah kolom Date ke datetime
df_cleaned["Date"] = pd.to_datetime(df_cleaned["Date"], errors="coerce")

# --- Step 4: Encoding untuk klasifikasi nanti ---
# Label encoding kategori dan subreddit
df_cleaned["Category_Label"] = df_cleaned["Category"].astype("category").cat.codes

# Tampilkan hasil akhir sebagian
df_cleaned[["Processed_Text", "Category", "Category_Label", "Date", "Subreddit"]].head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andif\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Processed_Text,Category,Category_Label,Date,Subreddit
0,man cut rest outdoor inside gon na wan na lol lit,No Slang,0,2025-03-18 12:36:49,soccer
1,four,No Slang,0,2025-03-15 16:30:40,funny
2,maga baby,No Slang,0,2025-03-18 04:43:41,AskReddit
3,doubt even chatgpt would come story stupid bun...,No Slang,0,2025-03-18 12:48:15,gaming
4,feel thats kinda true time also true smaller t...,No Slang,0,2025-03-18 12:43:17,soccer


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# --- Step 4: Text Vectorization (TF-IDF) ---
# Gunakan TF-IDF pada Cleaned_Text
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,  # ambil 1000 kata paling penting
    stop_words='english',  # hilangkan stop words
    ngram_range=(1, 2)  # unigram dan bigram
)

tfidf_matrix = tfidf_vectorizer.fit_transform(df_cleaned["Cleaned_Text"])

# Bentuk dari hasil TF-IDF
tfidf_matrix.shape

(992, 1000)

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Split data
X = df_cleaned["Cleaned_Text"]
y = df_cleaned["Category_Label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Build pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1,2))),
    ('clf', LogisticRegression(max_iter=1000))
])

# 3. Train model
pipeline.fit(X_train, y_train)

# 4. Predict & evaluate
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int8(0)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Data
X = df_cleaned["Cleaned_Text"]
y = df_cleaned["Category_Label"]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model pipeline templates
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Linear SVM": LinearSVC()
}

# Loop through models
for name, model in models.items():
    print(f"\n=== {name} ===")
    
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1,2))),
        ("clf", model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))
