In [5]:
# ---------------------------------------------------------
# COMPLETE NLP PREPROCESSING + TF-IDF FOR CSV FILE
# Works with: FINAL_USO_cleaned.csv
# ---------------------------------------------------------

!pip install -q nltk scikit-learn pandas

import pandas as pd
import numpy as np
import nltk
import os

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# ---------------------------------------------------------
# LOAD CSV (YOUR FILE)
# ---------------------------------------------------------

FILE_PATH = "FINAL_USO_cleaned.csv"   # <-- Correct file name

if not os.path.isfile(FILE_PATH):
    raise FileNotFoundError(f"File not found: {FILE_PATH}. "
                            "Make sure FINAL_USO_cleaned.csv is in the same folder as your notebook.")

df = pd.read_csv(FILE_PATH)


# ---------------------------------------------------------
# FIND TEXT COLUMN
# ---------------------------------------------------------

# Try to find a usable text column automatically
possible_text_cols = [
    "text", "comments", "description", "body", "content", 
    "article", "review", "notes", "document"
]

text_col = None

for col in df.columns:
    if col.lower() in possible_text_cols:
        text_col = col
        break

# If no obvious text column, fall back to first string column
if text_col is None:
    string_cols = [c for c in df.columns if df[c].dtype == object]
    if len(string_cols) == 0:
        raise ValueError(
            "No text column found in CSV. "
            "Please add a column containing text or tell me the correct column name."
        )
    text_col = string_cols[0]

print(f"Using text column: {text_col}")


# ---------------------------------------------------------
# PREPROCESSING FUNCTION
# ---------------------------------------------------------

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha()]               # keep only letters
    tokens = [t for t in tokens if t not in stop_words]       # remove stopwords
    tokens = [lemmatizer.lemmatize(t) for t in tokens]        # lemmatize
    return " ".join(tokens)


# ---------------------------------------------------------
# APPLY PREPROCESSING
# ---------------------------------------------------------

print("Preprocessing text...")

df["clean_text"] = df[text_col].astype(str).apply(preprocess_text)

print("Sample cleaned text:")
print(df["clean_text"].head())


# ---------------------------------------------------------
# TF-IDF VECTORIZATION
# ---------------------------------------------------------

vectorizer = TfidfVectorizer(max_features=2000)
tfidf_matrix = vectorizer.fit_transform(df["clean_text"])

tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out()
)

# Save output
tfidf_df.to_csv("tfidf_output.csv", index=False)

print("\nTF-IDF shape:", tfidf_df.shape)
print("Saved TF-IDF matrix to tfidf_output.csv\n")

# Show first few features
print("Sample TF-IDF columns:")
print(tfidf_df.iloc[:, :20])

Using text column: Date
Preprocessing text...
Sample cleaned text:
0    
1    
2    
3    
4    
Name: clean_text, dtype: object


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [3]:
!