In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import os

# Task 2: build a TF-IDF representation of the cleaned complaint narratives.
print("Starting Task 2: Text Representation")

# Input CSV (the error message below attributes it to Task 1) and the
# directory that receives this task's outputs.
input_path = '../data/filtered_complaints.csv'
output_dir = '../data/processed/'

# Create the output directory up front; exist_ok avoids failing on re-runs.
os.makedirs(output_dir, exist_ok=True)

# Only the read itself sits in the try body, so the handler cannot mask an
# unrelated error raised while reporting on the loaded frame.
try:
    df_filtered = pd.read_csv(input_path)
except FileNotFoundError:
    print(f"Error: {input_path} not found. Please ensure Task 1 completed successfully.")
    # Stop with a non-zero status. The bare exit() helper is injected by the
    # `site` module for interactive sessions and, called without an argument,
    # would report success (status 0) even though the input is missing.
    raise SystemExit(1)
else:
    print(f"Loaded filtered data from {input_path}. Shape: {df_filtered.shape}")
    print(df_filtered.head())

# Fill missing narratives with '' *before* casting to str. Casting first
# stringifies NaN (to the literal 'nan' on pandas versions where astype(str)
# yields Python strings), which leaves fillna with nothing to fill and lets
# missing narratives slip through the empty-text filter below.
df_filtered['cleaned_narrative'] = df_filtered['cleaned_narrative'].fillna('').astype(str)

# Keep only rows whose narrative has non-whitespace content.
# NOTE(review): the original index labels are kept (no reset_index), so they
# may no longer match positional row numbers after this filter — confirm that
# is what downstream consumers expect.
df_filtered = df_filtered[df_filtered['cleaned_narrative'].str.strip() != '']

print(f"Shape after ensuring cleaned narratives: {df_filtered.shape}")

# TF-IDF configuration: unigrams and bigrams, terms must occur in at least
# 5 documents, and the vocabulary is capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_features=5000,
)

print("Fitting TF-IDF Vectorizer and transforming text data...")

# Learn the vocabulary and produce the document-term matrix in one pass.
narratives = df_filtered['cleaned_narrative']
X_tfidf = tfidf_vectorizer.fit_transform(narratives)

# Report the resulting matrix dimensions and the learned vocabulary size.
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"TF-IDF matrix shape: {X_tfidf.shape}")
print(f"Number of features (vocabulary size): {len(feature_names)}")

# Destination files for the three artefacts, all under output_dir.
tfidf_matrix_path = os.path.join(output_dir, 'tfidf_matrix.pkl')
tfidf_vectorizer_path = os.path.join(output_dir, 'tfidf_vectorizer.pkl')
df_filtered_path = os.path.join(output_dir, 'df_filtered_with_indices.pkl')

# 1) The TF-IDF matrix, pickled as-is.
with open(tfidf_matrix_path, mode='wb') as matrix_file:
    pickle.dump(X_tfidf, matrix_file)
print(f"TF-IDF matrix saved to: {tfidf_matrix_path}")

# 2) The fitted vectorizer, pickled so it can be loaded again later.
with open(tfidf_vectorizer_path, mode='wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
print(f"TF-IDF vectorizer saved to: {tfidf_vectorizer_path}")

# 3) The filtered DataFrame, written with pandas' own pickle writer.
df_filtered.to_pickle(df_filtered_path)
print(f"Filtered DataFrame with cleaned narratives saved to: {df_filtered_path}")

print("Text Representation completed.")

ModuleNotFoundError: No module named 'sklearn'