In [16]:
import pandas as pd
import ast
from datetime import datetime, timezone

df = pd.read_csv('reduced_output_file.csv')  # Replace 'your_file.csv' with your actual file path

# Create DataFrame
post_data = pd.DataFrame(df)

# Step 1: Combine Title, Text, and Comments Data into separate rows

# Convert Comments Data from string format to list
post_data['Comments Data'] = post_data['Comments Data'].apply(ast.literal_eval)

# Prepare the DataFrame for expanding to multiple rows
expanded_data = []

# Add the Title and Text as separate rows
for _, row in post_data.iterrows():
    # Add the title as a row
    expanded_data.append({'Text': row['Title'], 'Category': 'No Slang', 'Date': datetime.fromtimestamp(row['Timestamp'], tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), 'Subreddit': row['Subreddit']})
    
    # Add the text (if available)
    if row['Text']:
        expanded_data.append({'Text': row['Text'], 'Category': 'No Slang', 'Date': datetime.fromtimestamp(row['Timestamp'], tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), 'Subreddit': row['Subreddit']})
    
    # Add each comment as a separate row
    for comment in row['Comments Data']:
        expanded_data.append({'Text': comment[0], 'Category': 'No Slang', 'Date': datetime.fromtimestamp(comment[2], tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), 'Subreddit': row['Subreddit']})

# Create the expanded DataFrame
expanded_df = pd.DataFrame(expanded_data)

expanded_df.to_csv('cleaned_data.csv', index=False)  # Saves the cleaned data to a new file

print(expanded_df)



                                                    Text  Category  \
0      Are you ok with the DOD removing articles from...  No Slang   
1                                                    NaN  No Slang   
2                Navajo code talkers are still not back.  No Slang   
3      I’m just thinking about how these are the same...  No Slang   
4      Anyone who is okay with this does not respect ...  No Slang   
...                                                  ...       ...   
97785  This is why the DA will usually bring multiple...  No Slang   
97786  True or until the two parties reach a plea agr...  No Slang   
97787  Not all public defenders are state employees. ...  No Slang   
97788  To hear my buddy tell it, that wasn't the issu...  No Slang   
97789  Hmmm, usually ignorance of the law is not a de...  No Slang   

                      Date          Subreddit  
0      2025-03-18 12:05:01          AskReddit  
1      2025-03-18 12:05:01          AskReddit  
2      2025-03-

In [17]:
import pandas as pd
import re

# Remove duplicates
expanded_df = expanded_df.drop_duplicates()

# Handle missing values in 'Text' or 'Category' column by removing rows with missing data
expanded_df = expanded_df.dropna(subset=['Text', 'Category'])

# Clean text (removing special characters, URLs, etc.)
def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text.lower().strip()

expanded_df['Text'] = expanded_df['Text'].apply(clean_text)


In [21]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Tokenization and stopword removal
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords and non-alphabetical words
    tokens = [word.lower() for word in tokens if word.lower() not in stop_words and word.isalpha()]
    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(lemmatized_tokens)

expanded_df['Text'] = expanded_df['Text'].apply(preprocess_text)

# Text Vectorization (TF-IDF)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(expanded_df['Text'])


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\andif\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andif\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
from sklearn.decomposition import TruncatedSVD

# Apply Truncated SVD for dimensionality reduction
svd = TruncatedSVD(n_components=100)  # Adjust n_components as needed
X_reduced = svd.fit_transform(X)


In [23]:
def discretize_length(text):
    length = len(text.split())
    if length < 20:
        return 'Short'
    elif length < 50:
        return 'Medium'
    else:
        return 'Long'

expanded_df['Text Length'] = expanded_df['Text'].apply(discretize_length)


KeyError: "None of [Index(['Upvotes'], dtype='object')] are in the [columns]"