In [12]:

!pip install textblob

# Download NLTK data for text processing
import nltk
nltk.download('punkt')
nltk.download('stopwords')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
import os
import shutil

# Create folder
folder_path = "/content/txt_files"
os.makedirs(folder_path, exist_ok=True)

# Move uploaded .txt files to the new folder
for file in os.listdir("/content"):
    if file.endswith(".txt"):
        shutil.move(file, os.path.join(folder_path, file))

print("✅ All .txt files moved to /content/txt_files")


✅ All .txt files moved to /content/txt_files


In [14]:
import os
import pandas as pd
import re
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_text = ' '.join([word for word in tokens if word not in stop_words])
    return filtered_text

def analyze_sentiment(text):
    return TextBlob(text).sentiment.polarity

def process_txt_files(folder_path):
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                cleaned_text = clean_text(text)
                sentiment = analyze_sentiment(cleaned_text)
                data.append([filename, cleaned_text, sentiment])

    df = pd.DataFrame(data, columns=['Filename', 'Cleaned_Text', 'Sentiment'])
    return df

# Run it
result_df = process_txt_files("/content/txt_files")
result_df.to_csv("/content/cleaned_text_data.csv", index=False)
print("✅ Processing complete. Results saved to cleaned_text_data.csv")

# Preview the results
result_df.head()


✅ Processing complete. Results saved to cleaned_text_data.csv


Unnamed: 0,Filename,Cleaned_Text,Sentiment
0,1tZbvaR66aMJtfmv95v37esLpkyfz-ynl.txt,page changing game compete innovations fashion...,0.106545
1,Voice to Text Scraped.txt,hello everybody welcome drum network podcast a...,0.242316
2,1RwKejaEbKJVAirFfHPjDKg5F6SpDS5Y5.txt,page international journal management science ...,0.19187
3,Website_Scraped_Data.txt,sheet webscrapedataset sourcename column colum...,0.104914
4,1pzFBTu6qky8wz2GexXa7oBJgoUKIRhj7.txt,page technology state fashion page page techno...,0.08992


In [None]:
from google.colab import files
files.download("/content/cleaned_text_data.csv")
