In [1]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
import pandas as pd


In [2]:
# Download the required NLTK resources (needed on first run; no-ops once cached).
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
# nltk.pos_tag (used by lemmatize_tokens) needs the averaged perceptron tagger
# model. Both resource ids are requested because the expected id differs
# between NLTK releases (legacy name vs. the "_eng" name).
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('punkt', quiet=True)



True

In [3]:
# Text-cleaning helper
def clean_text(text):
    """Normalize raw text into a list of lowercase, letters-only tokens.

    Args:
        text: The raw text. Any non-string value (e.g. ``None`` or the NaN
            that pandas produces for a missing cell) is treated as empty.

    Returns:
        list[str]: Lowercase tokens made only of ASCII letters, in their
        original order. An empty list for empty or non-string input.
    """
    # Missing cells arrive as NaN (a float) or None; re.sub would raise
    # TypeError on them, so treat every non-string as "no text".
    if not isinstance(text, str):
        return []
    # 1. Replace every character that is neither an ASCII letter nor whitespace
    #    with a space. Substituting a space (not an empty string) keeps words
    #    that are separated only by punctuation apart: "hello,world" becomes
    #    ["hello", "world"] rather than ["helloworld"], and "don't" becomes
    #    ["don", "t"] rather than the fused "dont".
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # 2. Lowercase
    text = text.lower()
    # 3. Tokenize on whitespace (runs of spaces collapse automatically)
    tokens = text.split()
    return tokens


# English stop words, materialized once as a set for fast membership tests
stop_words = {*stopwords.words('english')}


# Stop-word filter (also drops very short tokens)
def filter_stopwords(tokens):
    """Drop stop words and tokens of two characters or fewer.

    Args:
        tokens: Iterable of token strings.

    Returns:
        list: The tokens that are not in the module-level ``stop_words`` set
        and whose length is greater than 2, in their original order.
    """
    kept = []
    for token in tokens:
        # Skip stop words and tokens of length <= 2
        if token in stop_words or len(token) <= 2:
            continue
        kept.append(token)
    return kept


# POS helper: convert an NLTK POS tag into the matching WordNet POS constant
def get_wordnet_pos(tag):
    """Map an NLTK part-of-speech tag to a WordNet part-of-speech constant.

    Args:
        tag: POS tag string; only its leading letter is inspected.

    Returns:
        ``wordnet.ADJ`` for tags starting with 'J', ``wordnet.VERB`` for 'V',
        ``wordnet.NOUN`` for 'N', ``wordnet.ADV`` for 'R', and
        ``wordnet.NOUN`` as the default for anything else.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Default to noun when the tag matches none of the prefixes
    return wordnet.NOUN


# Single WordNet lemmatizer instance, created once and reused by lemmatize_tokens
lemmatizer = WordNetLemmatizer()


# Lemmatization helper
def lemmatize_tokens(tokens):
    """Lemmatize each token using the part of speech assigned by ``nltk.pos_tag``.

    Args:
        tokens: List of token strings.

    Returns:
        list: One lemma per input token, in the same order.
    """
    lemmas = []
    # Tag the whole list in one call, then lemmatize each token with the
    # WordNet POS that corresponds to its tag.
    for token, nltk_tag in nltk.pos_tag(tokens):
        wordnet_pos = get_wordnet_pos(nltk_tag)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas


In [4]:
# Load the dataset; adjust the path to match your environment.
# NOTE(review): this is a hard-coded absolute Windows path, so the cell only
# runs on a machine with this exact directory layout.
file_path = r'C:\Users\ASUS\Desktop\LAD\sentimentdataset.csv'
df = pd.read_csv(file_path)


In [5]:
# Display the loaded DataFrame
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
727,728,732,Collaborating on a science project that receiv...,Happy,2017-08-18 18:20:00,ScienceProjectSuccessHighSchool,Facebook,#ScienceFairWinner #HighSchoolScience,20.0,39.0,UK,2017,8,18,18
728,729,733,Attending a surprise birthday party organized ...,Happy,2018-06-22 14:15:00,BirthdayPartyJoyHighSchool,Instagram,#SurpriseCelebration #HighSchoolFriendship,25.0,48.0,USA,2018,6,22,14
729,730,734,Successfully fundraising for a school charity ...,Happy,2019-04-05 17:30:00,CharityFundraisingTriumphHighSchool,Twitter,#CommunityGiving #HighSchoolPhilanthropy,22.0,42.0,Canada,2019,4,5,17
730,731,735,"Participating in a multicultural festival, cel...",Happy,2020-02-29 20:45:00,MulticulturalFestivalJoyHighSchool,Facebook,#CulturalCelebration #HighSchoolUnity,21.0,43.0,UK,2020,2,29,20


In [8]:

# Step 1: clean and tokenize the raw text of every row
df['Cleaned_Tokens'] = df['Text'].apply(clean_text)

# Step 2: remove stop words and short tokens (reads the column produced by step 1)
df['Filtered_Tokens'] = df['Cleaned_Tokens'].apply(filter_stopwords)

# Step 3: lemmatize the remaining tokens (reads the column produced by step 2)
df['Lemmatized_Tokens'] = df['Filtered_Tokens'].apply(lemmatize_tokens)


In [None]:

# Save the result as a CSV file; adjust the output path as needed.
# index=False keeps the row index out of the file. Writing the index adds an
# unnamed leading column that pd.read_csv loads back as "Unnamed: 0"; the
# "Unnamed: 0*" columns in the loaded dataset look like the result of such
# round-trips.
# NOTE(review): the token-list columns are presumably written as their string
# representation, so they will need parsing when the file is read back.
csv_path = r'C:\Users\ASUS\Desktop\LAD\sentimentdataset_preprocessed.csv'
df.to_csv(csv_path, index=False)

In [9]:
# Inspect the final lemmatized token lists
df['Lemmatized_Tokens']

0                          [enjoy, beautiful, day, park]
1                           [traffic, terrible, morning]
2                               [finish, amaze, workout]
3                  [excited, upcoming, weekend, getaway]
4                    [try, new, recipe, dinner, tonight]
                             ...                        
727    [collaborate, science, project, receive, recog...
728    [attend, surprise, birthday, party, organize, ...
729    [successfully, fundraise, school, charity, ini...
730    [participate, multicultural, festival, celebra...
731    [organize, virtual, talent, show, challenge, t...
Name: Lemmatized_Tokens, Length: 732, dtype: object