In [6]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer

In [7]:
data = pd.read_csv('Tweets.csv')

In [8]:
data['text'] = data['text'].apply(lambda x: re.sub(r'@\w+', '', x))

In [9]:
data['text']

0                                              What  said.
1         plus you've added commercials to the experien...
2         I didn't today... Must mean I need to take an...
3         it's really aggressive to blast obnoxious "en...
4                 and it's a really big bad thing about it
                               ...                        
14635     thank you we got on a different flight to Chi...
14636     leaving over 20 minutes Late Flight. No warni...
14637      Please bring American Airlines to #BlackBerry10
14638     you have my money, you change my flight, and ...
14639     we have 8 ppl so we need 2 know how many seat...
Name: text, Length: 14640, dtype: object

In [10]:
# Remove currency symbols and amounts
data['text'] = data['text'].apply(lambda x: re.sub(r'\$\d+(\.\d{2})?', '', x))

# Remove email addresses
data['text'] = data['text'].apply(lambda x: re.sub(r'\S+@\S+', '', x))

# Remove emojis
data['text'] = data['text'].apply(lambda x: re.sub(r'[^\w\s,]', '', x))

# Remove HTML escaped characters
data['text'] = data['text'].apply(lambda x: re.sub(r'&\w+;', '', x))

# Remove punctuation
data['text'] = data['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Remove times and dates
data['text'] = data['text'].apply(lambda x: re.sub(r'\d{1,2}/\d{1,2} \d{1,2}:\d{2}(am|pm)?', '', x))
data['text'] = data['text'].apply(lambda x: re.sub(r'\d{1,2}/\d{1,2}', '', x))
data['text'] = data['text'].apply(lambda x: re.sub(r'\d{1,2}:\d{2} (AM|PM)', '', x))

# Remove URLs
data['text'] = data['text'].apply(lambda x: re.sub(r'http\S+', '', x))

# Lemmatize verbs
lemmatizer = WordNetLemmatizer()
data['text'] = data['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word, 'v') for word in x.split()]))

# Convert to lowercase
data['text'] = data['text'].apply(lambda x: x.lower())

# Remove extra spaces
data['text'] = data['text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# Remove numbers
data['text'] = data['text'].apply(lambda x: re.sub(r'\d+', '', x))

# Remove single characters
data['text'] = data['text'].apply(lambda x: re.sub(r'\b\w\b', '', x))

# Convert text to lowercase
data['text'] = data['text'].str.lower()

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Define a set of stop words
stop_words = set(stopwords.words('english'))

# Remove stop words from the text
data['text'] = data['text'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_words])
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Stell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
data['text']

0                                                      say
1              plus youve add commercials experience tacky
2             didnt today must mean need take another trip
3        really aggressive blast obnoxious entertainmen...
4                                     really big bad thing
                               ...                        
14635                   thank get different flight chicago
14637            please bring american airlines blackberry
14638    money change flight dont answer phone suggesti...
14639    ppl need know many seat next flight plz put us...
Name: text, Length: 14640, dtype: object

In [12]:
# Remove rows with empty tweets
data = data[data['text'].str.strip() != '']

# Remove duplicate rows based on 'text' and 'airline_sentiment'
data = data.drop_duplicates(subset=['text', 'airline_sentiment'])

# Display the cleaned data
data

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0000,,,Virgin America,,cairdin,,0,say,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,plus youve add commercials experience tacky,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,didnt today must mean need take another trip,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,really aggressive blast obnoxious entertainmen...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,really big bad thing,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,569587686496825344,positive,0.3487,,0.0000,American,,KristenReenders,,0,thank get different flight chicago,,2015-02-22 12:01:01 -0800,,
14636,569587371693355008,negative,1.0000,Customer Service Issue,1.0000,American,,itsropes,,0,leave minutes late flight warnings communicati...,,2015-02-22 11:59:46 -0800,Texas,
14637,569587242672398336,neutral,1.0000,,,American,,sanyabun,,0,please bring american airlines blackberry,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",
14638,569587188687634433,negative,1.0000,Customer Service Issue,0.6659,American,,SraJackson,,0,money change flight dont answer phone suggesti...,,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada)


In [13]:
data.to_csv('cleaned_tweets.csv', index=False)