In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('Tweets.csv')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Find the number of unique users
unique_users = data['name'].nunique()
print(f'Number of unique users: {unique_users}')

# Compute top-5 words from their tweets using TF-IDF approach
def get_top_n_words(tfidf_matrix, feature_names, n=5):
    top_n_words = []
    for row in tfidf_matrix:
        sorted_indices = np.argsort(row)[::-1][:n]
        top_n_words.append([feature_names[i] for i in sorted_indices])
    return top_n_words

# Group tweets by user
grouped_tweets = data.groupby('name')['text'].apply(lambda x: ' '.join(x)).reset_index()

# Apply TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(grouped_tweets['text']).toarray()
feature_names = vectorizer.get_feature_names_out()

# Get top-5 words for each user
grouped_tweets['top_5_words'] = get_top_n_words(tfidf_matrix, feature_names)
print(grouped_tweets[['name', 'top_5_words']])

Number of unique users: 7701
               name                                        top_5_words
0     0504Traveller  [usatoday, http, southwestair, p8vcz4xthm, 6rl...
1          09202010               [baggages, 699, rdu, la, vu5lbzxtrx]
2      0veranalyser       [runways, flightled, flights, note, showing]
3           0xjared            [vibe, depends, fair, getting, jetblue]
4           10Eshaa        [lmfaooooo, 4llwi5oxvo, hook, fleek, fleet]
...             ...                                                ...
7696  zombiesausage             [sms, damn, impressed, worth, updates]
7697      zozo24dad           [2413, la, delayed, americanair, flight]
7698       zsalim03      [1hr, aa2450, arpt, vanessaannz, inclemental]
7699       zslick99  [ticketing, unmanned, unorganized, stations, c...
7700      zupshawrl  [luv, feltthelove, television, commercials, tr...

[7701 rows x 2 columns]


In [4]:
# Find the most active user for each airline
most_active_users = data.groupby('airline')['name'].agg(lambda x: x.value_counts().idxmax()).reset_index()

# Merge with the original data to get their tweets, tweeting location, and tweet sentiment
most_active_users_data = pd.merge(most_active_users, data, on=['airline', 'name'], how='left')

# Select relevant columns
most_active_users_data = most_active_users_data[['airline', 'name', 'text', 'tweet_location', 'airline_sentiment']]

print(most_active_users_data)

            airline     name  \
0          American  otisday   
1          American  otisday   
2          American  otisday   
3          American  otisday   
4          American  otisday   
..              ...      ...   
165  Virgin America  wmrrock   
166  Virgin America  wmrrock   
167  Virgin America  wmrrock   
168  Virgin America  wmrrock   
169  Virgin America  wmrrock   

                                                  text tweet_location  \
0    @AmericanAir @praywinn between understaffing, ...          Pekin   
1    @AmericanAir @jameswester see, James, we only ...          Pekin   
2    @AmericanAir @cjdjpdx not a valid response in ...          Pekin   
3    @AmericanAir @kaps12 this is an international ...          Pekin   
4    @AmericanAir @PatrichRuben no, profit maximiza...          Pekin   
..                                                 ...            ...   
165  @VirginAmerica on flight VX399 headed to LA fr...             CT   
166  @VirginAmerica You should 

In [5]:
# Find the number of missing values for tweet_location and user_timezone
missing_tweet_location = data['tweet_location'].isna().sum()
missing_user_timezone = data['user_timezone'].isna().sum()

print(f'Missing values in tweet_location: {missing_tweet_location}')
print(f'Missing values in user_timezone: {missing_user_timezone}')

# Drop rows with missing values in tweet_location and user_timezone
data_cleaned = data.dropna(subset=['tweet_location', 'user_timezone'])

print(f'Number of rows after dropping missing values: {data_cleaned.shape[0]}')

Missing values in tweet_location: 4733
Missing values in user_timezone: 4820
Number of rows after dropping missing values: 7758


In [6]:
# Parse the tweet_created field as datetime
data['tweet_created'] = pd.to_datetime(data['tweet_created'])

# Verify the changes
print(data['tweet_created'].dtypes)
print(data['tweet_created'].head())

datetime64[ns, UTC-08:00]
0   2015-02-24 11:35:52-08:00
1   2015-02-24 11:15:59-08:00
2   2015-02-24 11:15:48-08:00
3   2015-02-24 11:15:36-08:00
4   2015-02-24 11:14:45-08:00
Name: tweet_created, dtype: datetime64[ns, UTC-08:00]


In [7]:
import re

# Define a list of possible variations of "Philadelphia"
philadelphia_variations = [
    r'philadelphia', r'philly', r'phila', r'phillydelphia', r'phillydelphia', r'phillydelphia', r'phillydelphia'
]

# Create a regex pattern to match any of the variations
pattern = re.compile('|'.join(philadelphia_variations), re.IGNORECASE)

# Find all tweets from Philadelphia
philadelphia_tweets = data[data['tweet_location'].str.contains(pattern, na=False)]

# Print the total number of tweets from Philadelphia
total_philadelphia_tweets = philadelphia_tweets.shape[0]
print(f'Total number of tweets from Philadelphia: {total_philadelphia_tweets}')

# Print all different spellings of Philadelphia found in the dataset
unique_spellings = philadelphia_tweets['tweet_location'].unique()
print(f'Different spellings of Philadelphia: {unique_spellings}')

Total number of tweets from Philadelphia: 92
Different spellings of Philadelphia: ['Philadelphia, Pa' 'Philly Burbs' 'Los Angeles, CA (via Philly)'
 'Phila. PA' 'Philadelphia, PA' 'Philadelphia PA ' 'Philadelphia PA'
 'Philly' 'Philadelphia/Cali' 'Philadelphia' 'philadephia, pa'
 'Philadelphia, PA USA' 'Philly Yo' 'Philly Area'
 'Philly, Chicago, MSP, Vegas' 'Philly to NY/NJ' 'Phila, Princeton, NYC. '
 'Old City Philly' 'philadelphia, pa' 'Los Angeles by way of Philly'
 'Philadelphia, USA' 'philadelphia' 'Philadelphia Suburbs' 'Phila, PA']


In [8]:
# Create a subset of the dataset
subset_data = data[data['airline_sentiment_confidence'] > 0.6]

# Save the subset to a CSV file
subset_data.to_csv('subset_data.csv', index=False)