In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import re
from nltk.stem import WordNetLemmatizer

# Load the dataset
file_path = './Tweets.csv'
tweets_data = pd.read_csv(file_path)

# 1. Find number of unique users and compute top-5 words from their tweets using TF-IDF
unique_users = tweets_data['name'].nunique()
print(f"Number of unique users: {unique_users}")

# Create a dictionary to store top-5 words for each user
user_top_words = {}
for user in tweets_data['name'].unique():
    user_tweets = tweets_data[tweets_data['name'] == user]['text']
    if not user_tweets.empty:
        tfidf = TfidfVectorizer(stop_words='english', max_features=5)
        tfidf_matrix = tfidf.fit_transform(user_tweets)
        top_words = tfidf.get_feature_names_out()
        user_top_words[user] = top_words

# 2. For each airline, find the most active users and their details
# 2. For each airline, find the most active users and their details
most_active_users_per_airline = {}
for airline in tweets_data['airline'].unique():
    airline_data = tweets_data[tweets_data['airline'] == airline]
    most_active_user = airline_data['name'].value_counts().idxmax()
    user_tweets = airline_data[airline_data['name'] == most_active_user]
    user_details = {
        "tweets": user_tweets['text'].tolist(),
        "locations": user_tweets['tweet_location'].tolist(),
        "sentiments": user_tweets['airline_sentiment'].tolist()
    }
    most_active_users_per_airline[airline] = {
        "user": most_active_user,
        "details": user_details
    }

# Display a sample result in a specific format
most_active_users_sample = {airline: details for airline, details in list(most_active_users_per_airline.items())[:3]}

# Print in the required format
for airline, info in most_active_users_sample.items():
    print(f"Airline: {airline}")
    print(f"  Most Active User: {info['user']}")
    print(f"  Tweets:")
    for tweet, location, sentiment in zip(
        info['details']['tweets'],  # Displaying first 5 tweets for brevity
        info['details']['locations'],
        info['details']['sentiments']
    ):
        print(f"    - {tweet}")
        print(f"    - {location}")
        print(f"    - {sentiment}")
        print()

# 3. Find missing values in 'tweet_location' and 'user_timezone'
missing_tweet_location = tweets_data['tweet_location'].isnull().sum()
missing_user_timezone = tweets_data['user_timezone'].isnull().sum()
print(f"Missing values in 'tweet_location': {missing_tweet_location}")
print(f"Missing values in 'user_timezone': {missing_user_timezone}")
tweets_data_cleaned = tweets_data.dropna(subset=['tweet_location', 'user_timezone'])

# 4. Parse 'tweet_created' as a date
tweets_data_cleaned['tweet_created'] = pd.to_datetime(tweets_data_cleaned['tweet_created'], errors='coerce')
print(f"tweet_created type: {tweets_data_cleaned['tweet_created'].dtype}")

# 5. Find tweets from Philadelphia with variations in spelling
philadelphia_variations = [
    "Philadelphia", "philadelphia", "Phila", "Philly", "philly",
    "Phildelphia", "Phildelhia", "Philadephia", "Philadelpia", "Philadlephia"
]
philadelphia_tweets = tweets_data_cleaned[
    tweets_data_cleaned['tweet_location'].str.contains('|'.join(philadelphia_variations), na=False, case=False)
]
total_philadelphia_tweets = philadelphia_tweets.shape[0]
unique_variations = philadelphia_tweets['tweet_location'].str.lower().unique()
print(f"Total Philadelphia tweets: {total_philadelphia_tweets}")
print(f"Unique variations: {unique_variations}")

# 6. Create a subset with 'airline_sentiment_confidence' > 0.6
confidence_subset = tweets_data_cleaned[tweets_data_cleaned['airline_sentiment_confidence'] > 0.6]
subset_file_path = './airline_sentiment_confidence_above_0_6.csv'
confidence_subset.to_csv(subset_file_path, index=False)
print(f"Subset saved to: {subset_file_path}, Total rows: {confidence_subset.shape[0]}")


Number of unique users: 7701
Airline: Virgin America
  Most Active User: wmrrock
  Tweets:
    - @VirginAmerica cool picture of another VirginAmerica plane off our wing. What a site! http://t.co/5B2agFd8c4
    - CT
    - positive

    - @VirginAmerica seats in Row 8 don't recline should mention that on your website #soreback
    - CT
    - negative

    - @VirginAmerica flight 404 delayed 2 hours in LA due to mechanical problems. Handle like pros but you could have tossed us a free drink.
    - CT
    - negative

    - @VirginAmerica on VX399 from JFK to LA - dirty plane - not up to your standards.
    - CT
    - negative

    - @VirginAmerica on flight VX399 headed to LA from JFK - dirtiest VA plane I have ever been on. Sad for a great airline.
    - CT
    - negative

    - @VirginAmerica You should still develop an app - then you will be my favorite airline.
    - CT
    - neutral

    - @VirginAmerica got it. All set - Thanks!
    - CT
    - positive

    - @VirginAmerica Only thin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_data_cleaned['tweet_created'] = pd.to_datetime(tweets_data_cleaned['tweet_created'], errors='coerce')


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import re
from nltk.stem import WordNetLemmatizer

# Load the dataset
file_path = './Tweets.csv'
tweets_data = pd.read_csv(file_path)

# 1. Find number of unique users and compute top-5 words from their tweets using TF-IDF
unique_users = tweets_data['name'].nunique()
print(f"Number of unique users: {unique_users}")

# Create a dictionary to store top-5 words for each user
user_top_words = {}
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5)

for user in tweets_data['name'].unique():
    user_tweets = tweets_data[tweets_data['name'] == user]['text']
    if not user_tweets.empty:
        # Combine all tweets from the same user into one document
        combined_tweets = " ".join(user_tweets)
        tfidf_matrix = tfidf_vectorizer.fit_transform([combined_tweets])  # Fit on combined tweets
        top_words = tfidf_vectorizer.get_feature_names_out()
        user_top_words[user] = top_words

# Display top-5 words for a sample user
sample_user = list(user_top_words.keys())[:10]
print(f"Sample User: {sample_user}, Top-5 Words: {user_top_words[sample_user]}")

# # 2. For each airline, find the most active users and their details
# most_active_users_per_airline = {}
# for airline in tweets_data['airline'].unique():
#     airline_data = tweets_data[tweets_data['airline'] == airline]
#     most_active_user = airline_data['name'].value_counts().idxmax()
#     user_tweets = airline_data[airline_data['name'] == most_active_user]
#     user_details = {
#         "tweets": user_tweets['text'].tolist(),
#         "locations": user_tweets['tweet_location'].tolist(),
#         "sentiments": user_tweets['airline_sentiment'].tolist()
#     }
#     most_active_users_per_airline[airline] = {
#         "user": most_active_user,
#         "details": user_details
#     }

# # Display a sample result in a specific format
# most_active_users_sample = {airline: details for airline, details in list(most_active_users_per_airline.items())[:3]}

# # Print in the required format
# for airline, info in most_active_users_sample.items():
#     print(f"Airline: {airline}")
#     print(f"  Most Active User: {info['user']}")
#     print(f"  Tweets:")
#     for tweet, location, sentiment in zip(
#         info['details']['tweets'],  # Displaying first 5 tweets for brevity
#         info['details']['locations'],
#         info['details']['sentiments']
#     ):
#         print(f"    - {tweet}")
#         print(f"    - {location}")
#         print(f"    - {sentiment}")
#         print()

# # 3. Find missing values in 'tweet_location' and 'user_timezone'
# missing_tweet_location = tweets_data['tweet_location'].isnull().sum()
# missing_user_timezone = tweets_data['user_timezone'].isnull().sum()
# print(f"Missing values in 'tweet_location': {missing_tweet_location}")
# print(f"Missing values in 'user_timezone': {missing_user_timezone}")
# tweets_data_cleaned = tweets_data.dropna(subset=['tweet_location', 'user_timezone'])

# # 4. Parse 'tweet_created' as a date
# tweets_data_cleaned['tweet_created'] = pd.to_datetime(tweets_data_cleaned['tweet_created'], errors='coerce')
# print(f"tweet_created type: {tweets_data_cleaned['tweet_created'].dtype}")

# # 5. Find tweets from Philadelphia with variations in spelling
# philadelphia_variations = [
#     "Philadelphia", "philadelphia", "Phila", "Philly", "philly",
#     "Phildelphia", "Phildelhia", "Philadephia", "Philadelpia", "Philadlephia"
# ]
# philadelphia_tweets = tweets_data_cleaned[
#     tweets_data_cleaned['tweet_location'].str.contains('|'.join(philadelphia_variations), na=False, case=False)
# ]
# total_philadelphia_tweets = philadelphia_tweets.shape[0]
# unique_variations = philadelphia_tweets['tweet_location'].str.lower().unique()
# print(f"Total Philadelphia tweets: {total_philadelphia_tweets}")
# print(f"Unique variations: {unique_variations}")

# # 6. Create a subset with 'airline_sentiment_confidence' > 0.6
# confidence_subset = tweets_data_cleaned[tweets_data_cleaned['airline_sentiment_confidence'] > 0.6]
# subset_file_path = './airline_sentiment_confidence_above_0_6_1206.csv'
# confidence_subset.to_csv(subset_file_path, index=False)
# print(f"Subset saved to: {subset_file_path}, Total rows: {confidence_subset.shape[0]}")


Number of unique users: 7701


TypeError: unhashable type: 'list'

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
file_path = './Tweets.csv'
tweets_data = pd.read_csv(file_path)

# Find number of unique users
unique_users = tweets_data['name'].nunique()
print(f"Number of unique users: {unique_users}")

# Create a dictionary to store top-5 words for each user
user_top_words = {}
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5)

# Compute top-5 words for each unique user
for user in tweets_data['name'].unique():
    # Get all tweets from the current user
    user_tweets = tweets_data[tweets_data['name'] == user]['text']
    if not user_tweets.empty:
        # Combine all tweets of the user into a single document
        combined_tweets = " ".join(user_tweets)
        # Compute TF-IDF on the combined tweets
        tfidf_matrix = tfidf_vectorizer.fit_transform([combined_tweets])
        # Get top-5 words based on TF-IDF scores
        top_words = tfidf_vectorizer.get_feature_names_out()
        user_top_words[user] = top_words

# Print top-5 words for the first 5 users
print("\nTop-5 Words for First 5 Users:")
for idx, (user, words) in enumerate(user_top_words.items()):
    print(f"User: {user}")
    print(f"Top-5 Words: {', '.join(words)}")
    print()
    if idx == 4:  # Stop after printing 5 users
        break


Number of unique users: 7701

Top-5 Words for First 5 Users:
User: cairdin
Top-5 Words: dhepburn, said, virginamerica

User: jnardino
Top-5 Words: bad, really, thing, va, virginamerica

User: yvonnalynn
Top-5 Words: didn, mean, need, today, trip

User: cjmcginnis
Top-5 Words: away, ear, fly, nearly, time

User: pilot
Top-5 Words: hats, https, men, missed, mwpg7grezp

