# Kaggle Challenge

#### Ashton Prescott and Juan Lucena Fois

## PART I

In [None]:
import nltk
# Download each NLTK resource in turn (presumably needed by lemmatizer.py —
# confirm against that module).
for resource_name in (
    'stopwords',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource_name)

### Section A

##### a)

In [None]:
import pandas as pd

def _decode_escapes(text):
    r"""Expand literal backslash escapes (``\n``, ``\u00e9``, ...) found in *text*.

    The ``unicode_escape`` codec decodes its input bytes as Latin-1, so encoding
    with UTF-8 first would turn every non-ASCII character into mojibake. Encoding
    with Latin-1 plus ``backslashreplace`` keeps Latin-1 characters as single
    bytes and rewrites everything else as an escape the decoder restores.

    Non-string values (e.g. NaN from empty cells) yield an empty string. Text
    with a malformed escape sequence is returned unchanged rather than raising.
    """
    if not isinstance(text, str):
        return ''
    try:
        return text.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        # e.g. a trailing lone backslash: keep the raw text instead of aborting the run
        return text


# Load the Excel file
df = pd.read_excel('training_data.xlsx')

# Decode the 'full_text' column without corrupting non-ASCII characters
df['decoded_text'] = df['full_text'].apply(_decode_escapes)

# Tweet length metrics: character count and whitespace-separated word count
df['tweet_characters'] = df['decoded_text'].apply(len)
df['tweet_words'] = df['decoded_text'].apply(lambda x: len(x.split()))

# Hashtags are split on whitespace; missing values become an empty list
df['hashtag_list'] = df['hashtags'].apply(lambda x: x.split() if isinstance(x, str) else [])
df['hashtag_characters'] = df['hashtag_list'].apply(lambda tags: sum(len(tag) for tag in tags))
df['hashtag_words'] = df['hashtag_list'].apply(len)

# Descriptive statistics, one row per metric. `summary` stays a dict of lists
# because a later cell appends the clean-text rows to it.
metric_columns = {
    'Tweet Characters': 'tweet_characters',
    'Tweet Words': 'tweet_words',
    'Hashtag Characters': 'hashtag_characters',
    'Hashtag Words': 'hashtag_words',
}
summary = {
    'Metric': list(metric_columns),
    'Minimum': [df[column].min() for column in metric_columns.values()],
    'Average': [df[column].mean() for column in metric_columns.values()],
    'Median': [df[column].median() for column in metric_columns.values()],
    'Maximum': [df[column].max() for column in metric_columns.values()],
}

summary_df = pd.DataFrame(summary)

# Display the summary table
print(summary_df)

##### b)

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# Top 10 (hashtag, count) pairs per country, keyed by country
top_hashtags_by_country = {}

for country, group in df.groupby('country_user'):
    # Count every hashtag used by this country's tweets
    hashtag_counts = Counter(tag for tags in group['hashtag_list'] for tag in tags)

    top_10 = hashtag_counts.most_common(10)
    top_hashtags_by_country[country] = top_10

    if not top_10:
        # zip(*[]) yields nothing to unpack, so a country whose tweets carry
        # no hashtags would otherwise abort the loop with a ValueError.
        print(f"No hashtags found for {country}; skipping pie chart.")
        continue

    # One pie chart per country
    labels, values = zip(*top_10)
    plt.figure(figsize=(8, 8))
    plt.pie(values, labels=labels, autopct='%1.1f%%', startangle=140)
    plt.title(f"Top 10 Hashtags in {country}")
    plt.show()

##### c)

In [None]:
# Count tweets per (country, political view); missing combinations become 0
political_views_by_country = df.groupby(['country_user', 'pol_spec_user']).size().unstack(fill_value=0)

# Convert each country's row to percentages (0-100) so the values match the
# "Percentage" axis label; the plain row-wise division only gave 0-1 fractions.
country_totals = political_views_by_country.sum(axis=1)
political_views_percentages = political_views_by_country.div(country_totals, axis=0).mul(100)

# Plot the stacked bar chart
ax = political_views_percentages.plot(
    kind='bar',
    stacked=True,
    figsize=(12, 8),
    edgecolor='black'
)

# Add titles and labels
plt.title('Percentage Distribution of Political Views by Country', fontsize=16)
plt.ylabel('Percentage', fontsize=14)
plt.xlabel('Country', fontsize=14)
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)

# Add legend and grid
plt.legend(title='Political View', fontsize=12, title_fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Display the plot
plt.tight_layout()
plt.show()

##### d)

In [None]:
# Count tweets per (country, gender); missing combinations become 0
gender_by_country = df.groupby(['country_user', 'gender_user']).size().unstack(fill_value=0)

# Convert each country's row to percentages (0-100) so the values match the
# "Percentage" axis label; the plain row-wise division only gave 0-1 fractions.
gender_totals = gender_by_country.sum(axis=1)
gender_percentages = gender_by_country.div(gender_totals, axis=0).mul(100)

# Plot the stacked bar chart
ax = gender_percentages.plot(
    kind='bar',
    stacked=True,
    figsize=(12, 8),
    cmap='coolwarm',  # Use a distinct colormap for genders
    edgecolor='black'
)

# Add titles and labels
plt.title('Percentage Distribution of Gender by Country', fontsize=16)
plt.ylabel('Percentage', fontsize=14)
plt.xlabel('Country', fontsize=14)
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)

# Add legend and grid
plt.legend(title='Gender', fontsize=12, title_fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Display the plot
plt.tight_layout()
plt.show()


### Section B

##### a)

In [None]:
# Import the lemmatization function from lemmatizer.py
from lemmatizer import lemmatize_tweet

# import pandas as pd
# from langdetect import detect
# from langdetect.lang_detect_exception import LangDetectException
# from googletrans import Translator

# translator = Translator()

# # Function to detect language safely
# def detect_language_safe(text):
#     try:
#         return detect(text)
#     except LangDetectException:
#         return "unknown"

# # Add language detection
# df['lang'] = df['decoded_text'].apply(detect_language_safe)

# # Translate and analyze sentiment
# def translate_and_analyze(tweet, lang):
#     if lang != 'en' and lang != "unknown":  # Translate only if not English and language is detected
#         tweet = translator.translate(tweet, src=lang, dest='en').text

# # Translate tweet
# df['translated_text'] = df.apply(lambda row: translate_and_analyze(row['decoded_text'], row['lang']), axis=1)

# Apply the lemmatizer function to the decoded tweet text
df['text_clean'] = df['decoded_text'].apply(lemmatize_tweet)

# Character and whitespace-separated word counts for the cleaned text
df['clean_text_characters'] = df['text_clean'].apply(len)
df['clean_text_words'] = df['text_clean'].apply(lambda x: len(x.split()))

# Add the clean-text rows to the shared `summary` dict. Rows already present
# (the cell was run before) are refreshed in place instead of appended again,
# so re-running the cell no longer duplicates them.
clean_metric_columns = {
    'Clean Text Characters': 'clean_text_characters',
    'Clean Text Words': 'clean_text_words',
}
statistic_methods = {
    'Minimum': 'min',
    'Average': 'mean',
    'Median': 'median',
    'Maximum': 'max',
}
for metric_name, column in clean_metric_columns.items():
    statistics = {
        label: getattr(df[column], method)()
        for label, method in statistic_methods.items()
    }
    if metric_name in summary['Metric']:
        position = summary['Metric'].index(metric_name)
        for label, value in statistics.items():
            summary[label][position] = value
    else:
        summary['Metric'].append(metric_name)
        for label, value in statistics.items():
            summary[label].append(value)

# Convert the expanded summary into a DataFrame
expanded_summary_df = pd.DataFrame(summary)

# Display the expanded summary table
print(expanded_summary_df)

# Export the results to a CSV file (with a header row, without the index)
df.to_csv('clean_train.csv', index=False, header=True)


##### b)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

# Number of topics extracted by both models
n_topics = 10

# Both vectorizers share the same vocabulary cap and English stop-word list
vectorizer_options = {"max_features": 5000, "stop_words": 'english'}

# Raw term counts feed LDA
tf_vectorizer = CountVectorizer(**vectorizer_options)
tf_matrix = tf_vectorizer.fit_transform(df['text_clean'])

lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda_topics = lda.fit_transform(tf_matrix)

# TF-IDF weights feed NMF
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text_clean'])

nmf = NMF(n_components=n_topics, random_state=42)
nmf_topics = nmf.fit_transform(tfidf_matrix)

# Function to display top words for each topic
def display_topics(model, feature_names, num_words=10):
    """Return a table of the highest-weighted words for each topic of *model*.

    Args:
        model: Fitted object exposing ``components_``, iterated row by row;
            each row holds one weight per entry of ``feature_names``.
        feature_names: Sequence mapping a column index to its word.
        num_words: Maximum number of words to list per topic.

    Returns:
        A DataFrame indexed ``"Topic 1"``, ``"Topic 2"``, ... with columns
        ``"Word 1"``, ``"Word 2"``, ... ordered from highest to lowest weight.
        When fewer than ``num_words`` features exist, the table simply has
        fewer columns instead of raising on a column-count mismatch.
    """
    # Cap at the vocabulary size so column labels always match the row width
    width = max(0, min(num_words, len(feature_names)))
    topics = {
        f"Topic {topic_idx + 1}": [
            feature_names[i] for i in topic.argsort()[::-1][:width]
        ]
        for topic_idx, topic in enumerate(model.components_)
    }
    column_labels = [f"Word {i + 1}" for i in range(width)]
    return pd.DataFrame.from_dict(topics, orient='index', columns=column_labels)

# Build one top-words table per model, then show each under its heading
lda_topics_df = display_topics(lda, tf_vectorizer.get_feature_names_out())
nmf_topics_df = display_topics(nmf, tfidf_vectorizer.get_feature_names_out())

for heading, topic_table in (
    ("Top words for each LDA topic:", lda_topics_df),
    ("Top words for each NMF topic:", nmf_topics_df),
):
    print(heading)
    display(topic_table)

## PART II

In [None]:
import pandas as pd
from lemmatizer import tweet_cleaner

# Read the Part I training export and the test spreadsheet
train_data = pd.read_csv("clean_train.csv")
test_data = pd.read_excel("test_data.xlsx")

# Replace missing tweets with empty strings, force str, then run tweet_cleaner
for frame in (train_data, test_data):
    frame['full_text'] = frame['full_text'].fillna("").astype(str).apply(tweet_cleaner)

# Persist both cleaned frames for the training cells below
test_data.to_csv("test_clean.csv", index=False, header=True)
train_data.to_csv("train_clean.csv", index=False, header=True)

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: ", device)

# Load the cleaned data written by the previous cell
train_data = pd.read_csv("train_clean.csv")
test_data = pd.read_csv("test_clean.csv")

# Fill missing tweets BEFORE casting to str: astype(str) turns NaN into the
# literal text "nan", after which fillna("") has nothing left to replace.
train_data['full_text'] = train_data['full_text'].fillna("").astype(str)
test_data['full_text'] = test_data['full_text'].fillna("").astype(str)

# Encode the political-spectrum labels as integer class ids
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['pol_spec_user'])

# Use a smaller multilingual model
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

# Create Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_data[['full_text', 'label']])
test_dataset = Dataset.from_pandas(test_data[['Id', 'full_text']])  # Keep Id column for final output

# Tokenize the datasets
def tokenize_function(examples):
    """Tokenize the ``full_text`` entries of *examples* with the module-level tokenizer.

    Truncation and ``max_length`` padding are both requested with a limit of 128.
    """
    texts = examples["full_text"]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)

# Training split: tokenize, drop the raw text, expose the target as "labels"
train_dataset = (
    train_dataset.map(tokenize_function, batched=True)
    .remove_columns(["full_text"])
    .rename_column("label", "labels")
)
train_dataset.set_format("torch")

# Test split: tokenize and drop the raw text (there is no label column to rename)
test_dataset = test_dataset.map(tokenize_function, batched=True).remove_columns(["full_text"])
test_dataset.set_format("torch")

# Run configuration: short schedule, periodic step-based checkpoints
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=2,
    per_device_train_batch_size=32,
    warmup_steps=100,
    weight_decay=0.01,
    logging_steps=50,
    fp16=False,  # set to True to use mixed precision on GPU
    save_strategy="steps",
    save_steps=500,  # write a checkpoint every 500 steps
    save_total_limit=5,  # cap the number of checkpoints kept at 5
)

# No eval_dataset is supplied, so this run only trains
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

trainer.train()

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: ", device)

# Load the cleaned data
train_data = pd.read_csv("train_clean.csv")
test_data = pd.read_csv("test_clean.csv")

# Fill missing tweets BEFORE casting to str: astype(str) turns NaN into the
# literal text "nan", after which fillna("") has nothing left to replace.
train_data['full_text'] = train_data['full_text'].fillna("").astype(str)
test_data['full_text'] = test_data['full_text'].fillna("").astype(str)

# Encode the political-spectrum labels as integer class ids
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['pol_spec_user'])

# Use a smaller multilingual model
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

# Tokenize datasets
def tokenize_function(examples):
    """Run the module-level tokenizer over the ``full_text`` field of *examples*.

    Requests truncation and ``max_length`` padding, both with a limit of 128.
    """
    tokenizer_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(examples["full_text"], **tokenizer_options)

# Hold out 20% of the labelled tweets for validation (fixed seed)
features = train_data['full_text']
targets = train_data['label']
train_texts, val_texts, train_labels, val_labels = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)


def _to_labelled_dataset(texts, labels):
    """Build a tokenized, torch-formatted Dataset with a ``labels`` column."""
    dataset = Dataset.from_dict({"full_text": texts, "label": labels})
    dataset = dataset.map(tokenize_function, batched=True)
    dataset = dataset.rename_column("label", "labels")
    dataset = dataset.remove_columns(["full_text"])
    dataset.set_format("torch")
    return dataset


train_dataset = _to_labelled_dataset(train_texts, train_labels)
val_dataset = _to_labelled_dataset(val_texts, val_labels)

# Test split has no labels: tokenize, drop the raw text, keep the Id column
test_dataset = (
    Dataset.from_pandas(test_data[['Id', 'full_text']])
    .map(tokenize_function, batched=True)
    .remove_columns(["full_text"])
)
test_dataset.set_format("torch")

import evaluate

# Accuracy metric object used by compute_metrics below
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for one evaluation pass.

    *eval_pred* unpacks into ``(logits, labels)``; the predicted class is the
    argmax of the logits over the last axis.
    """
    scores, references = eval_pred
    predicted_ids = scores.argmax(axis=-1)
    result = accuracy_metric.compute(predictions=predicted_ids, references=references)
    return {"accuracy": result["accuracy"]}

# Run configuration: evaluate and checkpoint once per epoch, then reload the
# checkpoint with the best validation accuracy when training finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_config = {
    "output_dir": "./results",
    "logging_dir": "./logs",
    "num_train_epochs": 5,
    "per_device_train_batch_size": 32,
    "warmup_steps": 100,
    "weight_decay": 0.01,
    "logging_steps": 1000,  # log every 1000 steps
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 5,  # cap the number of checkpoints kept at 5
    "load_best_model_at_end": True,
    "metric_for_best_model": "accuracy",
    "greater_is_better": True,  # higher accuracy is better
    "seed": 42,
}
training_args = TrainingArguments(**training_config)

# Trainer with a validation set and the custom accuracy metric
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Start from scratch rather than resuming from a saved checkpoint
trainer.train(resume_from_checkpoint=False)

Using device:  cuda


In [28]:
# Single source of truth for the output file, so the confirmation message
# below can never name a different file than the one actually written.
output_path = "predictions_v3.csv"

# Predict on the test set and take the highest-scoring class per row
predictions = trainer.predict(test_dataset)
predicted_classes = torch.argmax(torch.tensor(predictions.predictions), axis=1)

# Map integer class ids back to the original label values
predicted_labels = label_encoder.inverse_transform(predicted_classes.numpy())

# Add predictions to test_data and save only the Id and prediction columns
test_data['predicted_label'] = predicted_labels
test_data[['Id', 'predicted_label']].to_csv(output_path, index=False)

print(f"Predictions saved to '{output_path}'.")


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

Predictions saved to 'predictions.csv'.
