In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Load the dataset
# The Sentiment140 training CSV is distributed without a header row (verify
# against your copy of the file). Reading it with pandas' default header
# inference would consume the first record as column labels and then lose it
# when the columns are renamed, so declare header=None and name the columns
# in the same call.
data = pd.read_csv(
    "/content/training.1600000.processed.noemoticon.csv",
    encoding='latin-1',
    header=None,
    names=['target', 'id', 'date', 'query', 'user', 'tweet'],
)

In [None]:
# Peek at the first five rows to sanity-check the parsed columns.
data.head(n=5)

In [None]:
# Data Preprocessing
# Fetch the NLTK resources used by preprocess_tweet: the stop-word corpus and
# the Punkt tokenizer data that nltk.word_tokenize depends on.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from the 'punkt_tab' package rather
# than the pickled 'punkt' models; download both so word_tokenize works on
# either version (on older releases this call is at worst a logged no-op).
nltk.download('punkt_tab')

# A set gives O(1) membership tests inside the per-token filter.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [None]:
# Function to preprocess a single tweet
def preprocess_tweet(tweet):
    """Normalise one raw tweet into a space-joined string of stemmed tokens.

    Pipeline: blank out URLs, @mentions and every non-word character,
    lowercase, tokenise with ``nltk.word_tokenize``, drop tokens found in the
    module-level ``stop_words`` set, and stem the rest with the module-level
    ``stemmer``.

    Args:
        tweet (str): Raw tweet text.

    Returns:
        str: Surviving stemmed tokens joined by single spaces. Empty string
        when nothing survives the filtering.
    """
    # The substitution runs before lowercasing, so match case-insensitively:
    # otherwise "HTTP://..." or "Www...." is not recognised as a URL and its
    # pieces ("http", "com", ...) leak into the token stream.
    tweet = re.sub(r"http\S+|www\S+|@\S+|\W", " ", tweet, flags=re.IGNORECASE)
    tweet = tweet.lower()
    tokens = nltk.word_tokenize(tweet)
    # Filter stop words first, then stem only what is kept.
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)

In [None]:
# Run the cleaning pipeline over every tweet and keep the result next to the
# raw text in a new column.
data['preprocessed_tweet'] = data['tweet'].map(preprocess_tweet)

In [None]:
# Feature extraction: learn the TF-IDF vocabulary on the cleaned tweets and
# turn them into a document-term matrix in one pass. The fitted vectorizer is
# kept so new text can be projected into the same feature space later.
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(data.loc[:, 'preprocessed_tweet'])

In [None]:
# Hold out 20% of the rows (fixed seed) and fit a logistic-regression model on
# the remainder.
# NOTE(review): the label vector here is the raw tweet text, so every distinct
# tweet becomes its own class. With ~1.6M rows that is presumably infeasible
# in both memory and time -- confirm whether data['target'] was intended.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    data['tweet'],
    test_size=0.2,
    random_state=42,
)
model = LogisticRegression()
model.fit(X_train, y_train)


In [None]:
'''
def generate_tweet(seed_sentence):
    seed_sentence = preprocess_tweet(seed_sentence)
    seed_feature = vectorizer.transform([seed_sentence])
    predicted_target = model.predict(seed_feature)
    generated_target = int(predicted_target[0])
    # Filter tweets with the generated target
    filtered_tweets = data[data['target'] == generated_target]['tweet']
    if len(filtered_tweets) == 0:
        return "No matching tweet found for the generated target."
    else:
        # Randomly select a tweet from the filtered tweets
        generated_tweet = filtered_tweets.sample().iloc[0]
        return generated_tweet
        
        '''

def generate_tweet(seed_sentence):
    """Return the model's prediction for a seed string.

    The seed goes through the same ``preprocess_tweet`` cleaning and the
    already-fitted ``vectorizer`` as the training data, then ``model.predict``
    is called on the resulting single-row feature matrix.

    Args:
        seed_sentence (str): Free text used to prompt the model.

    Returns:
        The first (and only) element of the prediction array, i.e. one of the
        labels the model was fitted on.
    """
    cleaned_seed = preprocess_tweet(seed_sentence)
    # transform() expects an iterable of documents, hence the one-item list.
    seed_vector = vectorizer.transform([cleaned_seed])
    return model.predict(seed_vector)[0]
        

In [None]:
# Smoke-test the generator with a minimal seed.
seed_sentence = 'the '
generated_tweet = generate_tweet(seed_sentence)
# The model is fitted with tweet text as its labels, so the prediction is a
# tweet rather than a sentiment target; label the output accordingly.
print("Generated tweet:", generated_tweet)

In [None]:
def calculate_bleu_score(reference, candidate):
    """Sentence-level BLEU of ``candidate`` against a single ``reference``.

    Both strings are split on whitespace; the score is computed with NLTK's
    ``sentence_bleu`` using ``SmoothingFunction().method1``.

    Args:
        reference (str): Ground-truth text.
        candidate (str): Generated text to score.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    smoother = SmoothingFunction().method1
    # sentence_bleu takes a list of references, so wrap the single one.
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=smoother,
    )

def calculate_rouge_scores(reference, candidate):
    """Return the ROUGE-1, ROUGE-2 and ROUGE-L F-scores for one text pair.

    Args:
        reference (str): Ground-truth text.
        candidate (str): Generated text to score against the reference.

    Returns:
        tuple: ``(rouge_1_f, rouge_2_f, rouge_l_f)``. All three are ``0.0``
        when either input is empty or whitespace-only.
    """
    # The rouge package is known to raise on an empty hypothesis/reference
    # (verify against the installed version), which would abort a whole
    # evaluation loop over one degenerate pair. Nothing can overlap with an
    # empty text, so score such pairs as zero instead.
    if not reference.strip() or not candidate.strip():
        return 0.0, 0.0, 0.0

    # get_scores takes (hypothesis, reference) and returns one dict per pair.
    scores = Rouge().get_scores(candidate, reference)[0]
    return scores['rouge-1']['f'], scores['rouge-2']['f'], scores['rouge-l']['f']

# Generate a set of tweets and evaluate them
# Seed the sampling (same seed as the train/test split) so the evaluation
# subset, and therefore the reported averages, are reproducible across runs.
# NOTE(review): references are drawn from the full dataset, which includes
# rows the model was trained on -- confirm whether sampling should be
# restricted to held-out rows.
reference_tweets = data['tweet'].sample(100, random_state=42)

# Prompt the generator with the first 20 characters of each reference tweet.
generated_tweets = [
    generate_tweet(reference_tweet[:20])
    for reference_tweet in reference_tweets
]

# Per-pair metric accumulators; position i in every list refers to the same
# (reference, generated) pair.
bleu_scores, rouge_1_scores, rouge_2_scores, rouge_l_scores = [], [], [], []

for reference_tweet, generated_tweet in zip(reference_tweets, generated_tweets):
    # Score the pair with both metrics before recording anything.
    bleu_score = calculate_bleu_score(reference_tweet, generated_tweet)
    rouge_1_score, rouge_2_score, rouge_l_score = calculate_rouge_scores(
        reference_tweet, generated_tweet
    )

    bleu_scores.append(bleu_score)
    rouge_1_scores.append(rouge_1_score)
    rouge_2_scores.append(rouge_2_score)
    rouge_l_scores.append(rouge_l_score)

# Collapse each per-pair score list into its mean.
(
    average_bleu_score,
    average_rouge_1_score,
    average_rouge_2_score,
    average_rouge_l_score,
) = (
    np.mean(scores)
    for scores in (bleu_scores, rouge_1_scores, rouge_2_scores, rouge_l_scores)
)

# Report the four averages, one per line.
for label, value in (
    ("Average BLEU Score:", average_bleu_score),
    ("Average ROUGE-1 Score:", average_rouge_1_score),
    ("Average ROUGE-2 Score:", average_rouge_2_score),
    ("Average ROUGE-L Score:", average_rouge_l_score),
):
    print(label, value)