In [2]:
!pip install reportlab

Collecting reportlab
  Downloading reportlab-4.2.0-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: reportlab
Successfully installed reportlab-4.2.0


In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak
from reportlab.lib.styles import getSampleStyleSheet
from wordcloud import WordCloud
from textblob import TextBlob
import os

def load_data(filepath):
    df = pd.read_csv(filepath)
    return df

def generate_visualizations(df):
    sns.set_style("whitegrid")

    # Visualization 1: Distribution of Labels
    plt.figure(figsize=(6, 4))
    sns.countplot(x='label', data=df)
    plt.title('Distribution of Tweets by Label')
    plt.xlabel('Label')
    plt.ylabel('Count')
    plt.savefig('distribution_of_tweets_by_label.png')
    plt.close()

    # Visualization 2: Length of Tweets
    df['tweet_length'] = df['tweet'].apply(len)
    plt.figure(figsize=(10, 6))
    sns.histplot(df['tweet_length'], bins=30, kde=True)
    plt.title('Distribution of Tweet Length')
    plt.xlabel('Tweet Length')
    plt.ylabel('Frequency')
    plt.savefig('tweet_length_distribution.png')
    plt.close()

    # Visualization 3: Boxplot of Tweet Length by Label
    plt.figure(figsize=(6, 4))
    sns.boxplot(x='label', y='tweet_length', data=df)
    plt.title('Tweet Length by Label')
    plt.xlabel('Label')
    plt.ylabel('Tweet Length')
    plt.xticks([0, 1], ['Positive tweets', 'Negative tweets'])
    plt.savefig('tweet_length_by_label.png')
    plt.close()

    # Generate and save wordclouds for each sentiment
    grouped_tweets = df.groupby('sentiment')['tweet'].apply(lambda tweets: ' '.join(tweets)).to_dict()
    for sentiment, tweets in grouped_tweets.items():
        wordcloud = WordCloud(width=800, height=800, background_color='white', min_font_size=10).generate(tweets)
        plt.figure(figsize=(8, 8))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.title(f"Word Cloud for {sentiment.capitalize()} Sentiment")
        plt.savefig(f'wordcloud_{sentiment}.png')
        plt.close()

def train_and_evaluate(df):
    X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=0.2, random_state=42)
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    report = classification_report(y_test, predictions, output_dict=True)
    return report

def save_report(report):
    with open('classification_report.txt', 'w') as f:
        f.write(str(report))

from PIL import Image as PILImage

def resize_image(image_path, max_width=500, max_height=500):
    with PILImage.open(image_path) as img:
        w_percent = (max_width / float(img.size[0]))
        h_size = int((float(img.size[1]) * float(w_percent)))
        img = img.resize((max_width, h_size), PILImage.ANTIALIAS)

        new_image_path = f"resized_{image_path}"
        img.save(new_image_path)
        return new_image_path

def generate_pdf():
    doc = SimpleDocTemplate("report.pdf", pagesize=letter)
    styles = getSampleStyleSheet()
    story = [Paragraph('Model Evaluation Report', styles['Title'])]

    # Image files to include in the PDF
    image_files = [
        'distribution_of_tweets_by_label.png',
        'tweet_length_distribution.png',
        'tweet_length_by_label.png'
    ]
    sentiments = ['positive', 'negative', 'neutral']
    image_files.extend([f'wordcloud_{sent}.png' for sent in sentiments])

    # Add resized images and text reports to the PDF document
    for image_path in image_files:
        if os.path.exists(image_path):
            resized_path = resize_image(image_path)
            story.append(Image(resized_path))
            story.append(PageBreak())

    with open('classification_report.txt', 'r') as f:
        lines = f.readlines()
    for line in lines:
        story.append(Paragraph(line, styles['BodyText']))
        story.append(Spacer(1, 12))

    doc.build(story)
    print("PDF Generated Successfully.")

def main():
    df = load_data('train_tweets_data.csv')
    df['sentiment'] = df['tweet'].apply(lambda x: 'positive' if TextBlob(x).sentiment.polarity > 0 else 'negative' if TextBlob(x).sentiment.polarity < 0 else 'neutral')
    generate_visualizations(df)
    report = train_and_evaluate(df)
    save_report(report)
    generate_pdf()

if __name__ == "__main__":
    main()

  img = img.resize((max_width, h_size), PILImage.ANTIALIAS)


PDF Generated Successfully.
